| /* $Id: RSSConnector.java 994959 2010-09-08 10:04:42Z kwright $ */ |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.manifoldcf.crawler.connectors.rss; |
| |
| import org.apache.manifoldcf.core.interfaces.*; |
| import org.apache.manifoldcf.agents.interfaces.*; |
| import org.apache.manifoldcf.crawler.interfaces.*; |
| import org.apache.manifoldcf.crawler.system.Logging; |
| import org.apache.manifoldcf.crawler.system.ManifoldCF; |
| |
| import org.apache.manifoldcf.core.fuzzyml.*; |
| import org.apache.manifoldcf.core.common.DateParser; |
| |
| import org.apache.http.conn.ConnectTimeoutException; |
| import org.apache.http.client.RedirectException; |
| import org.apache.http.client.CircularRedirectException; |
| import org.apache.http.NoHttpResponseException; |
| import org.apache.http.HttpException; |
| |
| import java.io.*; |
| import java.util.*; |
| import java.net.*; |
| import java.text.*; |
| import java.util.regex.*; |
| |
| /** This is the RSS implementation of the IRepositoryConnector interface. |
| * This connector basically looks at an RSS document in order to seed the |
| * document queue. The document is always fetched from the same URL (it's |
| * specified in the configuration parameters). The documents subsequently |
| * crawled are not scraped for additional links; only the primary document is |
| * ingested. On the other hand, redirections ARE honored, so that various |
| * sites that use this trick can be supported (e.g. the BBC) |
| * |
| */ |
| public class RSSConnector extends org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector |
| { |
| public static final String _rcsid = "@(#)$Id: RSSConnector.java 994959 2010-09-08 10:04:42Z kwright $"; |
| |
| protected final static String rssThrottleGroupType = "_RSS_"; |
| |
| // Usage flag values |
| protected static final int ROBOTS_NONE = 0; |
| protected static final int ROBOTS_DATA = 1; |
| protected static final int ROBOTS_ALL = 2; |
| |
| /** Dechromed content mode - none */ |
| public static final int DECHROMED_NONE = 0; |
| /** Dechromed content mode - description field */ |
| public static final int DECHROMED_DESCRIPTION = 1; |
| /** Dechromed content mode - content field */ |
| public static final int DECHROMED_CONTENT = 2; |
| |
| /** Chromed suppression mode - use chromed content if dechromed content not available */ |
| public static final int CHROMED_USE = 0; |
| /** Chromed suppression mode - skip documents if dechromed content not available */ |
| public static final int CHROMED_SKIP = 1; |
| /** Chromed suppression mode - index metadata only if dechromed content not available */ |
| public static final int CHROMED_METADATA_ONLY = 2; |
| |
| /** Robots usage flag */ |
| protected int robotsUsage = ROBOTS_ALL; |
| |
| /** The user-agent for this connector instance */ |
| protected String userAgent = null; |
| /** The email address for this connector instance */ |
| protected String from = null; |
| /** The minimum milliseconds between fetches */ |
| protected long minimumMillisecondsPerFetchPerServer = -1L; |
| /** The maximum open connections */ |
| protected int maxOpenConnectionsPerServer = 0; |
| /** The minimum milliseconds between bytes */ |
| protected double minimumMillisecondsPerBytePerServer = 0.0; |
| /** The throttle group name */ |
| protected String throttleGroupName = null; |
| /** The proxy host */ |
| protected String proxyHost = null; |
| /** The proxy port */ |
| protected int proxyPort = -1; |
| /** Proxy auth domain */ |
| protected String proxyAuthDomain = null; |
| /** Proxy auth username */ |
| protected String proxyAuthUsername = null; |
| /** Proxy auth password */ |
| protected String proxyAuthPassword = null; |
| |
| /** The throttled fetcher used by this instance */ |
| protected ThrottledFetcher fetcher = null; |
| /** The robots object used by this instance */ |
| protected Robots robots = null; |
| |
| /** Storage for fetcher objects */ |
| protected static Map<String,ThrottledFetcher> fetcherMap = new HashMap<String,ThrottledFetcher>(); |
| /** Storage for robots objects */ |
| protected static Map robotsMap = new HashMap(); |
| |
| /** Flag indicating whether session data is initialized */ |
| protected boolean isInitialized = false; |
| |
| // A couple of very important points. |
| // The canonical document identifier is simply a URL. |
| // Versions of the document are calculated using a checksum technique |
| |
| protected static DataCache cache = new DataCache(); |
| |
| |
| protected static final Map understoodProtocols = new HashMap(); |
| static |
| { |
| understoodProtocols.put("http","http"); |
| understoodProtocols.put("https","https"); |
| } |
| |
| // Activity types |
| public final static String ACTIVITY_FETCH = "fetch"; |
| public final static String ACTIVITY_ROBOTSPARSE = "robots parse"; |
| |
| /** Deny access token for default authority */ |
| private final static String defaultAuthorityDenyToken = "DEAD_AUTHORITY"; |
| |
| /** Constructor. |
| */ |
| public RSSConnector() |
| { |
| } |
| |
| /** Establish a session */ |
| protected void getSession() |
| throws ManifoldCFException |
| { |
| if (!isInitialized) |
| { |
| String x; |
| |
| String emailAddress = params.getParameter(RSSConfig.PARAMETER_EMAIL); |
| if (emailAddress == null) |
| throw new ManifoldCFException("Missing email address"); |
| userAgent = "Mozilla/5.0 (ApacheManifoldCFRSSFeedReader; "+((emailAddress==null)?"":emailAddress)+")"; |
| from = emailAddress; |
| |
| String robotsUsageString = params.getParameter(RSSConfig.PARAMETER_ROBOTSUSAGE); |
| robotsUsage = ROBOTS_ALL; |
| if (robotsUsageString == null || robotsUsageString.length() == 0 || robotsUsageString.equals(RSSConfig.VALUE_ALL)) |
| robotsUsage = ROBOTS_ALL; |
| else if (robotsUsageString.equals(RSSConfig.VALUE_NONE)) |
| robotsUsage = ROBOTS_NONE; |
| else if (robotsUsageString.equals(RSSConfig.VALUE_DATA)) |
| robotsUsage = ROBOTS_DATA; |
| |
| proxyHost = params.getParameter(RSSConfig.PARAMETER_PROXYHOST); |
| String proxyPortString = params.getParameter(RSSConfig.PARAMETER_PROXYPORT); |
| proxyAuthDomain = params.getParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN); |
| proxyAuthUsername = params.getParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME); |
| proxyAuthPassword = params.getObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD); |
| |
| proxyPort = -1; |
| if (proxyPortString != null && proxyPortString.length() > 0) |
| { |
| try |
| { |
| proxyPort = Integer.parseInt(proxyPortString); |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException(e.getMessage(),e); |
| } |
| } |
| |
| // Read throttling configuration parameters |
| minimumMillisecondsPerBytePerServer = 0.0; |
| maxOpenConnectionsPerServer = 10; |
| minimumMillisecondsPerFetchPerServer = 0L; |
| |
| x = params.getParameter(RSSConfig.PARAMETER_BANDWIDTH); |
| if (x != null && x.length() > 0) |
| { |
| try |
| { |
| int maxKBytesPerSecondPerServer = Integer.parseInt(x); |
| if (maxKBytesPerSecondPerServer > 0) |
| minimumMillisecondsPerBytePerServer = 1.0/(double)maxKBytesPerSecondPerServer; |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| } |
| |
| x = params.getParameter(RSSConfig.PARAMETER_MAXOPEN); |
| if (x != null && x.length() > 0) |
| { |
| try |
| { |
| maxOpenConnectionsPerServer = Integer.parseInt(x); |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| } |
| |
| x = params.getParameter(RSSConfig.PARAMETER_MAXFETCHES); |
| if (x != null && x.length() > 0) |
| { |
| try |
| { |
| int maxFetches = Integer.parseInt(x); |
| if (maxFetches == 0) |
| maxFetches = 1; |
| minimumMillisecondsPerFetchPerServer = 60000L/((long)maxFetches); |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| |
| } |
| |
| IThrottleGroups tg = ThrottleGroupsFactory.make(currentContext); |
| // Create the throttle group |
| tg.createOrUpdateThrottleGroup(rssThrottleGroupType, throttleGroupName, new ThrottleSpec(maxOpenConnectionsPerServer, |
| minimumMillisecondsPerFetchPerServer, minimumMillisecondsPerBytePerServer)); |
| |
| isInitialized = true; |
| } |
| } |
| |
| |
| /** Return the list of activities that this connector supports (i.e. writes into the log). |
| *@return the list. |
| */ |
| @Override |
| public String[] getActivitiesList() |
| { |
| return new String[]{ACTIVITY_FETCH, ACTIVITY_ROBOTSPARSE}; |
| } |
| |
| /** Tell the world what model this connector uses for getDocumentIdentifiers(). |
| * This must return a model value as specified above. |
| *@return the model type value. |
| */ |
| @Override |
| public int getConnectorModel() |
| { |
| // This connector is currently structured that the RSS feeds are the seeds. |
| return MODEL_ALL; |
| } |
| |
| // All methods below this line will ONLY be called if a connect() call succeeded |
| // on this instance! |
| |
| /** Connect. The configuration parameters are included. |
| *@param configParams are the configuration parameters for this connection. |
| * Note well: There are no exceptions allowed from this call, since it is expected to mainly establish connection parameters. |
| */ |
| @Override |
| public void connect(ConfigParams configParams) |
| { |
| super.connect(configParams); |
| |
| // Do the necessary bookkeeping around connection counting |
| throttleGroupName = params.getParameter(RSSConfig.PARAMETER_THROTTLEGROUP); |
| if (throttleGroupName == null) |
| throttleGroupName = ""; |
| |
| fetcher = getFetcher(); |
| robots = getRobots(fetcher); |
| |
| // Let the system know we have a connection. |
| fetcher.noteConnectionEstablished(); |
| robots.noteConnectionEstablished(); |
| } |
| |
| /** This method is periodically called for all connectors that are connected but not |
| * in active use. |
| */ |
| @Override |
| public void poll() |
| throws ManifoldCFException |
| { |
| fetcher.poll(); |
| robots.poll(); |
| } |
| |
| /** Check status of connection. |
| */ |
| @Override |
| public String check() |
| throws ManifoldCFException |
| { |
| getSession(); |
| return super.check(); |
| } |
| |
| /** Close the connection. Call this before discarding the repository connector. |
| */ |
| @Override |
| public void disconnect() |
| throws ManifoldCFException |
| { |
| isInitialized = false; |
| |
| // Let the system know we are freeing the connection |
| robots.noteConnectionReleased(); |
| fetcher.noteConnectionReleased(); |
| |
| userAgent = null; |
| from = null; |
| minimumMillisecondsPerFetchPerServer = -1L; |
| maxOpenConnectionsPerServer = 0; |
| minimumMillisecondsPerBytePerServer = 0.0; |
| throttleGroupName = null; |
| proxyHost = null; |
| proxyPort = -1; |
| proxyAuthDomain = null; |
| proxyAuthUsername = null; |
| proxyAuthPassword = null; |
| |
| super.disconnect(); |
| } |
| |
| |
| /** Get the bin name string for a document identifier. The bin name describes the queue to which the |
| * document will be assigned for throttling purposes. Throttling controls the rate at which items in a |
| * given queue are fetched; it does not say anything about the overall fetch rate, which may operate on |
| * multiple queues or bins. |
| * For example, if you implement a web crawler, a good choice of bin name would be the server name, since |
| * that is likely to correspond to a real resource that will need real throttle protection. |
| *@param documentIdentifier is the document identifier. |
| *@return the bin name. |
| */ |
| @Override |
| public String[] getBinNames(String documentIdentifier) |
| { |
| try |
| { |
| WebURL uri = new WebURL(documentIdentifier); |
| return new String[]{uri.getHost()}; |
| } |
| catch (URISyntaxException e) |
| { |
| return new String[]{""}; |
| } |
| |
| } |
| |
| /** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents |
| * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object. |
| * |
| * This method can choose to find repository changes that happen only during the specified time interval. |
| * The seeds recorded by this method will be viewed by the framework based on what the |
| * getConnectorModel() method returns. |
| * |
| * It is not a big problem if the connector chooses to create more seeds than are |
| * strictly necessary; it is merely a question of overall work required. |
| * |
| * The times passed to this method may be interpreted for greatest efficiency. The time ranges |
| * any given job uses with this connector will not overlap, but will proceed starting at 0 and going |
| * to the "current time", each time the job is run. For continuous crawling jobs, this method will |
| * be called once, when the job starts, and at various periodic intervals as the job executes. |
| * |
| * When a job's specification is changed, the framework automatically resets the seeding start time to 0. The |
| * seeding start time may also be set to 0 on each job run, depending on the connector model returned by |
| * getConnectorModel(). |
| * |
| * Note that it is always ok to send MORE documents rather than less to this method. |
| *@param activities is the interface this method should use to perform whatever framework actions are desired. |
| *@param spec is a document specification (that comes from the job). |
| *@param startTime is the beginning of the time range to consider, inclusive. |
| *@param endTime is the end of the time range to consider, exclusive. |
| */ |
| @Override |
| public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec, |
| long startTime, long endTime) |
| throws ManifoldCFException, ServiceInterruption |
| { |
| getSession(); |
| |
| Filter f = new Filter(spec,true); |
| |
| // Go through all the seeds. |
| Iterator iter = f.getSeeds(); |
| while (iter.hasNext()) |
| { |
| String canonicalURL = (String)iter.next(); |
| activities.addSeedDocument(canonicalURL); |
| } |
| } |
| |
| /** Convert an absolute or relative URL to a document identifier. This may involve several steps at some point, |
| * but right now it does NOT involve converting the host name to a canonical host name. |
| * (Doing so would destroy the ability of virtually hosted sites to do the right thing, |
| * since the original host name would be lost.) Thus, we do the conversion to IP address |
| * right before we actually fetch the document. |
| *@param policies are the canonicalization policies in effect. |
| *@param parentIdentifier the identifier of the document in which the raw url was found, or null if none. |
| *@param rawURL is the raw, un-normalized and un-canonicalized url. |
| *@return the canonical URL (the document identifier), or null if the url was illegal. |
| */ |
| protected static String makeDocumentIdentifier(CanonicalizationPolicies policies, String parentIdentifier, String rawURL) |
| throws ManifoldCFException |
| { |
| try |
| { |
| // First, find the matching canonicalization policy, if any |
| CanonicalizationPolicy p = policies.findMatch(rawURL); |
| |
| // Filter out control characters |
| StringBuilder sb = new StringBuilder(); |
| int i = 0; |
| while (i < rawURL.length()) |
| { |
| char x = rawURL.charAt(i++); |
| // Only 7-bit ascii is allowed in URLs - and that has limits too (no control characters) |
| if (x >= ' ' && x < 128) |
| sb.append(x); |
| } |
| rawURL = sb.toString(); |
| |
| WebURL url; |
| if (parentIdentifier != null) |
| { |
| WebURL parentURL = new WebURL(parentIdentifier); |
| url = parentURL.resolve(rawURL); |
| } |
| else |
| url = new WebURL(rawURL); |
| |
| String protocol = url.getScheme(); |
| String host = url.getHost(); |
| |
| // The new URL better darn well have a host and a protocol, and we only know how to deal with |
| // http and https. |
| if (protocol == null || host == null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it has no protocol or host"); |
| return null; |
| } |
| if (understoodProtocols.get(protocol) == null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it has an unsupported protocol '"+protocol+"'"); |
| return null; |
| } |
| |
| // Canonicalization procedure. |
| // The query part of the URL may contain bad parameters (session id's, for instance), or may be ordered in such a |
| // way as to prevent an effectively identical URL from being matched. The anchor part of the URL should also be stripped. |
| // This code performs both of these activities in a simple way; rewrites of various pieces may get more involved if we add |
| // the ability to perform mappings using criteria specified in the UI. Right now we don't. |
| String id = doCanonicalization(p,url); |
| if (id == null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it could not be canonicalized"); |
| return null; |
| } |
| |
| // As a last basic legality check, go through looking for illegal characters. |
| i = 0; |
| while (i < id.length()) |
| { |
| char x = id.charAt(i++); |
| // Only 7-bit ascii is allowed in URLs - and that has limits too (no control characters) |
| if (x < ' ' || x > 127) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it has illegal characters in it"); |
| return null; |
| } |
| } |
| |
| return id; |
| } |
| catch (java.net.URISyntaxException e) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it is badly formed: "+e.getMessage()); |
| return null; |
| } |
| catch (java.lang.IllegalArgumentException e) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because there was an argument error: "+e.getMessage(),e); |
| return null; |
| } |
| catch (java.lang.NullPointerException e) |
| { |
| // This gets tossed by url.toAsciiString() for reasons I don't understand, but which have to do with a malformed URL. |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it is missing fields: "+e.getMessage(),e); |
| return null; |
| } |
| } |
| |
| /** Code to canonicalize a URL. If URL cannot be canonicalized (and is illegal) return null. |
| */ |
| protected static String doCanonicalization(CanonicalizationPolicy p, WebURL url) |
| throws ManifoldCFException, java.net.URISyntaxException |
| { |
| // Note well: The java.net.URI class mistreats the query part of the URI, near as I can tell, in the following ways: |
| // (1) It decodes the whole thing without regards to the argument interpretation, so the escaped ampersands etc in the arguments are converted |
| // to non-escaped ones (ugh). This is why I changed the code below to parse the RAW query string and decode it myself. |
| // (2) On reassembly of the query string, the class does not properly escape ":", "/", or a bunch of other characters the class description *says* |
| // it will escape. This means it creates URI's that are illegal according to RFC 2396 - although it is true that RFC 2396 also contains |
| // apparent errors. |
| // |
| // I've therefore opted to deal with this problem by doing much of the query string processing myself - including its final reassembly into the |
| // URI at the end of the processing. |
| // |
| |
| // To make the url be canonical, we need to strip off everything after the #. We also need to order the arguments in a canonical |
| // way, and remove session identifiers of the types we know about. |
| String queryString = url.getRawQuery(); |
| if (queryString != null) |
| { |
| // Rewrite the query string. To do this, we first parse it (by looking for ampersands and equal signs), and then |
| // we ditch any keys that we really don't want (session identifiers particularly). Finally, we go through the |
| // keys in sorted order and reassemble the query, making sure that any arguments that have the same name |
| // appear in the same order. |
| |
| // I don't use the 'split' operation because I think it's a lot more oomph (and performance loss) than is needed |
| // for this simple parsing task. |
| |
| // When reordering a url, the following is done: |
| // (1) The individual order of all arguments with the same name is preserved |
| // (2) The arguments themselves appear in sorted order, minus any arguments that should be removed because they |
| // are interpreted to be session arguments. |
| // |
| // When a url is NOT reordered, the following is done: |
| // (1) Each argument is examined IN TURN. |
| // (2) If the argument is a session argument and should be excluded, it is simply skipped. |
| |
| // Canonicalization note: Broadvision |
| // |
| // The format of Broadvision's urls is as follows: |
| // http://blah/path/path?arg|arg|arg|BVSession@@@@=xxxx&more stuff |
| // The session identifier is the BVSession@@@@. In theory I could strip this away, but I've found that |
| // most Broadvision sites require session even for basic navigation! |
| |
| if (p == null || p.canReorder()) |
| { |
| // Reorder the arguments. |
| HashMap argumentMap = new HashMap(); |
| int index = 0; |
| while (index < queryString.length()) |
| { |
| int newIndex = queryString.indexOf("&",index); |
| if (newIndex == -1) |
| newIndex = queryString.length(); |
| String argument = queryString.substring(index,newIndex); |
| int valueIndex = argument.indexOf("="); |
| String key; |
| if (valueIndex == -1) |
| key = argument; |
| else |
| key = argument.substring(0,valueIndex); |
| |
| // If this is a disallowed argument, simply don't include it in the final map. |
| boolean includeArgument = true; |
| if ((p == null || p.canRemovePhpSession()) && key.equals("PHPSESSID")) |
| includeArgument = false; |
| if ((p == null || p.canRemoveBvSession()) && key.indexOf("BVSession@@@@") != -1) |
| includeArgument = false; |
| |
| if (includeArgument) |
| { |
| ArrayList list = (ArrayList)argumentMap.get(key); |
| if (list == null) |
| { |
| list = new ArrayList(); |
| argumentMap.put(key,list); |
| } |
| list.add(argument); |
| } |
| |
| if (newIndex < queryString.length()) |
| index = newIndex + 1; |
| else |
| index = newIndex; |
| } |
| |
| // Reassemble query string in sorted order |
| String[] sortArray = new String[argumentMap.size()]; |
| int i = 0; |
| Iterator iter = argumentMap.keySet().iterator(); |
| while (iter.hasNext()) |
| { |
| sortArray[i++] = (String)iter.next(); |
| } |
| java.util.Arrays.sort(sortArray); |
| |
| StringBuilder newString = new StringBuilder(); |
| boolean isFirst = true; |
| i = 0; |
| while (i < sortArray.length) |
| { |
| String key = sortArray[i++]; |
| ArrayList list = (ArrayList)argumentMap.get(key); |
| int j = 0; |
| while (j < list.size()) |
| { |
| if (isFirst == false) |
| { |
| newString.append("&"); |
| } |
| else |
| isFirst = false; |
| newString.append((String)list.get(j++)); |
| } |
| } |
| queryString = newString.toString(); |
| } |
| else |
| { |
| // Do not reorder! |
| StringBuilder newString = new StringBuilder(); |
| int index = 0; |
| boolean isFirst = true; |
| while (index < queryString.length()) |
| { |
| int newIndex = queryString.indexOf("&",index); |
| if (newIndex == -1) |
| newIndex = queryString.length(); |
| String argument = queryString.substring(index,newIndex); |
| int valueIndex = argument.indexOf("="); |
| String key; |
| if (valueIndex == -1) |
| key = argument; |
| else |
| key = argument.substring(0,valueIndex); |
| |
| // If this is a disallowed argument, simply don't include it in the final query. |
| boolean includeArgument = true; |
| if ((p == null || p.canRemovePhpSession()) && key.equals("PHPSESSID")) |
| includeArgument = false; |
| if ((p == null || p.canRemoveBvSession()) && key.indexOf("BVSession@@@@") != -1) |
| includeArgument = false; |
| |
| if (includeArgument) |
| { |
| if (!isFirst) |
| newString.append("&"); |
| else |
| isFirst = false; |
| newString.append(argument); |
| } |
| |
| if (newIndex < queryString.length()) |
| index = newIndex + 1; |
| else |
| index = newIndex; |
| } |
| queryString = newString.toString(); |
| } |
| } |
| |
| // Now, rewrite path to get rid of jsessionid etc. |
| String pathString = url.getPath(); |
| if (pathString != null) |
| { |
| int index = pathString.indexOf(";jsessionid="); |
| if ((p == null || p.canRemoveJavaSession()) && index != -1) |
| { |
| // There's a ";jsessionid=" |
| // Strip the java session id |
| pathString = pathString.substring(0,index); |
| } |
| if ((p == null || p.canRemoveAspSession()) && pathString.startsWith("/s(")) |
| { |
| // It's asp.net |
| index = pathString.indexOf(")"); |
| if (index != -1) |
| pathString = pathString.substring(index+1); |
| } |
| |
| } |
| |
| // Put it back into the URL without the ref, and with the modified query and path parts. |
| url = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString); |
| String rval = url.toASCIIString(); |
| return rval; |
| } |
| |
| |
| |
| /** Get document versions given an array of document identifiers. |
| * This method is called for EVERY document that is considered. It is |
| * therefore important to perform as little work as possible here. |
| *@param documentIdentifiers is the array of local document identifiers, as understood by this connector. |
| *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers. |
| * A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document |
| * had an empty version string. |
| *@param activities is the interface this method should use to perform whatever framework actions are desired. |
| *@param spec is the current document specification for the current job. If there is a dependency on this |
| * specification, then the version string should include the pertinent data, so that reingestion will occur |
| * when the specification changes. This is primarily useful for metadata. |
| *@param jobType is an integer describing how the job is being run, whether continuous or once-only. |
| *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one. |
| *@return the corresponding version strings, with null in the places where the document no longer exists. |
| * Empty version strings indicate that there is no versioning ability for the corresponding document, and the document |
| * will always be processed. |
| */ |
| @Override |
| public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities, |
| DocumentSpecification spec, int jobType, boolean usesDefaultAuthority) |
| throws ManifoldCFException, ServiceInterruption |
| { |
| getSession(); |
| |
| // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector. |
| // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more |
| // thought to set up properly. |
| int connectionLimit = 200; |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In getDocumentVersions for "+Integer.toString(documentIdentifiers.length)+" documents"); |
| |
| Filter f = new Filter(spec,false); |
| |
| String[] acls = f.getAcls(); |
| // Sort it, |
| java.util.Arrays.sort(acls); |
| |
| // Build a map of the metadata names and values from the spec |
| ArrayList namesAndValues = f.getMetadata(); |
| // Create an array of name/value fixedlists |
| String[] metadata = new String[namesAndValues.size()]; |
| int k = 0; |
| String[] fixedListStrings = new String[2]; |
| while (k < metadata.length) |
| { |
| NameValue nv = (NameValue)namesAndValues.get(k); |
| String name = nv.getName(); |
| String value = nv.getValue(); |
| fixedListStrings[0] = name; |
| fixedListStrings[1] = value; |
| StringBuilder newsb = new StringBuilder(); |
| packFixedList(newsb,fixedListStrings,'='); |
| metadata[k++] = newsb.toString(); |
| } |
| java.util.Arrays.sort(metadata); |
| |
| Logging.connectors.debug("RSS: Done setting up metadata version strings"); |
| |
| // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type |
| // of text/xml), and documents that need to be indexed. |
| // |
| // For the latter, the metadata etc is part of the version string. For the former, the only thing that is part of the version string is the |
| // document's checksum. |
| // |
| // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really |
| // DON'T want this to apply to the feeds themselves. Since the distinguishing characteristic of a feed is that it is in the seed list, |
| // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from |
| // fetchable urls. But the latter approach requires a fetch, which is forbidden. So - the spec will be used to characterize the url. |
| // However, the spec might change, and the url might be dropped from the list - and then what?? |
| // |
| // The final solution is to simply not queue what cannot be mapped. |
| |
| int feedTimeout = f.getFeedTimeoutValue(); |
| |
| // A preliminary fetch must be done in either case, because otherwise we cannot detect a deletion. |
| // But, since we don't want to fetch twice, write these urls to temporary files. The |
| // files will be cleaned up as soon as possible. |
| |
| String[] rval = new String[documentIdentifiers.length]; |
| |
| // HttpClient client = new HttpClient(httpConMan.getManager()); |
| |
| // The document specification has already been used to trim out documents that are not |
| // allowed from appearing in the queue. So, even that has already been done. |
| int i = 0; |
| while (i < documentIdentifiers.length) |
| { |
| // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again. We just fetch it. |
| // And, if the content type is xml, we calculate the version as if it is a feed rather than a document. |
| |
| // Get the url |
| String urlValue = documentIdentifiers[i]; |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Getting version string for '"+urlValue+"'"); |
| |
| // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document. This also means we don't need to |
| // do a robots check, because we aren't actually crawling anything. So, ALWAYS do this first... |
| CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue,"data"); |
| try |
| { |
| if (dechromedData.length > 0) |
| { |
| // Data already available. The fetch cycle can be entirely avoided, as can the robots check. |
| String ingestURL = f.mapDocumentURL(urlValue); |
| if (ingestURL != null) |
| { |
| // Open up an input stream corresponding to the carrydown data. The stream will be encoded as utf-8. |
| try |
| { |
| InputStream is = dechromedData[0].getUtf8Stream(); |
| try |
| { |
| StringBuilder sb = new StringBuilder(); |
| long checkSum = cache.addData(activities,urlValue,"text/html",is); |
| // Grab what we need from the passed-down data for the document. These will all become part |
| // of the version string. |
| String[] pubDates = activities.retrieveParentData(urlValue,"pubdate"); |
| String[] sources = activities.retrieveParentData(urlValue,"source"); |
| String[] titles = activities.retrieveParentData(urlValue,"title"); |
| String[] authorNames = activities.retrieveParentData(urlValue,"authorname"); |
| String[] authorEmails = activities.retrieveParentData(urlValue,"authoremail"); |
| String[] categories = activities.retrieveParentData(urlValue,"category"); |
| String[] descriptions = activities.retrieveParentData(urlValue,"description"); |
| java.util.Arrays.sort(pubDates); |
| java.util.Arrays.sort(sources); |
| java.util.Arrays.sort(titles); |
| java.util.Arrays.sort(authorNames); |
| java.util.Arrays.sort(authorEmails); |
| java.util.Arrays.sort(categories); |
| java.util.Arrays.sort(descriptions); |
| |
| if (sources.length == 0) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Warning; URL '"+ingestURL+"' doesn't seem to have any RSS feed source!"); |
| } |
| |
| sb.append('+'); |
| packList(sb,acls,'+'); |
| if (acls.length > 0) |
| { |
| sb.append('+'); |
| pack(sb,defaultAuthorityDenyToken,'+'); |
| } |
| else |
| sb.append('-'); |
| // Now, do the metadata |
| packList(sb,metadata,'+'); |
| // The ingestion URL |
| pack(sb,ingestURL,'+'); |
| // The pub dates |
| packList(sb,pubDates,'+'); |
| // The titles |
| packList(sb,titles,'+'); |
| // The sources |
| packList(sb,sources,'+'); |
| // The categories |
| packList(sb,categories,'+'); |
| // The descriptions |
| packList(sb,descriptions,'+'); |
| // The author names |
| packList(sb,authorNames,'+'); |
| // The author emails |
| packList(sb,authorEmails,'+'); |
| |
| // Do the checksum part, which does not need to be parseable. |
| sb.append(new Long(checkSum).toString()); |
| |
| rval[i] = sb.toString(); |
| } |
| finally |
| { |
| is.close(); |
| } |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception reading data from string: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception reading data from string: "+e.getMessage(),e); |
| } |
| } |
| else |
| { |
| // Document a seed or unmappable; just skip |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Skipping carry-down document '"+urlValue+"' because it is unmappable or is a seed."); |
| } |
| } |
| else |
| { |
| // Get the old version string |
| String oldVersionString = oldVersions[i]; |
| |
| // Unpack the old version as much as possible. |
| // We are interested in what the ETag and Last-Modified headers were last time. |
| String lastETagValue = null; |
| String lastModifiedValue = null; |
| // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs, |
| // or the documents it points at would get deleted. |
| // |
| // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly. I can't get the |
| // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-( |
| if (false && jobType == JOBMODE_CONTINUOUS && oldVersionString != null && oldVersionString.startsWith("-")) |
| { |
| // It's a feed, so the last etag and last-modified fields should be encoded in this version string. |
| StringBuilder lastETagBuffer = new StringBuilder(); |
| int unpackPos = unpack(lastETagBuffer,oldVersionString,1,'+'); |
| StringBuilder lastModifiedBuffer = new StringBuilder(); |
| unpackPos = unpack(lastModifiedBuffer,oldVersionString,unpackPos,'+'); |
| if (lastETagBuffer.length() > 0) |
| lastETagValue = lastETagBuffer.toString(); |
| if (lastModifiedBuffer.length() > 0) |
| lastModifiedValue = lastModifiedBuffer.toString(); |
| } |
| |
| if (Logging.connectors.isDebugEnabled() && (lastETagValue != null || lastModifiedValue != null)) |
| Logging.connectors.debug("RSS: Document '"+urlValue+"' was found to have a previous ETag value of '"+((lastETagValue==null)?"null":lastETagValue)+ |
| "' and a previous Last-Modified value of '"+((lastModifiedValue==null)?"null":lastModifiedValue)+"'"); |
| |
| // Robots check. First, we need to separate the url into its components |
| try |
| { |
| URL url = new URL(urlValue); |
| String protocol = url.getProtocol(); |
| int port = url.getPort(); |
| String hostName = url.getHost(); |
| String pathPart = url.getFile(); |
| |
| // Check with robots to see if it's allowed |
| if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext,throttleGroupName, |
| protocol,port,hostName,url.getPath(), |
| userAgent,from, |
| proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, |
| activities, connectionLimit)) |
| { |
| activities.recordActivity(null,ACTIVITY_FETCH, |
| null,urlValue,Integer.toString(-2),"Robots exclusion",null); |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Skipping url '"+urlValue+"' because robots.txt says to"); |
| rval[i] = null; |
| } |
| else |
| { |
| |
| // Now, use the fetcher, and get the file. |
| IThrottledConnection connection = fetcher.createConnection(currentContext, |
| throttleGroupName, |
| hostName, |
| connectionLimit, |
| feedTimeout, |
| proxyHost, |
| proxyPort, |
| proxyAuthDomain, |
| proxyAuthUsername, |
| proxyAuthPassword); |
| try |
| { |
| // Begin the fetch |
| connection.beginFetch("Data"); |
| try |
| { |
| // Execute the request. |
| // Use the connect timeout from the document specification! |
| int status = connection.executeFetch(protocol,port,pathPart,userAgent,from, |
| lastETagValue,lastModifiedValue); |
| switch (status) |
| { |
| case IThrottledConnection.STATUS_NOCHANGE: |
| rval[i] = oldVersionString; |
| break; |
| case IThrottledConnection.STATUS_OK: |
| try |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Successfully fetched "+urlValue); |
| // Document successfully fetched! |
| // If its content is xml, presume it's a feed... |
| String contentType = connection.getResponseHeader("Content-Type"); |
| // Some sites have multiple content types. We just look at the LAST one in that case. |
| if (contentType != null) |
| { |
| String[] contentTypes = contentType.split(","); |
| if (contentTypes.length > 0) |
| contentType = contentTypes[contentTypes.length-1].trim(); |
| else |
| contentType = null; |
| } |
| boolean isXML = (contentType != null && |
| (contentType.startsWith("text/xml") || |
| contentType.startsWith("application/rss+xml") || |
| contentType.startsWith("application/xml") || |
| contentType.startsWith("application/atom+xml") || |
| contentType.startsWith("application/xhtml+xml") || |
| contentType.startsWith("text/XML") || |
| contentType.startsWith("application/rdf+xml") || |
| contentType.startsWith("text/application") || |
| contentType.startsWith("XML") )); |
| String ingestURL = null; |
| if (!isXML) |
| { |
| // If the chromed content mode is set to "skip", and we got here, it means |
| // we should not include the content. |
| if (f.getChromedContentMode() == CHROMED_SKIP) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it no longer has dechromed content available"); |
| rval[i] = null; |
| break; |
| } |
| |
| // Decide whether to exclude this document based on what we see here. |
| // Basically, we want to get rid of everything that we don't know what |
| // to do with in the ingestion system. |
| if (!isContentInteresting(activities,contentType)) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it had the wrong content type: "+((contentType==null)?"null":"'"+contentType+"'")); |
| rval[i] = null; |
| break; |
| } |
| |
| ingestURL = f.mapDocumentURL(urlValue); |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: The url '"+urlValue+"' is a feed"); |
| |
| if (!f.isSeed(urlValue)) |
| { |
| // Remove the feed from consideration, since it has left the list of seeds |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Removing feed url '"+urlValue+"' because it is not a seed."); |
| rval[i] = null; |
| break; |
| } |
| } |
| |
| InputStream is = connection.getResponseBodyStream(); |
| try |
| { |
| long checkSum = cache.addData(activities,urlValue,contentType,is); |
| StringBuilder sb = new StringBuilder(); |
| if (ingestURL != null) |
| { |
| // We think it is ingestable. The version string accordingly starts with a "+". |
| |
| // Grab what we need from the passed-down data for the document. These will all become part |
| // of the version string. |
| String[] pubDates = activities.retrieveParentData(urlValue,"pubdate"); |
| String[] sources = activities.retrieveParentData(urlValue,"source"); |
| String[] titles = activities.retrieveParentData(urlValue,"title"); |
| String[] authorNames = activities.retrieveParentData(urlValue,"authorname"); |
| String[] authorEmails = activities.retrieveParentData(urlValue,"authoremail"); |
| String[] categories = activities.retrieveParentData(urlValue,"category"); |
| String[] descriptions = activities.retrieveParentData(urlValue,"description"); |
| java.util.Arrays.sort(pubDates); |
| java.util.Arrays.sort(sources); |
| java.util.Arrays.sort(titles); |
| java.util.Arrays.sort(authorNames); |
| java.util.Arrays.sort(authorEmails); |
| java.util.Arrays.sort(categories); |
| java.util.Arrays.sort(descriptions); |
| |
| if (sources.length == 0) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Warning; URL '"+ingestURL+"' doesn't seem to have any RSS feed source!"); |
| } |
| |
| sb.append('+'); |
| packList(sb,acls,'+'); |
| if (acls.length > 0) |
| { |
| sb.append('+'); |
| pack(sb,defaultAuthorityDenyToken,'+'); |
| } |
| else |
| sb.append('-'); |
| // Now, do the metadata |
| packList(sb,metadata,'+'); |
| // The ingestion URL |
| pack(sb,ingestURL,'+'); |
| // The pub dates |
| packList(sb,pubDates,'+'); |
| // The titles |
| packList(sb,titles,'+'); |
| // The sources |
| packList(sb,sources,'+'); |
| // The categories |
| packList(sb,categories,'+'); |
| // The descriptions |
| packList(sb,descriptions,'+'); |
| // The author names |
| packList(sb,authorNames,'+'); |
| // The author emails |
| packList(sb,authorEmails,'+'); |
| } |
| else |
| { |
| sb.append('-'); |
| String etag = connection.getResponseHeader("ETag"); |
| if (etag == null) |
| pack(sb,"",'+'); |
| else |
| pack(sb,etag,'+'); |
| String lastModified = connection.getResponseHeader("Last-Modified"); |
| if (lastModified == null) |
| pack(sb,"",'+'); |
| else |
| pack(sb,lastModified,'+'); |
| |
| } |
| |
| // Do the checksum part, which does not need to be parseable. |
| sb.append(new Long(checkSum).toString()); |
| |
| rval[i] = sb.toString(); |
| } |
| finally |
| { |
| is.close(); |
| } |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| Logging.connectors.warn("RSS: Socket timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e); |
| rval[i] = null; |
| } |
| catch (ConnectTimeoutException e) |
| { |
| Logging.connectors.warn("RSS: Connecto timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e); |
| rval[i] = null; |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| Logging.connectors.warn("RSS: IO exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e); |
| rval[i] = null; |
| } |
| |
| break; |
| |
| case IThrottledConnection.STATUS_SITEERROR: |
| case IThrottledConnection.STATUS_PAGEERROR: |
| default: |
| // Record an *empty* version. |
| // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't |
| // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times. |
| rval[i] = ""; |
| break; |
| } |
| } |
| finally |
| { |
| connection.doneFetch(activities); |
| } |
| } |
| finally |
| { |
| connection.close(); |
| } |
| } |
| } |
| catch (MalformedURLException e) |
| { |
| Logging.connectors.debug("RSS: URL '"+urlValue+"' is malformed; skipping",e); |
| rval[i] = null; |
| } |
| } |
| } |
| finally |
| { |
| int j = 0; |
| while (j < dechromedData.length) |
| { |
| CharacterInput ci = (CharacterInput)dechromedData[j++]; |
| if (ci != null) |
| ci.discard(); |
| } |
| |
| } |
| |
| i++; |
| } |
| |
| return rval; |
| } |
| |
| /** Process a set of documents. |
| * This is the method that should cause each document to be fetched, processed, and the results either added |
| * to the queue of documents for the current job, and/or entered into the incremental ingestion manager. |
| * The document specification allows this class to filter what is done based on the job. |
| *@param documentIdentifiers is the set of document identifiers to process. |
| *@param activities is the interface this method should use to queue up new document references |
| * and ingest documents. |
| *@param spec is the document specification. |
| *@param scanOnly is an array corresponding to the document identifiers. It is set to true to indicate when the processing |
| * should only find other references, and should not actually call the ingestion methods. |
| */ |
| @Override |
| public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities, |
| DocumentSpecification spec, boolean[] scanOnly, int jobType) |
| throws ManifoldCFException, ServiceInterruption |
| { |
| getSession(); |
| |
| // The version strings contain the ingest url as well as everything we need to determine if this url is a feed or just a document. |
| // So, there is no need to reparse the specification. |
| Filter f = null; |
| |
| String[] fixedList = new String[2]; |
| |
| int i = 0; |
| while (i < documentIdentifiers.length) |
| { |
| String urlValue = documentIdentifiers[i]; |
| String version = versions[i]; |
| |
| if (version.length() == 0) |
| { |
| // This document had an http response incompatible with its existence on the web. |
| i++; |
| // Leave document in jobqueue, but do NOT get rid of it, or we will wind up seeing it queued again by |
| // somebody else. We *do* have to signal the document to be removed from the index, however, or it will |
| // stick around until the job is deleted. |
| activities.noDocument(urlValue,version); |
| continue; |
| } |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Processing '"+urlValue+"'"); |
| |
| // The only links we extract come from documents that we think are RSS feeds. |
| // When we think that's the case, we attempt to parse it as RSS XML. |
| if (version.startsWith("-")) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Interpreting document '"+urlValue+"' as a feed"); |
| |
| // We think it is a feed. |
| // The version string does not have anything useful in it. |
| if (f == null) |
| f = new Filter(spec,false); |
| |
| // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the |
| // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't |
| // want to attempt to process it in any case. |
| // |
| |
| // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the |
| // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds. |
| if (true || scanOnly[i] == false || jobType != JOBMODE_CONTINUOUS) |
| { |
| handleRSSFeedSAX(urlValue,activities,f); |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Extraction of feed '"+urlValue+"' complete"); |
| |
| // Record the feed's version string, so we won't refetch unless needed. |
| // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to |
| // keep track of the adaptive parameters. |
| activities.recordDocument(urlValue,version); |
| } |
| else |
| { |
| // The problem here is that we really do need to set the rescan time to something reasonable. |
| // But we might not even have read the feed! So what to do?? |
| // One answer is to build a connector-specific table that carries the last value of every feed around. |
| // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified). |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Feed '"+urlValue+"' does not appear to differ from previous fetch for a continuous job; not extracting!"); |
| |
| long currentTime = System.currentTimeMillis(); |
| |
| Long defaultRescanTime = f.getDefaultRescanTime(currentTime); |
| |
| if (defaultRescanTime != null) |
| { |
| Long minimumTime = f.getMinimumRescanTime(currentTime); |
| if (minimumTime != null) |
| { |
| if (defaultRescanTime.longValue() < minimumTime.longValue()) |
| defaultRescanTime = minimumTime; |
| } |
| } |
| |
| activities.setDocumentScheduleBounds(urlValue,defaultRescanTime,defaultRescanTime,null,null); |
| |
| } |
| } |
| else if (scanOnly[i] == false && version.startsWith("+")) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Interpreting '"+urlValue+"' as a document"); |
| |
| |
| |
| if (isDataIngestable(activities,urlValue)) |
| { |
| // Treat it as an ingestable document. |
| // Version *should* start with a "+". |
| ArrayList acls = new ArrayList(); |
| StringBuilder denyAclBuffer = new StringBuilder(); |
| int startPos = unpackList(acls,version,1,'+'); |
| if (startPos < version.length() && version.charAt(startPos++) == '+') |
| { |
| startPos = unpack(denyAclBuffer,version,startPos,'+'); |
| } |
| ArrayList metadata = new ArrayList(); |
| startPos = unpackList(metadata,version,startPos,'+'); |
| StringBuilder ingestUrlBuffer = new StringBuilder(); |
| startPos = unpack(ingestUrlBuffer,version,startPos,'+'); |
| String ingestURL = ingestUrlBuffer.toString(); |
| ArrayList pubDates = new ArrayList(); |
| startPos = unpackList(pubDates,version,startPos,'+'); |
| ArrayList titles = new ArrayList(); |
| startPos = unpackList(titles,version,startPos,'+'); |
| ArrayList sources = new ArrayList(); |
| startPos = unpackList(sources,version,startPos,'+'); |
| ArrayList categories = new ArrayList(); |
| startPos = unpackList(categories,version,startPos,'+'); |
| ArrayList descriptions = new ArrayList(); |
| startPos = unpackList(descriptions,version,startPos,'+'); |
| ArrayList authorNames = new ArrayList(); |
| startPos = unpackList(authorNames,version,startPos,'+'); |
| ArrayList authorEmails = new ArrayList(); |
| startPos = unpackList(authorEmails,version,startPos,'+'); |
| |
| if (ingestURL.length() > 0) |
| { |
| long dataSize = cache.getDataLength(urlValue); |
| RepositoryDocument rd = new RepositoryDocument(); |
| |
| // Set content type |
| rd.setMimeType(cache.getContentType(urlValue)); |
| |
| // Turn into acls and add into description |
| String[] aclArray = new String[acls.size()]; |
| int j = 0; |
| while (j < aclArray.length) |
| { |
| aclArray[j] = (String)acls.get(j); |
| j++; |
| } |
| rd.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclArray); |
| |
| // Deny acl too |
| if (denyAclBuffer.length() > 0) |
| { |
| String[] denyAclArray = new String[]{denyAclBuffer.toString()}; |
| rd.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,denyAclArray); |
| } |
| |
| // Grab metadata |
| HashMap metaHash = new HashMap(); |
| int k = 0; |
| while (k < metadata.size()) |
| { |
| String metadataItem = (String)metadata.get(k++); |
| unpackFixedList(fixedList,metadataItem,0,'='); |
| HashMap hashValue = (HashMap)metaHash.get(fixedList[0]); |
| if (hashValue == null) |
| { |
| hashValue = new HashMap(); |
| metaHash.put(fixedList[0],hashValue); |
| } |
| hashValue.put(fixedList[1],fixedList[1]); |
| } |
| Iterator metaIter = metaHash.keySet().iterator(); |
| while (metaIter.hasNext()) |
| { |
| String key = (String)metaIter.next(); |
| HashMap metaList = (HashMap)metaHash.get(key); |
| String[] values = new String[metaList.size()]; |
| Iterator iter = metaList.keySet().iterator(); |
| k = 0; |
| while (iter.hasNext()) |
| { |
| values[k] = (String)iter.next(); |
| k++; |
| } |
| rd.addField(key,values); |
| } |
| |
| // Loop through the titles to add those to the metadata |
| String[] titleValues = new String[titles.size()]; |
| k = 0; |
| while (k < titleValues.length) |
| { |
| titleValues[k] = (String)titles.get(k); |
| k++; |
| } |
| if (k > 0) |
| rd.addField("title",titleValues); |
| |
| // Loop through the author names to add those to the metadata |
| String[] authorNameValues = new String[authorNames.size()]; |
| k = 0; |
| while (k < authorNameValues.length) |
| { |
| authorNameValues[k] = (String)authorNames.get(k); |
| k++; |
| } |
| if (k > 0) |
| rd.addField("authorname",authorNameValues); |
| |
| // Loop through the author emails to add those to the metadata |
| String[] authorEmailValues = new String[authorEmails.size()]; |
| k = 0; |
| while (k < authorEmailValues.length) |
| { |
| authorEmailValues[k] = (String)authorEmails.get(k); |
| k++; |
| } |
| if (k > 0) |
| rd.addField("authoremail",authorEmailValues); |
| |
| // Loop through the descriptions to add those to the metadata |
| String[] descriptionValues = new String[descriptions.size()]; |
| k = 0; |
| while (k < descriptionValues.length) |
| { |
| descriptionValues[k] = (String)descriptions.get(k); |
| k++; |
| } |
| if (k > 0) |
| rd.addField("summary",descriptionValues); |
| |
| // Loop through the sources to add those to the metadata |
| String[] sourceValues = new String[sources.size()]; |
| k = 0; |
| while (k < sourceValues.length) |
| { |
| sourceValues[k] = (String)sources.get(k); |
| k++; |
| } |
| if (k > 0) |
| rd.addField("source",sourceValues); |
| |
| // Add the categories now |
| String[] categoryValues = new String[categories.size()]; |
| k = 0; |
| while (k < categoryValues.length) |
| { |
| categoryValues[k] = (String)categories.get(k); |
| k++; |
| } |
| if (k > 0) |
| rd.addField("category",categoryValues); |
| |
| // The pubdates are a ms since epoch value; we want the minimum one for the origination time. |
| Long minimumOrigTime = null; |
| String[] pubDateValues = new String[pubDates.size()]; |
| String[] pubDateValuesISO = new String[pubDates.size()]; |
| TimeZone tz = TimeZone.getTimeZone("UTC"); |
| DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'"); |
| df.setTimeZone(tz); |
| k = 0; |
| while (k < pubDates.size()) |
| { |
| String pubDate = (String)pubDates.get(k); |
| pubDateValues[k] = pubDate; |
| try |
| { |
| Long pubDateLong = new Long(pubDate); |
| if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue()) |
| minimumOrigTime = pubDateLong; |
| pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue())); |
| } |
| catch (NumberFormatException e) |
| { |
| // Do nothing; the version string seems to not mean anything |
| } |
| k++; |
| } |
| if (k > 0) |
| { |
| rd.addField("pubdate",pubDateValues); |
| rd.addField("pubdateiso",pubDateValuesISO); |
| } |
| |
| if (minimumOrigTime != null) |
| activities.setDocumentOriginationTime(urlValue,minimumOrigTime); |
| |
| InputStream is = cache.getData(urlValue); |
| if (is != null) |
| { |
| try |
| { |
| rd.setBinary(is,dataSize); |
| try |
| { |
| activities.ingestDocumentWithException(urlValue,version,ingestURL,rd); |
| } |
| catch (IOException e) |
| { |
| handleIOException(e,"reading data"); |
| } |
| } |
| finally |
| { |
| try |
| { |
| is.close(); |
| } |
| catch (IOException e) |
| { |
| handleIOException(e,"closing stream"); |
| } |
| } |
| } |
| } |
| } |
| else |
| { |
| activities.noDocument(urlValue,version); |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because it cannot be indexed"); |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because it cannot have interesting links"); |
| } |
| i++; |
| } |
| |
| } |
| |
| protected static void handleIOException(IOException e, String context) |
| throws ManifoldCFException, ServiceInterruption |
| { |
| if (e instanceof java.net.SocketTimeoutException) |
| throw new ManifoldCFException("IO error "+context+": "+e.getMessage(),e); |
| else if (e instanceof InterruptedIOException) |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| else |
| throw new ManifoldCFException("IO error "+context+": "+e.getMessage(),e); |
| } |
| |
| /** Free a set of documents. This method is called for all documents whose versions have been fetched using |
| * the getDocumentVersions() method, including those that returned null versions. It may be used to free resources |
| * committed during the getDocumentVersions() method. It is guaranteed to be called AFTER any calls to |
| * processDocuments() for the documents in question. |
| *@param documentIdentifiers is the set of document identifiers. |
| *@param versions is the corresponding set of version identifiers (individual identifiers may be null). |
| */ |
| @Override |
| public void releaseDocumentVersions(String[] documentIdentifiers, String[] versions) |
| throws ManifoldCFException |
| { |
| int i = 0; |
| while (i < documentIdentifiers.length) |
| { |
| String version = versions[i]; |
| if (version != null) |
| |
| { |
| String urlValue = documentIdentifiers[i]; |
| cache.deleteData(urlValue); |
| } |
| i++; |
| } |
| } |
| |
| // UI support methods. |
| // |
| // These support methods come in two varieties. The first bunch is involved in setting up connection configuration information. The second bunch |
| // is involved in presenting and editing document specification information for a job. The two kinds of methods are accordingly treated differently, |
| // in that the first bunch cannot assume that the current connector object is connected, while the second bunch can. That is why the first bunch |
| // receives a thread context argument for all UI methods, while the second bunch does not need one (since it has already been applied via the connect() |
| // method, above). |
| |
| /** Output the configuration header section. |
| * This method is called in the head section of the connector's configuration page. Its purpose is to add the required tabs to the list, and to output any |
| * javascript methods that might be needed by the configuration editing HTML. |
| *@param threadContext is the local thread context. |
| *@param out is the output to which any HTML should be sent. |
| *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. |
| *@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector. |
| */ |
| @Override |
| public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out, |
| Locale locale, ConfigParams parameters, List<String> tabsArray) |
| throws ManifoldCFException, IOException |
| { |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Email")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Robots")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Bandwidth")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Proxy")); |
| out.print( |
| "<script type=\"text/javascript\">\n"+ |
| "<!--\n"+ |
| "function checkConfig()\n"+ |
| "{\n"+ |
| " if (editconnection.email.value != \"\" && editconnection.email.value.indexOf(\"@\") == -1)\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.NeedAValidEmailAddress")+"\");\n"+ |
| " editconnection.email.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " if (editconnection.bandwidth.value != \"\" && !isInteger(editconnection.bandwidth.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.EnterAValidNumberOrBlankForNoLimit")+"\");\n"+ |
| " editconnection.bandwidth.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " if (editconnection.connections.value == \"\" || !isInteger(editconnection.connections.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.EnterAValidNumberForTheMaxNumberOfOpenConnectionsPerServer")+"\");\n"+ |
| " editconnection.connections.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " if (editconnection.fetches.value != \"\" && !isInteger(editconnection.fetches.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.EnterAValidNumberOrBlankForNoLimit")+"\");\n"+ |
| " editconnection.fetches.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " return true;\n"+ |
| "}\n"+ |
| "\n"+ |
| "function checkConfigForSave()\n"+ |
| "{\n"+ |
| " if (editconnection.email.value == \"\")\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.EmailAddressRequiredToBeIncludedInAllRequestHeaders")+"\");\n"+ |
| " SelectTab(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.Email")+"\");\n"+ |
| " editconnection.email.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " return true;\n"+ |
| "}\n"+ |
| "\n"+ |
| "//-->\n"+ |
| "</script>\n" |
| ); |
| } |
| |
| /** Output the configuration body section. |
| * This method is called in the body section of the connector's configuration page. Its purpose is to present the required form elements for editing. |
| * The coder can presume that the HTML that is output from this configuration will be within appropriate <html>, <body>, and <form> tags. The name of the |
| * form is "editconnection". |
| *@param threadContext is the local thread context. |
| *@param out is the output to which any HTML should be sent. |
| *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. |
| *@param tabName is the current tab name. |
| */ |
| @Override |
| public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, |
| Locale locale, ConfigParams parameters, String tabName) |
| throws ManifoldCFException, IOException |
| { |
| String email = parameters.getParameter(RSSConfig.PARAMETER_EMAIL); |
| if (email == null) |
| email = ""; |
| String robotsUsage = parameters.getParameter(RSSConfig.PARAMETER_ROBOTSUSAGE); |
| if (robotsUsage == null) |
| robotsUsage = RSSConfig.VALUE_ALL; |
| String bandwidth = parameters.getParameter(RSSConfig.PARAMETER_BANDWIDTH); |
| if (bandwidth == null) |
| bandwidth = "64"; |
| String connections = parameters.getParameter(RSSConfig.PARAMETER_MAXOPEN); |
| if (connections == null) |
| connections = "2"; |
| String fetches = parameters.getParameter(RSSConfig.PARAMETER_MAXFETCHES); |
| if (fetches == null) |
| fetches = "12"; |
| String throttleGroup = parameters.getParameter(RSSConfig.PARAMETER_THROTTLEGROUP); |
| if (throttleGroup == null) |
| throttleGroup = ""; |
| String proxyHost = parameters.getParameter(RSSConfig.PARAMETER_PROXYHOST); |
| if (proxyHost == null) |
| proxyHost = ""; |
| String proxyPort = parameters.getParameter(RSSConfig.PARAMETER_PROXYPORT); |
| if (proxyPort == null) |
| proxyPort = ""; |
| String proxyAuthDomain = parameters.getParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN); |
| if (proxyAuthDomain == null) |
| proxyAuthDomain = ""; |
| String proxyAuthUsername = parameters.getParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME); |
| if (proxyAuthUsername == null) |
| proxyAuthUsername = ""; |
| String proxyAuthPassword = parameters.getObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD); |
| if (proxyAuthPassword == null) |
| proxyAuthPassword = ""; |
| else |
| proxyAuthPassword = out.mapPasswordToKey(proxyAuthPassword); |
| |
| // Email tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Email"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.EmailAddressToContactColon") + "</nobr></td><td class=\"value\"><input type=\"text\" size=\"32\" name=\"email\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(email)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"email\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(email)+"\"/>\n" |
| ); |
| } |
| |
| // Robots tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Robots"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.RobotsTxtUsageColon") + "</nobr></td>\n"+ |
| " <td class=\"value\">\n"+ |
| " <select name=\"robotsusage\" size=\"3\">\n"+ |
| " <option value=\"none\" "+(robotsUsage.equals(RSSConfig.VALUE_NONE)?"selected=\"selected\"":"")+">" + Messages.getBodyString(locale,"RSSConnector.DontLookAtRobotsTxt") + "</option>\n"+ |
| " <option value=\"data\" "+(robotsUsage.equals(RSSConfig.VALUE_DATA)?"selected=\"selected\"":"")+">" + Messages.getBodyString(locale,"RSSConnector.ObeyRobotsTxtForDataFetchesOnly") + "</option>\n"+ |
| " <option value=\"all\" "+(robotsUsage.equals(RSSConfig.VALUE_ALL)?"selected=\"selected\"":"")+">" + Messages.getBodyString(locale,"RSSConnector.ObeyRobotsTxtForAllFetches") + "</option>\n"+ |
| " </select>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"robotsusage\" value=\""+robotsUsage+"\"/>\n" |
| ); |
| } |
| |
| // Bandwidth tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Bandwidth"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MaxKBytesPerSecondPerServerColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"6\" name=\"bandwidth\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(bandwidth)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MaxConnectionsPerServerColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"4\" name=\"connections\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(connections)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MaxFetchesPerMinutePerServerColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"4\" name=\"fetches\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(fetches)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ThrottleGroupNameColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"throttlegroup\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(throttleGroup)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"bandwidth\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(bandwidth)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"connections\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(connections)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"fetches\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(fetches)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"throttlegroup\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(throttleGroup)+"\"/>\n" |
| ); |
| } |
| |
| // Proxy tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Proxy"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyHostColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"40\" name=\"proxyhost\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyHost)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyPortColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"proxyport\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyPort)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyAuthenticationDomainColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"proxyauthdomain\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthDomain)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyAuthenticationUserNameColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"proxyauthusername\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthUsername)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyAuthenticationPasswordColon") + "</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"password\" size=\"16\" name=\"proxyauthpassword\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthPassword)+"\"/></td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"proxyhost\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyHost)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"proxyport\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyPort)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"proxyauthusername\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthUsername)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"proxyauthdomain\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthDomain)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"proxyauthpassword\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthPassword)+"\"/>\n" |
| ); |
| } |
| } |
| |
| /** Process a configuration post. |
| * This method is called at the start of the connector's configuration page, whenever there is a possibility that form data for a connection has been |
| * posted. Its purpose is to gather form information and modify the configuration parameters accordingly. |
| * The name of the posted form is "editconnection". |
| *@param threadContext is the local thread context. |
| *@param variableContext is the set of variables available from the post, including binary file post information. |
| *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. |
| *@return null if all is well, or a string error message if there is an error that should prevent saving of the connection (and cause a redirection to an error page). |
| */ |
| @Override |
| public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext, |
| Locale locale, ConfigParams parameters) |
| throws ManifoldCFException |
| { |
| String email = variableContext.getParameter("email"); |
| if (email != null) |
| parameters.setParameter(RSSConfig.PARAMETER_EMAIL,email); |
| String robotsUsage = variableContext.getParameter("robotsusage"); |
| if (robotsUsage != null) |
| parameters.setParameter(RSSConfig.PARAMETER_ROBOTSUSAGE,robotsUsage); |
| String bandwidth = variableContext.getParameter("bandwidth"); |
| if (bandwidth != null) |
| parameters.setParameter(RSSConfig.PARAMETER_BANDWIDTH,bandwidth); |
| String connections = variableContext.getParameter("connections"); |
| if (connections != null) |
| parameters.setParameter(RSSConfig.PARAMETER_MAXOPEN,connections); |
| String fetches = variableContext.getParameter("fetches"); |
| if (fetches != null) |
| parameters.setParameter(RSSConfig.PARAMETER_MAXFETCHES,fetches); |
| String throttleGroup = variableContext.getParameter("throttlegroup"); |
| if (throttleGroup != null) |
| parameters.setParameter(RSSConfig.PARAMETER_THROTTLEGROUP,throttleGroup); |
| String proxyHost = variableContext.getParameter("proxyhost"); |
| if (proxyHost != null) |
| parameters.setParameter(RSSConfig.PARAMETER_PROXYHOST,proxyHost); |
| String proxyPort = variableContext.getParameter("proxyport"); |
| if (proxyPort != null) |
| parameters.setParameter(RSSConfig.PARAMETER_PROXYPORT,proxyPort); |
| String proxyAuthDomain = variableContext.getParameter("proxyauthdomain"); |
| if (proxyAuthDomain != null) |
| parameters.setParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN,proxyAuthDomain); |
| String proxyAuthUsername = variableContext.getParameter("proxyauthusername"); |
| if (proxyAuthUsername != null) |
| parameters.setParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME,proxyAuthUsername); |
| String proxyAuthPassword = variableContext.getParameter("proxyauthpassword"); |
| if (proxyAuthPassword != null) |
| parameters.setObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD,variableContext.mapKeyToPassword(proxyAuthPassword)); |
| |
| return null; |
| } |
| |
| /** View configuration. |
| * This method is called in the body section of the connector's view configuration page. Its purpose is to present the connection information to the user. |
| * The coder can presume that the HTML that is output from this configuration will be within appropriate <html> and <body> tags. |
| *@param threadContext is the local thread context. |
| *@param out is the output to which any HTML should be sent. |
| *@param parameters are the configuration parameters, as they currently exist, for this connection being configured. |
| */ |
| @Override |
| public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, |
| Locale locale, ConfigParams parameters) |
| throws ManifoldCFException, IOException |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ParametersColon") + "</nobr></td>\n"+ |
| " <td class=\"value\" colspan=\"3\">\n" |
| ); |
| Iterator iter = parameters.listParameters(); |
| while (iter.hasNext()) |
| { |
| String param = (String)iter.next(); |
| String value = parameters.getParameter(param); |
| if (param.length() >= "password".length() && param.substring(param.length()-"password".length()).equalsIgnoreCase("password")) |
| { |
| out.print( |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"=********</nobr><br/>\n" |
| ); |
| } |
| else if (param.length() >="keystore".length() && param.substring(param.length()-"keystore".length()).equalsIgnoreCase("keystore")) |
| { |
| IKeystoreManager kmanager = KeystoreManagerFactory.make("",value); |
| out.print( |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"=<"+Integer.toString(kmanager.getContents().length)+Messages.getBodyString(locale,"RSSConnector.certificates")+"></nobr><br/>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"="+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(value)+"</nobr><br/>\n" |
| ); |
| } |
| } |
| out.print( |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| |
| /** Output the specification header section. |
| * This method is called in the head section of a job page which has selected a repository connection of the current type. Its purpose is to add the required tabs |
| * to the list, and to output any javascript methods that might be needed by the job editing HTML. |
| *@param out is the output to which any HTML should be sent. |
| *@param ds is the current document specification for this job. |
| *@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector. |
| */ |
| @Override |
| public void outputSpecificationHeader(IHTTPOutput out, Locale locale, DocumentSpecification ds, List<String> tabsArray) |
| throws ManifoldCFException, IOException |
| { |
| tabsArray.add(Messages.getString(locale,"RSSConnector.URLs")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Canonicalization")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.URLMappings")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Exclusions")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.TimeValues")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Security")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.Metadata")); |
| tabsArray.add(Messages.getString(locale,"RSSConnector.DechromedContent")); |
| out.print( |
| "<script type=\"text/javascript\">\n"+ |
| "<!--\n"+ |
| "function SpecOp(n, opValue, anchorvalue)\n"+ |
| "{\n"+ |
| " eval(\"editjob.\"+n+\".value = \\\"\"+opValue+\"\\\"\");\n"+ |
| " postFormSetAnchor(anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function AddRegexp(anchorvalue)\n"+ |
| "{\n"+ |
| " if (editjob.rssmatch.value == \"\")\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.MatchMustHaveARegexpValue")+"\");\n"+ |
| " editjob.rssmatch.focus();\n"+ |
| " return;\n"+ |
| " }\n"+ |
| "\n"+ |
| " SpecOp(\"rssop\",\"Add\",anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function RemoveRegexp(index, anchorvalue)\n"+ |
| "{\n"+ |
| " editjob.rssindex.value = index;\n"+ |
| " SpecOp(\"rssop\",\"Delete\",anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function SpecAddToken(anchorvalue)\n"+ |
| "{\n"+ |
| " if (editjob.spectoken.value == \"\")\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.TypeInAnAccessToken")+"\");\n"+ |
| " editjob.spectoken.focus();\n"+ |
| " return;\n"+ |
| " }\n"+ |
| " SpecOp(\"accessop\",\"Add\",anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function SpecAddMetadata(anchorvalue)\n"+ |
| "{\n"+ |
| " if (editjob.specmetaname.value == \"\")\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.TypeInMetadataName")+"\");\n"+ |
| " editjob.specmetaname.focus();\n"+ |
| " return;\n"+ |
| " }\n"+ |
| " if (editjob.specmetavalue.value == \"\")\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getString(locale,"RSSConnector.TypeInMetadataValue")+"\");\n"+ |
| " editjob.specmetavalue.focus();\n"+ |
| " return;\n"+ |
| " }\n"+ |
| " SpecOp(\"metadataop\",\"Add\",anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function URLRegexpDelete(index, anchorvalue)\n"+ |
| "{\n"+ |
| " editjob.urlregexpnumber.value = index;\n"+ |
| " SpecOp(\"urlregexpop\",\"Delete\",anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function URLRegexpAdd(anchorvalue)\n"+ |
| "{\n"+ |
| " SpecOp(\"urlregexpop\",\"Add\",anchorvalue);\n"+ |
| "}\n"+ |
| "\n"+ |
| "function checkSpecification()\n"+ |
| "{\n"+ |
| " if (editjob.feedtimeout.value == \"\" || !isInteger(editjob.feedtimeout.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.ATimeoutValueInSecondsIsRequired")+"\");\n"+ |
| " editjob.feedtimeout.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " if (editjob.feedrefetch.value == \"\" || !isInteger(editjob.feedrefetch.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.ARefetchIntervalInMinutesIsRequired")+"\");\n"+ |
| " editjob.feedrefetch.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " if (editjob.minfeedrefetch.value == \"\" || !isInteger(editjob.minfeedrefetch.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.AMinimumRefetchIntervalInMinutesIsRequire")+"\");\n"+ |
| " editjob.minfeedrefetch.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| " if (editjob.badfeedrefetch.value != \"\" && !isInteger(editjob.badfeedrefetch.value))\n"+ |
| " {\n"+ |
| " alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.ABadFeedRefetchIntervalInMinutesIsRequired")+"\");\n"+ |
| " editjob.badfeedrefetch.focus();\n"+ |
| " return false;\n"+ |
| " }\n"+ |
| "\n"+ |
| " return true;\n"+ |
| "}\n"+ |
| "\n"+ |
| "//-->\n"+ |
| "</script>\n" |
| ); |
| } |
| |
| /** Output the specification body section. |
| * This method is called in the body section of a job page which has selected a repository connection of the current type. Its purpose is to present the required form elements for editing. |
| * The coder can presume that the HTML that is output from this configuration will be within appropriate <html>, <body>, and <form> tags. The name of the |
| * form is "editjob". |
| *@param out is the output to which any HTML should be sent. |
| *@param ds is the current document specification for this job. |
| *@param tabName is the current tab name. |
| */ |
| @Override |
| public void outputSpecificationBody(IHTTPOutput out, Locale locale, DocumentSpecification ds, String tabName) |
| throws ManifoldCFException, IOException |
| { |
| int i; |
| int k; |
| |
| |
| // Build the url seed string, and the url regexp match and map |
| StringBuilder sb = new StringBuilder(); |
| ArrayList regexp = new ArrayList(); |
| ArrayList matchStrings = new ArrayList(); |
| int feedTimeoutValue = 60; |
| int feedRefetchValue = 60; |
| int minFeedRefetchValue = 15; |
| Integer badFeedRefetchValue = null; |
| String exclusions = ""; |
| |
| // Now, loop through paths |
| i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_FEED)) |
| { |
| String rssURL = sn.getAttributeValue(RSSConfig.ATTR_URL); |
| if (rssURL != null) |
| { |
| sb.append(rssURL).append("\n"); |
| } |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_EXCLUDES)) |
| { |
| exclusions = sn.getValue(); |
| if (exclusions == null) |
| exclusions = ""; |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_MAP)) |
| { |
| String match = sn.getAttributeValue(RSSConfig.ATTR_MATCH); |
| String map = sn.getAttributeValue(RSSConfig.ATTR_MAP); |
| if (match != null) |
| { |
| regexp.add(match); |
| if (map == null) |
| map = ""; |
| matchStrings.add(map); |
| } |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) |
| { |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| feedTimeoutValue = Integer.parseInt(value); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN)) |
| { |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| feedRefetchValue = Integer.parseInt(value); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) |
| { |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| minFeedRefetchValue = Integer.parseInt(value); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) |
| { |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| badFeedRefetchValue = new Integer(value); |
| } |
| } |
| |
| // URLs tab |
| |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.URLs"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\" colspan=\"2\">\n"+ |
| " <textarea rows=\"25\" cols=\"80\" name=\"rssurls\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(sb.toString())+"</textarea>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"rssurls\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(sb.toString())+"\"/>\n" |
| ); |
| } |
| |
| // Exclusions tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Exclusions"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.Exclude") + "</nobr></td>\n"+ |
| " <td class=\"value\" colspan=\"1\">\n"+ |
| " <textarea rows=\"25\" cols=\"60\" name=\"exclusions\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(exclusions)+"</textarea>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"exclusions\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(exclusions)+"\"/>\n" |
| ); |
| } |
| |
| // Canonicalization tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Canonicalization"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"boxcell\" colspan=\"2\">\n"+ |
| " <input type=\"hidden\" name=\"urlregexpop\" value=\"Continue\"/>\n"+ |
| " <input type=\"hidden\" name=\"urlregexpnumber\" value=\"\"/>\n"+ |
| " <table class=\"formtable\">\n"+ |
| " <tr class=\"formheaderrow\">\n"+ |
| " <td class=\"formcolumnheader\"></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLRegularExpression")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Description")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Reorder")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveJSPSessions")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveASPSessions")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemovePHPSessions")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveBVSessions")+"</nobr></td>\n"+ |
| " </tr>\n" |
| ); |
| int q = 0; |
| int l = 0; |
| while (q < ds.getChildCount()) |
| { |
| SpecificationNode specNode = ds.getChild(q++); |
| if (specNode.getType().equals(RSSConfig.NODE_URLSPEC)) |
| { |
| // Ok, this node matters to us |
| String regexpString = specNode.getAttributeValue(RSSConfig.ATTR_REGEXP); |
| String description = specNode.getAttributeValue(RSSConfig.ATTR_DESCRIPTION); |
| if (description == null) |
| description = ""; |
| String allowReorder = specNode.getAttributeValue(RSSConfig.ATTR_REORDER); |
| if (allowReorder == null || allowReorder.length() == 0) |
| allowReorder = RSSConfig.VALUE_NO; |
| String allowJavaSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); |
| if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0) |
| allowJavaSessionRemoval = RSSConfig.VALUE_NO; |
| String allowASPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); |
| if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0) |
| allowASPSessionRemoval = RSSConfig.VALUE_NO; |
| String allowPHPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); |
| if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0) |
| allowPHPSessionRemoval = RSSConfig.VALUE_NO; |
| String allowBVSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); |
| if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0) |
| allowBVSessionRemoval = RSSConfig.VALUE_NO; |
| out.print( |
| " <tr class=\""+(((l % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+ |
| " <td class=\"formcolumncell\">\n"+ |
| " <a name=\""+"urlregexp_"+Integer.toString(l)+"\">\n"+ |
| " <input type=\"button\" value=\"Delete\" alt=\""+Messages.getAttributeString(locale,"RSSConnector.DeleteUrlRegexp")+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString)+"\" onclick='javascript:URLRegexpDelete("+Integer.toString(l)+",\"urlregexp_"+Integer.toString(l)+"\");'/>\n"+ |
| " </a>\n"+ |
| " </td>\n"+ |
| " <td class=\"formcolumncell\">\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexp_"+Integer.toString(l)+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString)+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexpdesc_"+Integer.toString(l)+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(description)+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexpreorder_"+Integer.toString(l)+"\" value=\""+allowReorder+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexpjava_"+Integer.toString(l)+"\" value=\""+allowJavaSessionRemoval+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexpasp_"+Integer.toString(l)+"\" value=\""+allowASPSessionRemoval+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexpphp_"+Integer.toString(l)+"\" value=\""+allowPHPSessionRemoval+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"urlregexpbv_"+Integer.toString(l)+"\" value=\""+allowBVSessionRemoval+"\"/>\n"+ |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(regexpString)+"</nobr>\n"+ |
| " </td>\n"+ |
| " <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(description)+"</td>\n"+ |
| " <td class=\"formcolumncell\">"+allowReorder+"</td>\n"+ |
| " <td class=\"formcolumncell\">"+allowJavaSessionRemoval+"</td>\n"+ |
| " <td class=\"formcolumncell\">"+allowASPSessionRemoval+"</td>\n"+ |
| " <td class=\"formcolumncell\">"+allowPHPSessionRemoval+"</td>\n"+ |
| " <td class=\"formcolumncell\">"+allowBVSessionRemoval+"</td>\n"+ |
| " </tr>\n" |
| ); |
| |
| l++; |
| } |
| } |
| if (l == 0) |
| { |
| out.print( |
| " <tr class=\"formrow\"><td colspan=\"8\" class=\"formcolumnmessage\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoCanonicalizationSpecified")+"</nobr></td></tr>\n" |
| ); |
| } |
| out.print( |
| " <tr class=\"formrow\"><td colspan=\"8\" class=\"formseparator\"><hr/></td></tr>\n"+ |
| " <tr class=\"formrow\">\n"+ |
| " <td class=\"formcolumncell\">\n"+ |
| " <a name=\""+"urlregexp_"+Integer.toString(l)+"\">\n"+ |
| " <input type=\"button\" value=\"Add\" alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddUlRegexp")+"\" onclick='javascript:URLRegexpAdd(\"urlregexp_"+Integer.toString(l+1)+"\");'/>\n"+ |
| " <input type=\"hidden\" name=\"urlregexpcount\" value=\""+Integer.toString(l)+"\"/>\n"+ |
| " </a>\n"+ |
| " </td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"text\" name=\"urlregexp\" size=\"30\" value=\"\"/></td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"text\" name=\"urlregexpdesc\" size=\"30\" value=\"\"/></td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"urlregexpreorder\" value=\"yes\"/></td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"urlregexpjava\" value=\"yes\" checked=\"true\"/></td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"urlregexpasp\" value=\"yes\" checked=\"true\"/></td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"urlregexpphp\" value=\"yes\" checked=\"true\"/></td>\n"+ |
| " <td class=\"formcolumncell\"><input type=\"checkbox\" name=\"urlregexpbv\" value=\"yes\" checked=\"true\"/></td>\n"+ |
| " </tr>\n"+ |
| " </table>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| // Post the canonicalization specification |
| int q = 0; |
| int l = 0; |
| while (q < ds.getChildCount()) |
| { |
| SpecificationNode specNode = ds.getChild(q++); |
| if (specNode.getType().equals(RSSConfig.NODE_URLSPEC)) |
| { |
| // Ok, this node matters to us |
| String regexpString = specNode.getAttributeValue(RSSConfig.ATTR_REGEXP); |
| String description = specNode.getAttributeValue(RSSConfig.ATTR_DESCRIPTION); |
| if (description == null) |
| description = ""; |
| String allowReorder = specNode.getAttributeValue(RSSConfig.ATTR_REORDER); |
| if (allowReorder == null || allowReorder.length() == 0) |
| allowReorder = RSSConfig.VALUE_NO; |
| String allowJavaSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); |
| if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0) |
| allowJavaSessionRemoval = RSSConfig.VALUE_NO; |
| String allowASPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); |
| if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0) |
| allowASPSessionRemoval = RSSConfig.VALUE_NO; |
| String allowPHPSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); |
| if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0) |
| allowPHPSessionRemoval = RSSConfig.VALUE_NO; |
| String allowBVSessionRemoval = specNode.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); |
| if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0) |
| allowBVSessionRemoval = RSSConfig.VALUE_NO; |
| out.print( |
| "<input type=\"hidden\" name=\""+"urlregexp_"+Integer.toString(l)+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"urlregexpdesc_"+Integer.toString(l)+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(description)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"urlregexpreorder_"+Integer.toString(l)+"\" value=\""+allowReorder+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"urlregexpjava_"+Integer.toString(l)+"\" value=\""+allowJavaSessionRemoval+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"urlregexpasp_"+Integer.toString(l)+"\" value=\""+allowASPSessionRemoval+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"urlregexpphp_"+Integer.toString(l)+"\" value=\""+allowPHPSessionRemoval+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"urlregexpbv_"+Integer.toString(l)+"\" value=\""+allowBVSessionRemoval+"\"/>\n" |
| ); |
| l++; |
| } |
| } |
| out.print( |
| "<input type=\"hidden\" name=\"urlregexpcount\" value=\""+Integer.toString(l)+"\"/>\n" |
| ); |
| } |
| |
| // Mappings tab |
| |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.URLMappings"))) |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"rssop\" value=\"\"/>\n"+ |
| "<input type=\"hidden\" name=\"rssindex\" value=\"\"/>\n"+ |
| "<input type=\"hidden\" name=\"rssmapcount\" value=\""+Integer.toString(regexp.size())+"\"/>\n"+ |
| "\n"+ |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"4\"><hr/></td></tr>\n" |
| ); |
| |
| i = 0; |
| while (i < regexp.size()) |
| { |
| String prefix = "rssregexp_"+Integer.toString(i)+"_"; |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"value\">\n"+ |
| " <a name=\""+"regexp_"+Integer.toString(i)+"\">\n"+ |
| " <input type=\"button\" value=\"Remove\" onclick='javascript:RemoveRegexp("+Integer.toString(i)+",\"regexp_"+Integer.toString(i)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.RemoveRegexp")+Integer.toString(i)+"\"/>\n"+ |
| " </a>\n"+ |
| " </td>\n"+ |
| " <td class=\"value\"><input type=\"hidden\" name=\""+prefix+"match"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape((String)regexp.get(i))+"\"/>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape((String)regexp.get(i))+"</td>\n"+ |
| " <td class=\"value\">==></td>\n"+ |
| " <td class=\"value\">\n" |
| ); |
| String match = (String)matchStrings.get(i); |
| out.print( |
| " <input type=\"hidden\" name=\""+prefix+"map"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(match)+"\"/>\n" |
| ); |
| if (match.length() == 0) |
| { |
| out.print( |
| " <as is>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(match)+"\n" |
| ); |
| } |
| out.print( |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| i++; |
| } |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"value\"><a name=\""+"regexp_"+Integer.toString(i)+"\"><input type=\"button\" value=\"Add\" onclick='javascript:AddRegexp(\"regexp_"+Integer.toString(i+1)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddRegexp")+"\"/></a></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" name=\"rssmatch\" size=\"16\" value=\"\"/></td>\n"+ |
| " <td class=\"value\">==></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" name=\"rssmap\" size=\"16\" value=\"\"/></td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"rssmapcount\" value=\""+Integer.toString(regexp.size())+"\"/>\n" |
| ); |
| i = 0; |
| while (i < regexp.size()) |
| { |
| String prefix = "rssregexp_"+Integer.toString(i)+"_"; |
| String match = (String)matchStrings.get(i); |
| out.print( |
| "<input type=\"hidden\" name=\""+prefix+"match"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape((String)regexp.get(i))+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+prefix+"map"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(match)+"\"/>\n" |
| ); |
| i++; |
| } |
| } |
| |
| // Timeout Value tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.TimeValues"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.FeedConnectTimeout")+"</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"feedtimeout\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedTimeoutValue))+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.DefaultFeedRefetchTime")+"</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"feedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedRefetchValue))+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.MinimumFeedRefetchTime")+"</nobr></td>\n"+ |
| " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"minfeedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(minFeedRefetchValue))+"\"/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.BadFeedRefetchTime")+"</nobr></td>\n"+ |
| " <td class=\"value\">\n"+ |
| " <input type=\"hidden\" name=\"badfeedrefetch_present\" value=\"true\"/>\n"+ |
| " <input type=\"text\" size=\"5\" name=\"badfeedrefetch\" value=\""+((badFeedRefetchValue==null)?"":org.apache.manifoldcf.ui.util.Encoder.attributeEscape(badFeedRefetchValue.toString()))+"\"/>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"feedtimeout\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedTimeoutValue))+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"feedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedRefetchValue))+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"minfeedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(minFeedRefetchValue))+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"badfeedrefetch_present\" value=\"true\"/>\n"+ |
| "<input type=\"hidden\" name=\"badfeedrefetch\" value=\""+((badFeedRefetchValue==null)?"":org.apache.manifoldcf.ui.util.Encoder.attributeEscape(badFeedRefetchValue.toString()))+"\"/>\n" |
| ); |
| } |
| |
| // Dechromed content tab |
| String dechromedMode = RSSConfig.VALUE_NONE; |
| String chromedMode = RSSConfig.VALUE_USE; |
| i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) |
| dechromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); |
| else if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE)) |
| chromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); |
| } |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.DechromedContent"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"1\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\"><nobr><input type=\"radio\" name=\"dechromedmode\" value=\"none\" "+(dechromedMode.equals(RSSConfig.VALUE_NONE)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NoDechromedContent")+"</nobr></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\"><nobr><input type=\"radio\" name=\"dechromedmode\" value=\"description\" "+(dechromedMode.equals(RSSConfig.VALUE_DESCRIPTION)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.DechromedContentIfPresentInDescriptionField")+"</nobr></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\"><nobr><input type=\"radio\" name=\"dechromedmode\" value=\"content\" "+(dechromedMode.equals(RSSConfig.VALUE_CONTENT)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.DechromedContentIfPresentInContentField")+"</nobr></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"separator\"><hr/></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\"><nobr><input type=\"radio\" name=\"chromedmode\" value=\"use\" "+(chromedMode.equals(RSSConfig.VALUE_USE)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.UseChromedContentIfNoDechromedContentFound")+"</nobr></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\"><nobr><input type=\"radio\" name=\"chromedmode\" value=\"skip\" "+(chromedMode.equals(RSSConfig.VALUE_SKIP)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NeverUseChromedContent")+"</nobr></td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"value\"><nobr><input type=\"radio\" name=\"chromedmode\" value=\"metadata\" "+(chromedMode.equals(RSSConfig.VALUE_METADATA)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NoContentMetadataOnly")+"</nobr></td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| "<input type=\"hidden\" name=\"dechromedmode\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(dechromedMode)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\"chromedmode\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(chromedMode)+"\"/>\n" |
| ); |
| } |
| |
| // Security tab |
| // There is no native security, so all we care about are the tokens. |
| i = 0; |
| |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Security"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" |
| ); |
| // Go through forced ACL |
| i = 0; |
| k = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_ACCESS)) |
| { |
| String accessDescription = "_"+Integer.toString(k); |
| String accessOpName = "accessop"+accessDescription; |
| String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN); |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"description\">\n"+ |
| " <input type=\"hidden\" name=\""+accessOpName+"\" value=\"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"spectoken"+accessDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(token)+"\"/>\n"+ |
| " <a name=\""+"token_"+Integer.toString(k)+"\">\n"+ |
| " <input type=\"button\" value=\"Delete\" onClick='Javascript:SpecOp(\""+accessOpName+"\",\"Delete\",\"token_"+Integer.toString(k)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.DeleteToken")+Integer.toString(k)+"\"/>\n"+ |
| " </a> \n"+ |
| " </td>\n"+ |
| " <td class=\"value\">\n"+ |
| " "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(token)+"\n"+ |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| k++; |
| } |
| } |
| if (k == 0) |
| { |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"message\" colspan=\"2\">" + Messages.getBodyString(locale,"RSSConnector.NoAccessTokensPresent") + "</td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| out.print( |
| " <tr><td class=\"lightseparator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\">\n"+ |
| " <input type=\"hidden\" name=\"tokencount\" value=\""+Integer.toString(k)+"\"/>\n"+ |
| " <input type=\"hidden\" name=\"accessop\" value=\"\"/>\n"+ |
| " <a name=\""+"token_"+Integer.toString(k)+"\">\n"+ |
| " <input type=\"button\" value=\"Add\" onClick='Javascript:SpecAddToken(\"token_"+Integer.toString(k+1)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddAccessToken")+"\"/>\n"+ |
| " </a> \n"+ |
| " </td>\n"+ |
| " <td class=\"value\">\n"+ |
| " <input type=\"text\" size=\"30\" name=\"spectoken\" value=\"\"/>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| } |
| else |
| { |
| // Finally, go through forced ACL |
| i = 0; |
| k = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_ACCESS)) |
| { |
| String accessDescription = "_"+Integer.toString(k); |
| String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN); |
| out.print( |
| "<input type=\"hidden\" name=\""+"spectoken"+accessDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(token)+"\"/>\n" |
| ); |
| k++; |
| } |
| } |
| out.print( |
| "<input type=\"hidden\" name=\"tokencount\" value=\""+Integer.toString(k)+"\"/>\n" |
| ); |
| } |
| |
| // "Metadata" tab |
| if (tabName.equals(Messages.getString(locale,"RSSConnector.Metadata"))) |
| { |
| out.print( |
| "<table class=\"displaytable\">\n"+ |
| " <tr><td class=\"separator\" colspan=\"4\"><hr/></td></tr>\n" |
| ); |
| // Go through metadata |
| i = 0; |
| k = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_METADATA)) |
| { |
| String metadataDescription = "_"+Integer.toString(k); |
| String metadataOpName = "metadataop"+metadataDescription; |
| String name = sn.getAttributeValue(RSSConfig.ATTR_NAME); |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"description\">\n"+ |
| " <input type=\"hidden\" name=\""+metadataOpName+"\" value=\"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"specmetaname"+metadataDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(name)+"\"/>\n"+ |
| " <input type=\"hidden\" name=\""+"specmetavalue"+metadataDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(value)+"\"/>\n"+ |
| " <a name=\""+"metadata_"+Integer.toString(k)+"\">\n"+ |
| " <input type=\"button\" value=\"Delete\" onClick='Javascript:SpecOp(\""+metadataOpName+"\",\"Delete\",\"metadata_"+Integer.toString(k)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.DeleteMetadata")+Integer.toString(k)+"\"/>\n"+ |
| " </a> \n"+ |
| " </td>\n"+ |
| " <td class=\"value\">\n"+ |
| " "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(name)+"\n"+ |
| " </td>\n"+ |
| " <td class=\"value\">=</td>\n"+ |
| " <td class=\"value\">\n"+ |
| " "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(value)+"\n"+ |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| k++; |
| } |
| |
| } |
| if (k == 0) |
| { |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"message\" colspan=\"4\">"+Messages.getBodyString(locale,"RSSConnector.NoMetadataPresent")+"</td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| out.print( |
| " <tr><td class=\"lightseparator\" colspan=\"4\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\">\n"+ |
| " <input type=\"hidden\" name=\"metadatacount\" value=\""+Integer.toString(k)+"\"/>\n"+ |
| " <input type=\"hidden\" name=\"metadataop\" value=\"\"/>\n"+ |
| " <a name=\""+"metadata_"+Integer.toString(k)+"\">\n"+ |
| " <input type=\"button\" value=\"Add\" onClick='Javascript:SpecAddMetadata(\"metadata_"+Integer.toString(k+1)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddMetadata")+"\"/>\n"+ |
| " </a> \n"+ |
| " </td>\n"+ |
| " <td class=\"value\">\n"+ |
| " <input type=\"text\" size=\"30\" name=\"specmetaname\" value=\"\"/>\n"+ |
| " </td>\n"+ |
| " <td class=\"value\">=</td>\n"+ |
| " <td class=\"value\">\n"+ |
| " <input type=\"text\" size=\"80\" name=\"specmetavalue\" value=\"\"/>\n"+ |
| " </td>\n"+ |
| " </tr>\n"+ |
| "</table>\n" |
| ); |
| |
| } |
| else |
| { |
| // Finally, go through metadata |
| i = 0; |
| k = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_METADATA)) |
| { |
| String metadataDescription = "_"+Integer.toString(k); |
| String name = sn.getAttributeValue(RSSConfig.ATTR_NAME); |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| out.print( |
| "<input type=\"hidden\" name=\""+"specmetaname"+metadataDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(name)+"\"/>\n"+ |
| "<input type=\"hidden\" name=\""+"specmetavalue"+metadataDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(value)+"\"/>\n" |
| ); |
| k++; |
| } |
| } |
| out.print( |
| "<input type=\"hidden\" name=\"metadatacount\" value=\""+Integer.toString(k)+"\"/>\n" |
| ); |
| |
| } |
| } |
| |
| /** Process a specification post. |
| * This method is called at the start of job's edit or view page, whenever there is a possibility that form data for a connection has been |
| * posted. Its purpose is to gather form information and modify the document specification accordingly. |
| * The name of the posted form is "editjob". |
| *@param variableContext contains the post data, including binary file-upload information. |
| *@param ds is the current document specification for this job. |
| *@return null if all is well, or a string error message if there is an error that should prevent saving of the job (and cause a redirection to an error page). |
| */ |
| @Override |
| public String processSpecificationPost(IPostParameters variableContext, Locale locale, DocumentSpecification ds) |
| throws ManifoldCFException |
| { |
| // Get the map |
| String value = variableContext.getParameter("rssmapcount"); |
| if (value != null) |
| { |
| int mapsize = Integer.parseInt(value); |
| |
| // Clear it first |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_MAP)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| |
| // Grab the map values |
| j = 0; |
| while (j < mapsize) |
| { |
| String prefix = "rssregexp_"+Integer.toString(j)+"_"; |
| String match = variableContext.getParameter(prefix+"match"); |
| String map = variableContext.getParameter(prefix+"map"); |
| if (map == null) |
| map = ""; |
| // Add to the documentum specification |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MAP); |
| node.setAttribute(RSSConfig.ATTR_MATCH,match); |
| node.setAttribute(RSSConfig.ATTR_MAP,map); |
| ds.addChild(ds.getChildCount(),node); |
| |
| j++; |
| } |
| } |
| |
| // Get the cgiPath |
| String rssURLSequence = variableContext.getParameter("rssurls"); |
| if (rssURLSequence != null) |
| { |
| // Delete all url specs first |
| int i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i); |
| if (sn.getType().equals(RSSConfig.NODE_FEED)) |
| ds.removeChild(i); |
| else |
| i++; |
| } |
| |
| try |
| { |
| java.io.Reader str = new java.io.StringReader(rssURLSequence); |
| try |
| { |
| java.io.BufferedReader is = new java.io.BufferedReader(str); |
| try |
| { |
| while (true) |
| { |
| String nextString = is.readLine(); |
| if (nextString == null) |
| break; |
| if (nextString.length() == 0) |
| continue; |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEED); |
| node.setAttribute(RSSConfig.ATTR_URL,nextString); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| } |
| finally |
| { |
| is.close(); |
| } |
| } |
| finally |
| { |
| str.close(); |
| } |
| } |
| catch (java.io.IOException e) |
| { |
| throw new ManifoldCFException("IO error",e); |
| } |
| } |
| |
| // Read the url specs |
| String urlRegexpCount = variableContext.getParameter("urlregexpcount"); |
| if (urlRegexpCount != null && urlRegexpCount.length() > 0) |
| { |
| int regexpCount = Integer.parseInt(urlRegexpCount); |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_URLSPEC)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| |
| // Grab the operation and the index (if any) |
| String operation = variableContext.getParameter("urlregexpop"); |
| if (operation == null) |
| operation = "Continue"; |
| int opIndex = -1; |
| if (operation.equals("Delete")) |
| opIndex = Integer.parseInt(variableContext.getParameter("urlregexpnumber")); |
| |
| // Reconstruct urlspec nodes |
| j = 0; |
| while (j < regexpCount) |
| { |
| // For each index, first look for a delete operation |
| if (!operation.equals("Delete") || j != opIndex) |
| { |
| // Add the jth node |
| String regexp = variableContext.getParameter("urlregexp_"+Integer.toString(j)); |
| String regexpDescription = variableContext.getParameter("urlregexpdesc_"+Integer.toString(j)); |
| String reorder = variableContext.getParameter("urlregexpreorder_"+Integer.toString(j)); |
| String javaSession = variableContext.getParameter("urlregexpjava_"+Integer.toString(j)); |
| String aspSession = variableContext.getParameter("urlregexpasp_"+Integer.toString(j)); |
| String phpSession = variableContext.getParameter("urlregexpphp_"+Integer.toString(j)); |
| String bvSession = variableContext.getParameter("urlregexpbv_"+Integer.toString(j)); |
| SpecificationNode newSn = new SpecificationNode(RSSConfig.NODE_URLSPEC); |
| newSn.setAttribute(RSSConfig.ATTR_REGEXP,regexp); |
| if (regexpDescription != null && regexpDescription.length() > 0) |
| newSn.setAttribute(RSSConfig.VALUE_DESCRIPTION,regexpDescription); |
| if (reorder != null && reorder.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_REORDER,reorder); |
| if (javaSession != null && javaSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_JAVASESSIONREMOVAL,javaSession); |
| if (aspSession != null && aspSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_ASPSESSIONREMOVAL,aspSession); |
| if (phpSession != null && phpSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_PHPSESSIONREMOVAL,phpSession); |
| if (bvSession != null && bvSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_BVSESSIONREMOVAL,bvSession); |
| ds.addChild(ds.getChildCount(),newSn); |
| } |
| j++; |
| } |
| if (operation.equals("Add")) |
| { |
| String regexp = variableContext.getParameter("urlregexp"); |
| String regexpDescription = variableContext.getParameter("urlregexpdesc"); |
| String reorder = variableContext.getParameter("urlregexpreorder"); |
| String javaSession = variableContext.getParameter("urlregexpjava"); |
| String aspSession = variableContext.getParameter("urlregexpasp"); |
| String phpSession = variableContext.getParameter("urlregexpphp"); |
| String bvSession = variableContext.getParameter("urlregexpbv"); |
| |
| // Add a new node at the end |
| SpecificationNode newSn = new SpecificationNode(RSSConfig.NODE_URLSPEC); |
| newSn.setAttribute(RSSConfig.ATTR_REGEXP,regexp); |
| if (regexpDescription != null && regexpDescription.length() > 0) |
| newSn.setAttribute(RSSConfig.VALUE_DESCRIPTION,regexpDescription); |
| if (reorder != null && reorder.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_REORDER,reorder); |
| if (javaSession != null && javaSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_JAVASESSIONREMOVAL,javaSession); |
| if (aspSession != null && aspSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_ASPSESSIONREMOVAL,aspSession); |
| if (phpSession != null && phpSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_PHPSESSIONREMOVAL,phpSession); |
| if (bvSession != null && bvSession.length() > 0) |
| newSn.setAttribute(RSSConfig.ATTR_BVSESSIONREMOVAL,bvSession); |
| ds.addChild(ds.getChildCount(),newSn); |
| } |
| } |
| |
| // Get the exclusions |
| String exclusions = variableContext.getParameter("exclusions"); |
| if (exclusions != null) |
| { |
| // Delete existing exclusions record first |
| int i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i); |
| if (sn.getType().equals(RSSConfig.NODE_EXCLUDES)) |
| ds.removeChild(i); |
| else |
| i++; |
| } |
| |
| SpecificationNode cn = new SpecificationNode(RSSConfig.NODE_EXCLUDES); |
| cn.setValue(exclusions); |
| ds.addChild(ds.getChildCount(),cn); |
| } |
| |
| // Read the feed timeout, if present |
| String feedTimeoutValue = variableContext.getParameter("feedtimeout"); |
| if (feedTimeoutValue != null && feedTimeoutValue.length() > 0) |
| { |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEEDTIMEOUT); |
| node.setAttribute(RSSConfig.ATTR_VALUE,feedTimeoutValue); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| |
| // Read the feed refetch interval, if present |
| String feedRefetchValue = variableContext.getParameter("feedrefetch"); |
| if (feedRefetchValue != null && feedRefetchValue.length() > 0) |
| { |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEEDRESCAN); |
| node.setAttribute(RSSConfig.ATTR_VALUE,feedRefetchValue); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| |
| // Read the minimum feed refetch interval, if present |
| String minFeedRefetchValue = variableContext.getParameter("minfeedrefetch"); |
| if (minFeedRefetchValue != null && minFeedRefetchValue.length() > 0) |
| { |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MINFEEDRESCAN); |
| node.setAttribute(RSSConfig.ATTR_VALUE,minFeedRefetchValue); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| |
| // Read the bad feed refetch interval (which is allowed to be null) |
| String badFeedRefetchValuePresent = variableContext.getParameter("badfeedrefetch_present"); |
| if (badFeedRefetchValuePresent != null && badFeedRefetchValuePresent.length() > 0) |
| { |
| String badFeedRefetchValue = variableContext.getParameter("badfeedrefetch"); |
| int k = 0; |
| while (k < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(k); |
| if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) |
| ds.removeChild(k); |
| else |
| k++; |
| } |
| if (badFeedRefetchValue != null && badFeedRefetchValue.length() > 0) |
| { |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_BADFEEDRESCAN); |
| node.setAttribute(RSSConfig.ATTR_VALUE,badFeedRefetchValue); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| } |
| |
| // Read the dechromed mode |
| String dechromedMode = variableContext.getParameter("dechromedmode"); |
| if (dechromedMode != null && dechromedMode.length() > 0) |
| { |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_DECHROMEDMODE); |
| node.setAttribute(RSSConfig.ATTR_MODE,dechromedMode); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| |
| // Read the chromed mode |
| String chromedMode = variableContext.getParameter("chromedmode"); |
| if (chromedMode != null && chromedMode.length() > 0) |
| { |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE)) |
| ds.removeChild(j); |
| else |
| j++; |
| } |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_CHROMEDMODE); |
| node.setAttribute(RSSConfig.ATTR_MODE,chromedMode); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| |
| // Now, do whatever action we were told to do. |
| String rssop = variableContext.getParameter("rssop"); |
| if (rssop != null && rssop.equals("Add")) |
| { |
| // Add a match to the end |
| String match = variableContext.getParameter("rssmatch"); |
| String map = variableContext.getParameter("rssmap"); |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MAP); |
| node.setAttribute(RSSConfig.ATTR_MATCH,match); |
| node.setAttribute(RSSConfig.ATTR_MAP,map); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| else if (rssop != null && rssop.equals("Delete")) |
| { |
| int index = Integer.parseInt(variableContext.getParameter("rssindex")); |
| int j = 0; |
| while (j < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(j); |
| if (sn.getType().equals(RSSConfig.NODE_MAP)) |
| { |
| if (index == 0) |
| { |
| ds.removeChild(j); |
| break; |
| } |
| index--; |
| } |
| j++; |
| } |
| } |
| |
| String xc = variableContext.getParameter("tokencount"); |
| if (xc != null) |
| { |
| // Delete all tokens first |
| int i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i); |
| if (sn.getType().equals(RSSConfig.NODE_ACCESS)) |
| ds.removeChild(i); |
| else |
| i++; |
| } |
| |
| int accessCount = Integer.parseInt(xc); |
| i = 0; |
| while (i < accessCount) |
| { |
| String accessDescription = "_"+Integer.toString(i); |
| String accessOpName = "accessop"+accessDescription; |
| xc = variableContext.getParameter(accessOpName); |
| if (xc != null && xc.equals("Delete")) |
| { |
| // Next row |
| i++; |
| continue; |
| } |
| // Get the stuff we need |
| String accessSpec = variableContext.getParameter("spectoken"+accessDescription); |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_ACCESS); |
| node.setAttribute(RSSConfig.ATTR_TOKEN,accessSpec); |
| ds.addChild(ds.getChildCount(),node); |
| i++; |
| } |
| |
| String op = variableContext.getParameter("accessop"); |
| if (op != null && op.equals("Add")) |
| { |
| String accessspec = variableContext.getParameter("spectoken"); |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_ACCESS); |
| node.setAttribute(RSSConfig.ATTR_TOKEN,accessspec); |
| ds.addChild(ds.getChildCount(),node); |
| } |
| } |
| |
| xc = variableContext.getParameter("metadatacount"); |
| if (xc != null) |
| { |
| // Delete all tokens first |
| int i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i); |
| if (sn.getType().equals(RSSConfig.NODE_METADATA)) |
| ds.removeChild(i); |
| else |
| i++; |
| } |
| |
| int metadataCount = Integer.parseInt(xc); |
| i = 0; |
| while (i < metadataCount) |
| { |
| String metadataDescription = "_"+Integer.toString(i); |
| String metadataOpName = "metadataop"+metadataDescription; |
| xc = variableContext.getParameter(metadataOpName); |
| if (xc != null && xc.equals("Delete")) |
| { |
| // Next row |
| i++; |
| continue; |
| } |
| // Get the stuff we need |
| String metaNameSpec = variableContext.getParameter("specmetaname"+metadataDescription); |
| String metaValueSpec = variableContext.getParameter("specmetavalue"+metadataDescription); |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_METADATA); |
| node.setAttribute(RSSConfig.ATTR_NAME,metaNameSpec); |
| node.setAttribute(RSSConfig.ATTR_VALUE,metaValueSpec); |
| ds.addChild(ds.getChildCount(),node); |
| i++; |
| } |
| |
| String op = variableContext.getParameter("metadataop"); |
| if (op != null && op.equals("Add")) |
| { |
| String metaNameSpec = variableContext.getParameter("specmetaname"); |
| String metaValueSpec = variableContext.getParameter("specmetavalue"); |
| |
| SpecificationNode node = new SpecificationNode(RSSConfig.NODE_METADATA); |
| node.setAttribute(RSSConfig.ATTR_NAME,metaNameSpec); |
| node.setAttribute(RSSConfig.ATTR_VALUE,metaValueSpec); |
| |
| ds.addChild(ds.getChildCount(),node); |
| } |
| } |
| return null; |
| } |
| |
| /** View specification. |
| * This method is called in the body section of a job's view page. Its purpose is to present the document specification information to the user. |
| * The coder can presume that the HTML that is output from this configuration will be within appropriate <html> and <body> tags. |
| *@param out is the output to which any HTML should be sent. |
| *@param ds is the current document specification for this job. |
| */ |
| @Override |
| public void viewSpecification(IHTTPOutput out, Locale locale, DocumentSpecification ds) |
| throws ManifoldCFException, IOException |
| { |
| String exclusions = ""; |
| |
| out.print( |
| "<table class=\"displaytable\">\n" |
| ); |
| int i = 0; |
| boolean seenAny = false; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_FEED)) |
| { |
| if (seenAny == false) |
| { |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RSSUrls")+"</nobr></td>\n"+ |
| " <td class=\"value\">\n" |
| ); |
| seenAny = true; |
| } |
| out.print( |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(sn.getAttributeValue(RSSConfig.ATTR_URL))+"</nobr><br/>\n" |
| ); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_EXCLUDES)) |
| { |
| exclusions = sn.getValue(); |
| if (exclusions == null) |
| exclusions = ""; |
| } |
| } |
| |
| if (seenAny) |
| { |
| out.print( |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " <tr><td class=\"message\" colspan=\"2\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoRSSUrlsSpecified")+"</nobr></td></tr>\n" |
| ); |
| } |
| out.print( |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" |
| ); |
| i = 0; |
| int l = 0; |
| seenAny = false; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_URLSPEC)) |
| { |
| if (l == 0) |
| { |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLCanonicalization")+"</nobr></td>\n"+ |
| " <td class=\"value\">\n"+ |
| " <table class=\"formtable\">\n"+ |
| " <tr class=\"formheaderrow\">\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLRegexp")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Description")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Reorder")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveJSPSessions")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveASPSessions")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemovePHPSessions")+"</nobr></td>\n"+ |
| " <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveBVSessions")+"</nobr></td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| String regexpString = sn.getAttributeValue(RSSConfig.ATTR_REGEXP); |
| String description = sn.getAttributeValue(RSSConfig.ATTR_DESCRIPTION); |
| if (description == null) |
| description = ""; |
| String allowReorder = sn.getAttributeValue(RSSConfig.ATTR_REORDER); |
| if (allowReorder == null || allowReorder.length() == 0) |
| allowReorder = RSSConfig.VALUE_NO; |
| String allowJavaSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); |
| if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0) |
| allowJavaSessionRemoval = RSSConfig.VALUE_NO; |
| String allowASPSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); |
| if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0) |
| allowASPSessionRemoval = RSSConfig.VALUE_NO; |
| String allowPHPSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); |
| if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0) |
| allowPHPSessionRemoval = RSSConfig.VALUE_NO; |
| String allowBVSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); |
| if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0) |
| allowBVSessionRemoval = RSSConfig.VALUE_NO; |
| out.print( |
| " <tr class=\""+(((l % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+ |
| " <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(regexpString)+"</nobr></td>\n"+ |
| " <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(description)+"</td>\n"+ |
| " <td class=\"formcolumncell\"><nobr>"+allowReorder+"</nobr></td>\n"+ |
| " <td class=\"formcolumncell\"><nobr>"+allowJavaSessionRemoval+"</nobr></td>\n"+ |
| " <td class=\"formcolumncell\"><nobr>"+allowASPSessionRemoval+"</nobr></td>\n"+ |
| " <td class=\"formcolumncell\"><nobr>"+allowPHPSessionRemoval+"</nobr></td>\n"+ |
| " <td class=\"formcolumncell\"><nobr>"+allowBVSessionRemoval+"</nobr></td>\n"+ |
| " </tr>\n" |
| ); |
| l++; |
| } |
| } |
| if (l > 0) |
| { |
| out.print( |
| " </table>\n"+ |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " <tr><td class=\"message\" colspan=\"2\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoCanonicalizationSpecified")+"</nobr></td></tr>\n" |
| ); |
| } |
| |
| out.print( |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" |
| ); |
| i = 0; |
| seenAny = false; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_MAP)) |
| { |
| if (seenAny == false) |
| { |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLMappingsColon")+"</nobr></td>\n"+ |
| " <td class=\"value\">\n" |
| ); |
| seenAny = true; |
| } |
| String match = sn.getAttributeValue(RSSConfig.ATTR_MATCH); |
| String map = sn.getAttributeValue(RSSConfig.ATTR_MAP); |
| out.print( |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(match)+"</nobr>\n" |
| ); |
| if (map != null && map.length() > 0) |
| { |
| out.print( |
| " --> <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(map)+"</nobr>\n" |
| ); |
| } |
| out.print( |
| " <br/>\n" |
| ); |
| } |
| } |
| |
| if (seenAny) |
| { |
| out.print( |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " <tr><td class=\"message\" colspan=\"2\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoMappingsSpecifiedWillAcceptAllUrls")+"</nobr></td></tr>\n" |
| ); |
| } |
| out.print( |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.Exclude") + "</nobr></td>\n"+ |
| " <td class=\"value\">\n" |
| ); |
| try |
| { |
| java.io.Reader str = new java.io.StringReader(exclusions); |
| try |
| { |
| java.io.BufferedReader is = new java.io.BufferedReader(str); |
| try |
| { |
| while (true) |
| { |
| String nextString = is.readLine(); |
| if (nextString == null) |
| break; |
| if (nextString.length() == 0) |
| continue; |
| out.print( |
| " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString)+"</nobr><br/>\n" |
| ); |
| } |
| } |
| finally |
| { |
| is.close(); |
| } |
| } |
| finally |
| { |
| str.close(); |
| } |
| } |
| catch (java.io.IOException e) |
| { |
| throw new ManifoldCFException("IO error: "+e.getMessage(),e); |
| } |
| out.print( |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| out.print( |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" |
| ); |
| String feedTimeoutValue = "60"; |
| String feedRefetchValue = "60"; |
| String minFeedRefetchValue = "15"; |
| String badFeedRefetchValue = null; |
| String dechromedMode = RSSConfig.VALUE_NONE; |
| String chromedMode = RSSConfig.VALUE_USE; |
| i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) |
| { |
| feedTimeoutValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN)) |
| { |
| feedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) |
| { |
| minFeedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) |
| { |
| badFeedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) |
| { |
| dechromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); |
| } |
| else if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE)) |
| { |
| chromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE); |
| } |
| } |
| out.print( |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.FeedConnectionTimeout")+"</nobr></td>\n"+ |
| " <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(feedTimeoutValue)+"</td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.DefaultFeedRescanInterval")+"</nobr></td>\n"+ |
| " <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(feedRefetchValue)+"</td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.MinimumFeedRescanInterval")+"</nobr></td>\n"+ |
| " <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(minFeedRefetchValue)+"</td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.BadFeedRescanInterval")+"</nobr></td>\n"+ |
| " <td class=\"value\">"+((badFeedRefetchValue==null)?"(Default feed rescan value)":org.apache.manifoldcf.ui.util.Encoder.bodyEscape(badFeedRefetchValue))+"</td>\n"+ |
| |
| " </tr>\n"+ |
| " \n"+ |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+ |
| "\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.DechromedContentSource")+"</nobr></td>\n"+ |
| " <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(dechromedMode)+"</td>\n"+ |
| " </tr>\n"+ |
| " <tr>\n"+ |
| " <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.ChromedContent")+"</nobr></td>\n"+ |
| " <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(chromedMode)+"</td>\n"+ |
| " </tr>\n"+ |
| "\n" |
| ); |
| out.print( |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" |
| ); |
| |
| // Go through looking for access tokens |
| seenAny = false; |
| i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_ACCESS)) |
| { |
| if (seenAny == false) |
| { |
| out.print( |
| " <tr><td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.AccessTokens")+"</nobr></td>\n"+ |
| " <td class=\"value\">\n" |
| ); |
| seenAny = true; |
| } |
| String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN); |
| out.print( |
| " "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(token)+"<br/>\n" |
| ); |
| } |
| } |
| |
| if (seenAny) |
| { |
| out.print( |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.NoAccessTokensSpecified") + "</nobr></td></tr>\n" |
| ); |
| } |
| out.print( |
| " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n" |
| ); |
| // Go through looking for metadata |
| seenAny = false; |
| i = 0; |
| while (i < ds.getChildCount()) |
| { |
| SpecificationNode sn = ds.getChild(i++); |
| if (sn.getType().equals(RSSConfig.NODE_METADATA)) |
| { |
| if (seenAny == false) |
| { |
| out.print( |
| " <tr><td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MetadataColon") + "</nobr></td>\n"+ |
| " <td class=\"value\">\n" |
| ); |
| seenAny = true; |
| } |
| String name = sn.getAttributeValue(RSSConfig.ATTR_NAME); |
| String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE); |
| out.print( |
| " "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(name)+" = "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(value)+"<br/>\n" |
| ); |
| } |
| } |
| |
| if (seenAny) |
| { |
| out.print( |
| " </td>\n"+ |
| " </tr>\n" |
| ); |
| } |
| else |
| { |
| out.print( |
| " <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.NoMetadataSpecified") + "</nobr></td></tr>\n" |
| ); |
| } |
| out.print( |
| "</table>\n" |
| ); |
| } |
| |
| /** Handle an RSS feed document, using SAX to limit the memory impact */ |
| protected void handleRSSFeedSAX(String documentIdentifier, IProcessActivity activities, Filter filter) |
| throws ManifoldCFException, ServiceInterruption |
| { |
| // The SAX model uses parsing events to control parsing, which allows me to manage memory usage much better. |
| // This is essential for when a feed contains dechromed content as well as links. |
| |
| // First, catch all flavors of IO exception, and handle them properly |
| try |
| { |
| // Open the input stream, and set up the parse |
| InputStream is = cache.getData(documentIdentifier); |
| if (is == null) |
| { |
| Logging.connectors.error("RSS: Document '"+documentIdentifier+"' should be in cache but isn't"); |
| return; |
| } |
| try |
| { |
| Parser p = new Parser(); |
| // Parse the document. This will cause various things to occur, within the instantiated XMLParsingContext class. |
| XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState(); |
| OuterContextClass c = new OuterContextClass(x,documentIdentifier,activities,filter); |
| x.setContext(c); |
| try |
| { |
| // Believe it or not, there are no parsing errors we can get back now. |
| p.parseWithCharsetDetection(null,is,x); |
| c.checkIfValidFeed(); |
| c.setDefaultRescanTimeIfNeeded(); |
| } |
| finally |
| { |
| x.cleanup(); |
| } |
| } |
| finally |
| { |
| is.close(); |
| } |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("Socket timeout error: "+e.getMessage(),e); |
| } |
| catch (ConnectTimeoutException e) |
| { |
| throw new ManifoldCFException("Socket connect timeout error: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO error: "+e.getMessage(),e); |
| } |
| |
| } |
| |
| /** This class handles the outermost XML context for the feed document. */ |
| protected class OuterContextClass extends XMLParsingContext |
| { |
| /** Keep track of the number of valid feed signals we saw */ |
| protected int outerTagCount = 0; |
| /** The document identifier */ |
| protected String documentIdentifier; |
| /** Activities interface */ |
| protected IProcessActivity activities; |
| /** Filter */ |
| protected Filter filter; |
| /** Flag indicating the the rescan time was set for this feed */ |
| protected boolean rescanTimeSet = false; |
| |
| public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentIdentifier, IProcessActivity activities, Filter filter) |
| { |
| super(theStream); |
| this.documentIdentifier = documentIdentifier; |
| this.activities = activities; |
| this.filter = filter; |
| } |
| |
| /** Check if feed was valid */ |
| public void checkIfValidFeed() |
| { |
| if (outerTagCount == 0) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: RSS document '"+documentIdentifier+"' does not have rss, feed, or rdf:RDF tag - not valid feed"); |
| } |
| } |
| |
| /** Check if the rescan flag was set or not, and if not, make sure it gets set properly */ |
| public void setDefaultRescanTimeIfNeeded() |
| throws ManifoldCFException |
| { |
| if (rescanTimeSet == false) |
| { |
| // Set it! |
| // Need to set the requeue parameters appropriately, since otherwise the feed reverts to default document |
| // rescan or expire behavior. |
| long currentTime = System.currentTimeMillis(); |
| Long rescanTime = filter.getBadFeedRescanTime(currentTime); |
| if (rescanTime == null) |
| rescanTime = filter.getDefaultRescanTime(currentTime); |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"' setting default rescan time to "+((rescanTime==null)?"null":rescanTime.toString())); |
| |
| activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null); |
| rescanTimeSet = true; |
| } |
| } |
| |
| /** Handle the tag beginning to set the correct second-level parsing context */ |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| if (localName.equals("rss")) |
| { |
| // RSS feed detected |
| outerTagCount++; |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Parsed bottom-level XML for RSS document '"+documentIdentifier+"'"); |
| return new RSSContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter); |
| } |
| else if (localName.equals("RDF")) |
| { |
| // RDF/Atom feed detected |
| outerTagCount++; |
| return new RDFContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter); |
| } |
| else if (localName.equals("feed")) |
| { |
| // Basic feed detected |
| outerTagCount++; |
| return new FeedContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter); |
| } |
| else if (localName.equals("urlset") || localName.equals("sitemapindex")) |
| { |
| // Sitemap detected |
| outerTagCount++; |
| return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter); |
| } |
| |
| // The default action is to establish a new default context. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| |
| /** Handle the tag ending */ |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext context = theStream.getContext(); |
| String tagName = context.getLocalname(); |
| if (tagName.equals("rss")) |
| { |
| rescanTimeSet = ((RSSContextClass)context).process(); |
| } |
| else if (tagName.equals("RDF")) |
| { |
| rescanTimeSet = ((RDFContextClass)context).process(); |
| } |
| else if (tagName.equals("feed")) |
| { |
| rescanTimeSet = ((FeedContextClass)context).process(); |
| } |
| else if (tagName.equals("urlset") || tagName.equals("sitemapindex")) |
| { |
| rescanTimeSet = ((UrlsetContextClass)context).process(); |
| } |
| else |
| super.endTag(); |
| } |
| |
| } |
| |
| protected class RSSContextClass extends XMLParsingContext |
| { |
| /** The document identifier */ |
| protected String documentIdentifier; |
| /** Activities interface */ |
| protected IProcessActivity activities; |
| /** Filter */ |
| protected Filter filter; |
| /** Rescan time set flag */ |
| protected boolean rescanTimeSet = false; |
| |
| public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.documentIdentifier = documentIdentifier; |
| this.activities = activities; |
| this.filter = filter; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // Handle each channel |
| if (localName.equals("channel")) |
| { |
| // Channel detected |
| return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter); |
| } |
| |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| // If it's our channel tag, process global channel information |
| XMLParsingContext context = theStream.getContext(); |
| String tagName = context.getLocalname(); |
| if (tagName.equals("channel")) |
| { |
| rescanTimeSet = ((RSSChannelContextClass)context).process(); |
| } |
| else |
| super.endTag(); |
| } |
| |
| /** Process this data */ |
| protected boolean process() |
| throws ManifoldCFException |
| { |
| return rescanTimeSet; |
| } |
| |
| } |
| |
| protected class RSSChannelContextClass extends XMLParsingContext |
| { |
| /** The document identifier */ |
| protected String documentIdentifier; |
| /** Activities interface */ |
| protected IProcessActivity activities; |
| /** Filter */ |
| protected Filter filter; |
| |
| /** TTL value is set on a per-channel basis */ |
| protected String ttlValue = null; |
| |
| public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.documentIdentifier = documentIdentifier; |
| this.activities = activities; |
| this.filter = filter; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "ttl" and "item", nothing else. |
| if (localName.equals("ttl")) |
| { |
| // TTL value seen. Prepare to record it, as a string. |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("item")) |
| { |
| // Item seen. We don't need any of the attributes etc., but we need to start a new context. |
| return new RSSItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode()); |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("ttl")) |
| // If the current context must be the TTL one, record its data value. |
| ttlValue = ((XMLStringParsingContext)theContext).getValue(); |
| else if (theTag.equals("item")) |
| { |
| // It's an item. |
| RSSItemContextClass itemContext = (RSSItemContextClass)theContext; |
| // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: |
| // (1) File name (if any), containing dechromed content |
| // (2) Link name(s) |
| // (3) Pubdate |
| // (4) Title |
| // The job now is to pull this info out and call the activities interface appropriately. |
| |
| // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up |
| // all dangling files etc. that need to be removed. |
| // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() |
| // method will be called regardless. |
| itemContext.process(documentIdentifier,activities,filter); |
| } |
| else |
| super.endTag(); |
| } |
| |
| /** Process this data, return true if rescan time was set */ |
| protected boolean process() |
| throws ManifoldCFException |
| { |
| // Deal with the ttlvalue, if it was found |
| // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. |
| long currentTime = System.currentTimeMillis(); |
| Long rescanTime = filter.getDefaultRescanTime(currentTime); |
| if (ttlValue != null) |
| { |
| try |
| { |
| int minutes = Integer.parseInt(ttlValue); |
| long nextTime = currentTime + minutes * 60000L; |
| rescanTime = new Long(nextTime); |
| // Set the upper bound time; we want to scan the feeds aggressively. |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly"); |
| } |
| catch (NumberFormatException e) |
| { |
| Logging.connectors.warn("RSS: RSS document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'"); |
| } |
| } |
| |
| if (rescanTime != null) |
| { |
| Long minimumTime = filter.getMinimumRescanTime(currentTime); |
| if (minimumTime != null) |
| { |
| if (rescanTime.longValue() < minimumTime.longValue()) |
| rescanTime = minimumTime; |
| } |
| } |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString())); |
| |
| activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null); |
| return true; |
| } |
| } |
| |
| protected class RSSItemContextClass extends XMLParsingContext |
| { |
| protected int dechromedContentMode; |
| protected String guidField = null; |
| protected String linkField = null; |
| protected String pubDateField = null; |
| protected String titleField = null; |
| protected String descriptionField = null; |
| protected String authorEmailField = null; |
| protected String authorNameField = null; |
| protected ArrayList categoryField = new ArrayList(); |
| protected File contentsFile = null; |
| |
| public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.dechromedContentMode = dechromedContentMode; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "ttl" and "item", nothing else. |
| if (localName.equals("link")) |
| { |
| // "link" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("guid")) |
| { |
| // "guid" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("pubdate")) |
| { |
| // "pubDate" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("title")) |
| { |
| // "title" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("category")) |
| { |
| // "category" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("author")) |
| { |
| // "author" tag, which contains email |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("creator")) |
| { |
| // "creator" tag which contains name (like dc:creator) |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else |
| { |
| // Handle potentially longer fields. Both "description" and "content" fields can potentially be large; they are thus |
| // processed as temporary files. But the dance is complicated because (a) we only want one PRIMARY content source, |
| // and (b) we want access to the description field, if it is not used as primary content. |
| switch (dechromedContentMode) |
| { |
| case DECHROMED_NONE: |
| if (localName.equals("description")) |
| { |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| break; |
| case DECHROMED_DESCRIPTION: |
| if (localName.equals("description")) |
| { |
| try |
| { |
| File tempFile = File.createTempFile("_rssdata_","tmp"); |
| return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile); |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| } |
| break; |
| case DECHROMED_CONTENT: |
| if (localName.equals("content")) |
| { |
| try |
| { |
| File tempFile = File.createTempFile("_rssdata_","tmp"); |
| return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile); |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| } |
| else if (localName.equals("description")) |
| { |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| break; |
| default: |
| break; |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| } |
| |
| /** Convert the individual sub-fields of the item context into their final forms */ |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("link")) |
| { |
| linkField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("guid")) |
| { |
| guidField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("pubdate")) |
| { |
| pubDateField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("title")) |
| { |
| titleField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("category")) |
| { |
| categoryField.add(((XMLStringParsingContext)theContext).getValue()); |
| } |
| else if (theTag.equals("author")) |
| { |
| authorEmailField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("creator")) |
| { |
| authorNameField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else |
| { |
| // What we want is: (a) if dechromed mode is NONE, just put the description file in the description field; (b) |
| // if dechromed mode is "description", put the description field in the primary content field; (c) |
| // if dechromed mode is "content", put the content field in the primary content field, and the description field in the description field. |
| switch (dechromedContentMode) |
| { |
| case DECHROMED_NONE: |
| if (theTag.equals("description")) |
| { |
| descriptionField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| break; |
| case DECHROMED_DESCRIPTION: |
| if (theTag.equals("description")) |
| { |
| // Content file has been written; retrieve it (being sure not to leak any files already hanging around!) |
| tagCleanup(); |
| contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile(); |
| return; |
| } |
| break; |
| case DECHROMED_CONTENT: |
| if (theTag.equals("content")) |
| { |
| tagCleanup(); |
| // Retrieve content file |
| contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile(); |
| return; |
| } |
| else if (theTag.equals("description")) |
| { |
| descriptionField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| break; |
| default: |
| break; |
| } |
| |
| super.endTag(); |
| } |
| } |
| |
| protected void tagCleanup() |
| throws ManifoldCFException |
| { |
| // Delete the contents file if it is there. |
| if (contentsFile != null) |
| { |
| contentsFile.delete(); |
| contentsFile = null; |
| } |
| } |
| |
| /** Process the data accumulated for this item */ |
| public void process(String documentIdentifier, IProcessActivity activities, Filter filter) |
| throws ManifoldCFException |
| { |
| if (linkField == null || linkField.length() == 0) |
| linkField = guidField; |
| |
| if (linkField != null && linkField.length() > 0) |
| { |
| Date origDateDate = null; |
| if (pubDateField != null && pubDateField.length() > 0) |
| { |
| origDateDate = DateParser.parseRFC822Date(pubDateField); |
| // Special for China Daily News |
| if (origDateDate == null) |
| origDateDate = DateParser.parseChinaDate(pubDateField); |
| // Special for LL |
| if (origDateDate == null) |
| origDateDate = DateParser.parseISO8601Date(pubDateField); |
| } |
| Long origDate; |
| if (origDateDate != null) |
| origDate = new Long(origDateDate.getTime()); |
| else |
| origDate = null; |
| |
| String[] links = linkField.split(", "); |
| int l = 0; |
| while (l < links.length) |
| { |
| String rawURL = links[l++].trim(); |
| // Process the link |
| String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL); |
| if (newIdentifier != null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+ |
| ((origDate==null)?"null":origDate.toString())); |
| if (filter.isLegalURL(newIdentifier)) |
| { |
| if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY) |
| { |
| // It's a reference! Add it. |
| String[] dataNames = new String[]{"pubdate","title","source","authoremail","authorname","category","description"}; |
| String[][] dataValues = new String[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| if (titleField != null) |
| dataValues[1] = new String[]{titleField}; |
| dataValues[2] = new String[]{documentIdentifier}; |
| if (authorEmailField != null) |
| dataValues[3] = new String[]{authorEmailField}; |
| if (authorNameField != null) |
| dataValues[4] = new String[]{authorNameField}; |
| dataValues[5] = new String[categoryField.size()]; |
| int q = 0; |
| while (q < categoryField.size()) |
| { |
| (dataValues[5])[q] = (String)categoryField.get(q); |
| q++; |
| } |
| if (descriptionField != null) |
| dataValues[6] = new String[]{descriptionField}; |
| // Add document reference, not including the data to pass down, but including a description |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| else |
| { |
| // The issue here is that if a document is ingested without a jobqueue entry, the document will not |
| // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make |
| // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document |
| // right here. |
| |
| // Since the dechromed data is available from the feed, the possibility remains of passing the document |
| |
| // Now, set up the carrydown info |
| String[] dataNames = new String[]{"pubdate","title","source","authoremail","authorname","category","data","description"}; |
| Object[][] dataValues = new Object[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| if (titleField != null) |
| dataValues[1] = new String[]{titleField}; |
| dataValues[2] = new String[]{documentIdentifier}; |
| if (authorEmailField != null) |
| dataValues[3] = new String[]{authorEmailField}; |
| if (authorNameField != null) |
| dataValues[4] = new String[]{authorNameField}; |
| dataValues[5] = new String[categoryField.size()]; |
| int q = 0; |
| while (q < categoryField.size()) |
| { |
| (dataValues[5])[q] = (String)categoryField.get(q); |
| q++; |
| } |
| |
| if (descriptionField != null) |
| dataValues[7] = new String[]{descriptionField}; |
| |
| if (contentsFile == null) |
| { |
| CharacterInput ci = new NullCharacterInput(); |
| try |
| { |
| dataValues[6] = new Object[]{ci}; |
| |
| // Add document reference, including the data to pass down, and the dechromed content too |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| finally |
| { |
| ci.discard(); |
| } |
| } |
| else |
| { |
| CharacterInput ci = new TempFileCharacterInput(contentsFile); |
| try |
| { |
| contentsFile = null; |
| dataValues[6] = new Object[]{ci}; |
| |
| // Add document reference, including the data to pass down, and the dechromed content too |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| finally |
| { |
| ci.discard(); |
| } |
| } |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded"); |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'"); |
| } |
| } |
| } |
| } |
| } |
| |
| protected class RDFContextClass extends XMLParsingContext |
| { |
| /** The document identifier */ |
| protected String documentIdentifier; |
| /** Activities interface */ |
| protected IProcessActivity activities; |
| /** Filter */ |
| protected Filter filter; |
| |
| /** ttl value */ |
| protected String ttlValue = null; |
| |
| public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.documentIdentifier = documentIdentifier; |
| this.activities = activities; |
| this.filter = filter; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "ttl" and "item", nothing else. |
| if (localName.equals("ttl")) |
| { |
| // TTL value seen. Prepare to record it, as a string. |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("item")) |
| { |
| // Item seen. We don't need any of the attributes etc., but we need to start a new context. |
| return new RDFItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode()); |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("ttl")) |
| // If the current context must be the TTL one, record its data value. |
| ttlValue = ((XMLStringParsingContext)theContext).getValue(); |
| else if (theTag.equals("item")) |
| { |
| // It's an item. |
| RDFItemContextClass itemContext = (RDFItemContextClass)theContext; |
| // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: |
| // (1) File name (if any), containing dechromed content |
| // (2) Link name(s) |
| // (3) Pubdate |
| // (4) Title |
| // The job now is to pull this info out and call the activities interface appropriately. |
| |
| // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up |
| // all dangling files etc. that need to be removed. |
| // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() |
| // method will be called regardless. |
| itemContext.process(documentIdentifier,activities,filter); |
| } |
| else |
| super.endTag(); |
| } |
| |
| /** Process this data */ |
| protected boolean process() |
| throws ManifoldCFException |
| { |
| // Deal with the ttlvalue, if it was found |
| // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. |
| long currentTime = System.currentTimeMillis(); |
| Long rescanTime = filter.getDefaultRescanTime(currentTime); |
| if (ttlValue != null) |
| { |
| try |
| { |
| int minutes = Integer.parseInt(ttlValue); |
| long nextTime = currentTime + minutes * 60000L; |
| rescanTime = new Long(nextTime); |
| // Set the upper bound time; we want to scan the feeds aggressively. |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly"); |
| } |
| catch (NumberFormatException e) |
| { |
| Logging.connectors.warn("RSS: RDF document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'"); |
| } |
| } |
| |
| if (rescanTime != null) |
| { |
| Long minimumTime = filter.getMinimumRescanTime(currentTime); |
| if (minimumTime != null) |
| { |
| if (rescanTime.longValue() < minimumTime.longValue()) |
| rescanTime = minimumTime; |
| } |
| } |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString())); |
| |
| activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null); |
| return true; |
| } |
| } |
| |
| protected class RDFItemContextClass extends XMLParsingContext |
| { |
| protected int dechromedContentMode; |
| protected String linkField = null; |
| protected String pubDateField = null; |
| protected String titleField = null; |
| protected String authorNameField = null; |
| protected String descriptionField = null; |
| protected File contentsFile = null; |
| |
| public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.dechromedContentMode = dechromedContentMode; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "ttl" and "item", nothing else. |
| if (localName.equals("link")) |
| { |
| // "link" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("date")) |
| { |
| // "dc:date" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("title")) |
| { |
| // "title" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("creator")) |
| { |
| // "creator" tag (e.g. "dc:creator") |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else |
| { |
| switch (dechromedContentMode) |
| { |
| case DECHROMED_NONE: |
| if (localName.equals("description")) |
| { |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| break; |
| case DECHROMED_DESCRIPTION: |
| if (localName.equals("description")) |
| { |
| try |
| { |
| File tempFile = File.createTempFile("_rssdata_","tmp"); |
| return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile); |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| } |
| break; |
| case DECHROMED_CONTENT: |
| if (localName.equals("content")) |
| { |
| try |
| { |
| File tempFile = File.createTempFile("_rssdata_","tmp"); |
| return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile); |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| } |
| else if (localName.equals("description")) |
| { |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| break; |
| default: |
| break; |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| } |
| |
| /** Convert the individual sub-fields of the item context into their final forms */ |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("link")) |
| { |
| linkField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("date")) |
| { |
| pubDateField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("title")) |
| { |
| titleField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("creator")) |
| { |
| authorNameField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else |
| { |
| switch (dechromedContentMode) |
| { |
| case DECHROMED_NONE: |
| if (theTag.equals("description")) |
| { |
| descriptionField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| break; |
| case DECHROMED_DESCRIPTION: |
| if (theTag.equals("description")) |
| { |
| // Content file has been written; retrieve it (being sure not to leak any files already hanging around!) |
| tagCleanup(); |
| contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile(); |
| return; |
| } |
| break; |
| case DECHROMED_CONTENT: |
| if (theTag.equals("dc:content")) |
| { |
| // Retrieve content file |
| tagCleanup(); |
| contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile(); |
| return; |
| } |
| else if (theTag.equals("description")) |
| { |
| descriptionField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| break; |
| default: |
| break; |
| } |
| |
| super.endTag(); |
| } |
| } |
| |
| protected void tagCleanup() |
| throws ManifoldCFException |
| { |
| // Delete the contents file if it is there. |
| if (contentsFile != null) |
| { |
| contentsFile.delete(); |
| contentsFile = null; |
| } |
| } |
| |
| /** Process the data accumulated for this item */ |
| public void process(String documentIdentifier, IProcessActivity activities, Filter filter) |
| throws ManifoldCFException |
| { |
| if (linkField != null && linkField.length() > 0) |
| { |
| Date origDateDate = null; |
| if (pubDateField != null && pubDateField.length() > 0) |
| origDateDate = DateParser.parseISO8601Date(pubDateField); |
| |
| Long origDate; |
| if (origDateDate != null) |
| origDate = new Long(origDateDate.getTime()); |
| else |
| origDate = null; |
| |
| String[] links = linkField.split(", "); |
| int l = 0; |
| while (l < links.length) |
| { |
| String rawURL = links[l++].trim(); |
| // Process the link |
| String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL); |
| if (newIdentifier != null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+ |
| ((origDate==null)?"null":origDate.toString())); |
| if (filter.isLegalURL(newIdentifier)) |
| { |
| if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY) |
| { |
| // It's a reference! Add it. |
| String[] dataNames = new String[]{"pubdate","title","source","authorname","description"}; |
| String[][] dataValues = new String[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| if (titleField != null) |
| dataValues[1] = new String[]{titleField}; |
| dataValues[2] = new String[]{documentIdentifier}; |
| if (authorNameField != null) |
| dataValues[3] = new String[]{authorNameField}; |
| if (descriptionField != null) |
| dataValues[4] = new String[]{descriptionField}; |
| |
| // Add document reference, including the data to pass down |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| else |
| { |
| // The issue here is that if a document is ingested without a jobqueue entry, the document will not |
| // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make |
| // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document |
| // right here. |
| |
| // Now, set up the carrydown info |
| String[] dataNames = new String[]{"pubdate","title","source","authorname","data","description"}; |
| Object[][] dataValues = new Object[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| if (titleField != null) |
| dataValues[1] = new String[]{titleField}; |
| dataValues[2] = new String[]{documentIdentifier}; |
| if (authorNameField != null) |
| dataValues[3] = new String[]{authorNameField}; |
| if (descriptionField != null) |
| dataValues[5] = new String[]{descriptionField}; |
| |
| if (contentsFile == null) |
| { |
| CharacterInput ci = new NullCharacterInput(); |
| try |
| { |
| dataValues[4] = new Object[]{ci}; |
| |
| // Add document reference, including the data to pass down, and the dechromed content too |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| finally |
| { |
| ci.discard(); |
| } |
| } |
| else |
| { |
| CharacterInput ci = new TempFileCharacterInput(contentsFile); |
| try |
| { |
| contentsFile = null; |
| dataValues[4] = new Object[]{ci}; |
| |
| // Add document reference, including the data to pass down, and the dechromed content too |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| finally |
| { |
| ci.discard(); |
| } |
| } |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded"); |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'"); |
| } |
| } |
| } |
| } |
| } |
| |
| protected class FeedContextClass extends XMLParsingContext |
| { |
| /** The document identifier */ |
| protected String documentIdentifier; |
| /** Activities interface */ |
| protected IProcessActivity activities; |
| /** Filter */ |
| protected Filter filter; |
| |
| /** ttl value */ |
| protected String ttlValue = null; |
| |
| public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.documentIdentifier = documentIdentifier; |
| this.activities = activities; |
| this.filter = filter; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "ttl" and "item", nothing else. |
| if (localName.equals("ttl")) |
| { |
| // TTL value seen. Prepare to record it, as a string. |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("entry")) |
| { |
| // Item seen. We don't need any of the attributes etc., but we need to start a new context. |
| return new FeedItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode()); |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("ttl")) |
| // If the current context must be the TTL one, record its data value. |
| ttlValue = ((XMLStringParsingContext)theContext).getValue(); |
| else if (theTag.equals("entry")) |
| { |
| // It's an item. |
| FeedItemContextClass itemContext = (FeedItemContextClass)theContext; |
| // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: |
| // (1) File name (if any), containing dechromed content |
| // (2) Link name(s) |
| // (3) Pubdate |
| // (4) Title |
| // The job now is to pull this info out and call the activities interface appropriately. |
| |
| // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up |
| // all dangling files etc. that need to be removed. |
| // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() |
| // method will be called regardless. |
| itemContext.process(documentIdentifier,activities,filter); |
| } |
| else |
| super.endTag(); |
| } |
| |
| /** Process this data */ |
| protected boolean process() |
| throws ManifoldCFException |
| { |
| // Deal with the ttlvalue, if it was found |
| // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. |
| long currentTime = System.currentTimeMillis(); |
| Long rescanTime = filter.getDefaultRescanTime(currentTime); |
| if (ttlValue != null) |
| { |
| try |
| { |
| int minutes = Integer.parseInt(ttlValue); |
| long nextTime = currentTime + minutes * 60000L; |
| rescanTime = new Long(nextTime); |
| // Set the upper bound time; we want to scan the feeds aggressively. |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly"); |
| } |
| catch (NumberFormatException e) |
| { |
| Logging.connectors.warn("RSS: Atom document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'"); |
| } |
| } |
| |
| if (rescanTime != null) |
| { |
| Long minimumTime = filter.getMinimumRescanTime(currentTime); |
| if (minimumTime != null) |
| { |
| if (rescanTime.longValue() < minimumTime.longValue()) |
| rescanTime = minimumTime; |
| } |
| } |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString())); |
| |
| activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null); |
| return true; |
| } |
| } |
| |
| protected class FeedItemContextClass extends XMLParsingContext |
| { |
| protected int dechromedContentMode; |
| protected List<String> linkField = new ArrayList<String>(); |
| protected String pubDateField = null; |
| protected String titleField = null; |
| protected String authorNameField = null; |
| protected String authorEmailField = null; |
| protected ArrayList categoryField = new ArrayList(); |
| protected File contentsFile = null; |
| protected String descriptionField = null; |
| |
| public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.dechromedContentMode = dechromedContentMode; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "ttl" and "item", nothing else. |
| if (localName.equals("link")) |
| { |
| // "link" tag |
| String ref = atts.get("href"); |
| if (ref != null && ref.length() > 0) |
| linkField.add(ref); |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| else if (localName.equals("published") || localName.equals("updated")) |
| { |
| // "published" pr "updated" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("title")) |
| { |
| // "title" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("author")) |
| { |
| return new FeedAuthorContextClass(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("category")) |
| { |
| String category = atts.get("term"); |
| if (category != null && category.length() > 0) |
| categoryField.add(category); |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| else |
| { |
| switch (dechromedContentMode) |
| { |
| case DECHROMED_NONE: |
| if (localName.equals("subtitle")) |
| { |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| break; |
| case DECHROMED_DESCRIPTION: |
| if (localName.equals("subtitle")) |
| { |
| try |
| { |
| File tempFile = File.createTempFile("_rssdata_","tmp"); |
| return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile); |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| } |
| break; |
| case DECHROMED_CONTENT: |
| if (localName.equals("content")) |
| { |
| try |
| { |
| File tempFile = File.createTempFile("_rssdata_","tmp"); |
| return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile); |
| } |
| catch (java.net.SocketTimeoutException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| catch (InterruptedIOException e) |
| { |
| throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); |
| } |
| catch (IOException e) |
| { |
| throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e); |
| } |
| } |
| else if (localName.equals("subtitle")) |
| { |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| break; |
| default: |
| break; |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| } |
| |
| /** Convert the individual sub-fields of the item context into their final forms */ |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("published") || theTag.equals("updated")) |
| { |
| pubDateField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("title")) |
| { |
| titleField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("author")) |
| { |
| FeedAuthorContextClass authorContext = (FeedAuthorContextClass)theContext; |
| authorEmailField = authorContext.getAuthorEmail(); |
| authorNameField = authorContext.getAuthorName(); |
| } |
| else |
| { |
| switch (dechromedContentMode) |
| { |
| case DECHROMED_NONE: |
| if (theTag.equals("subtitle")) |
| { |
| titleField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| break; |
| case DECHROMED_DESCRIPTION: |
| if (theTag.equals("subtitle")) |
| { |
| // Content file has been written; retrieve it (being sure not to leak any files already hanging around!) |
| tagCleanup(); |
| contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile(); |
| return; |
| } |
| break; |
| case DECHROMED_CONTENT: |
| if (theTag.equals("content")) |
| { |
| // Retrieve content file |
| tagCleanup(); |
| contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile(); |
| return; |
| } |
| else if (theTag.equals("subtitle")) |
| { |
| titleField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| break; |
| default: |
| break; |
| } |
| |
| super.endTag(); |
| } |
| } |
| |
| protected void tagCleanup() |
| throws ManifoldCFException |
| { |
| // Delete the contents file if it is there. |
| if (contentsFile != null) |
| { |
| contentsFile.delete(); |
| contentsFile = null; |
| } |
| } |
| |
| /** Process the data accumulated for this item */ |
| public void process(String documentIdentifier, IProcessActivity activities, Filter filter) |
| throws ManifoldCFException |
| { |
| if (linkField.size() > 0) |
| { |
| Date origDateDate = null; |
| if (pubDateField != null && pubDateField.length() > 0) |
| origDateDate = DateParser.parseISO8601Date(pubDateField); |
| |
| Long origDate; |
| if (origDateDate != null) |
| origDate = new Long(origDateDate.getTime()); |
| else |
| origDate = null; |
| |
| for (String linkValue : linkField) |
| { |
| String[] links = linkValue.split(", "); |
| int l = 0; |
| while (l < links.length) |
| { |
| String rawURL = links[l++].trim(); |
| // Process the link |
| String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL); |
| if (newIdentifier != null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+ |
| ((origDate==null)?"null":origDate.toString())); |
| if (filter.isLegalURL(newIdentifier)) |
| { |
| if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY) |
| { |
| // It's a reference! Add it. |
| String[] dataNames = new String[]{"pubdate","title","source","category","description"}; |
| String[][] dataValues = new String[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| if (titleField != null) |
| dataValues[1] = new String[]{titleField}; |
| dataValues[2] = new String[]{documentIdentifier}; |
| dataValues[3] = new String[categoryField.size()]; |
| int q = 0; |
| while (q < categoryField.size()) |
| { |
| (dataValues[3])[q] = (String)categoryField.get(q); |
| q++; |
| } |
| if (descriptionField != null) |
| dataValues[4] = new String[]{descriptionField}; |
| |
| // Add document reference, including the data to pass down |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| else |
| { |
| // The issue here is that if a document is ingested without a jobqueue entry, the document will not |
| // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make |
| // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document |
| // right here. |
| |
| // Now, set up the carrydown info |
| String[] dataNames = new String[]{"pubdate","title","source","category","data","description"}; |
| Object[][] dataValues = new Object[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| if (titleField != null) |
| dataValues[1] = new String[]{titleField}; |
| dataValues[2] = new String[]{documentIdentifier}; |
| dataValues[3] = new String[categoryField.size()]; |
| int q = 0; |
| while (q < categoryField.size()) |
| { |
| (dataValues[3])[q] = (String)categoryField.get(q); |
| q++; |
| } |
| if (descriptionField != null) |
| dataValues[5] = new String[]{descriptionField}; |
| |
| if (contentsFile == null) |
| { |
| CharacterInput ci = new NullCharacterInput(); |
| try |
| { |
| dataValues[4] = new Object[]{ci}; |
| |
| // Add document reference, including the data to pass down, and the dechromed content too |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| finally |
| { |
| ci.discard(); |
| } |
| } |
| else |
| { |
| CharacterInput ci = new TempFileCharacterInput(contentsFile); |
| try |
| { |
| contentsFile = null; |
| |
| dataValues[4] = new Object[]{ci}; |
| |
| // Add document reference, including the data to pass down, and the dechromed content too |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| finally |
| { |
| ci.discard(); |
| } |
| } |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded"); |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'"); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| protected class FeedAuthorContextClass extends XMLParsingContext |
| { |
| protected String authorNameField = null; |
| protected String authorEmailField = null; |
| |
| public FeedAuthorContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| if (localName.equals("name")) |
| { |
| // "name" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("email")) |
| { |
| // "email" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else |
| { |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| } |
| |
| /** Convert the individual sub-fields of the item context into their final forms */ |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("name")) |
| { |
| authorNameField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("email")) |
| { |
| authorEmailField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else |
| { |
| super.endTag(); |
| } |
| } |
| |
| public String getAuthorName() |
| { |
| return authorNameField; |
| } |
| |
| public String getAuthorEmail() |
| { |
| return authorEmailField; |
| } |
| } |
| |
| protected class UrlsetContextClass extends XMLParsingContext |
| { |
| /** The document identifier */ |
| protected String documentIdentifier; |
| /** Activities interface */ |
| protected IProcessActivity activities; |
| /** Filter */ |
| protected Filter filter; |
| |
| /** ttl value */ |
| protected String ttlValue = null; |
| |
| public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| this.documentIdentifier = documentIdentifier; |
| this.activities = activities; |
| this.filter = filter; |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "url", nothing else. |
| if (localName.equals("url") || localName.equals("sitemap")) |
| { |
| // Item seen. We don't need any of the attributes etc., but we need to start a new context. |
| return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts); |
| } |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("url") || theTag.equals("sitemap")) |
| { |
| // It's an item. |
| UrlsetItemContextClass itemContext = (UrlsetItemContextClass)theContext; |
| // Presumably, since we are done parsing, we've recorded all the information we need in the context, object including: |
| // (1) File name (if any), containing dechromed content |
| // (2) Link name(s) |
| // (3) Pubdate |
| // (4) Title |
| // The job now is to pull this info out and call the activities interface appropriately. |
| |
| // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. This should clean up |
| // all dangling files etc. that need to be removed. |
| // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup() |
| // method will be called regardless. |
| itemContext.process(documentIdentifier,activities,filter); |
| } |
| else |
| super.endTag(); |
| } |
| |
| /** Process this data */ |
| protected boolean process() |
| throws ManifoldCFException |
| { |
| // Deal with the ttlvalue, if it was found |
| // Use the ttl value as a signal for when we ought to look at this feed again. If not present, use the default. |
| long currentTime = System.currentTimeMillis(); |
| Long rescanTime = filter.getDefaultRescanTime(currentTime); |
| if (ttlValue != null) |
| { |
| try |
| { |
| int minutes = Integer.parseInt(ttlValue); |
| long nextTime = currentTime + minutes * 60000L; |
| rescanTime = new Long(nextTime); |
| // Set the upper bound time; we want to scan the feeds aggressively. |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly"); |
| } |
| catch (NumberFormatException e) |
| { |
| Logging.connectors.warn("RSS: SiteMap document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'"); |
| } |
| } |
| |
| if (rescanTime != null) |
| { |
| Long minimumTime = filter.getMinimumRescanTime(currentTime); |
| if (minimumTime != null) |
| { |
| if (rescanTime.longValue() < minimumTime.longValue()) |
| rescanTime = minimumTime; |
| } |
| } |
| |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString())); |
| |
| activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null); |
| return true; |
| } |
| } |
| |
| protected class UrlsetItemContextClass extends XMLParsingContext |
| { |
| protected String linkField = null; |
| protected String pubDateField = null; |
| |
| public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts) |
| { |
| super(theStream,namespace,localName,qName,atts); |
| } |
| |
| @Override |
| protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts) |
| throws ManifoldCFException |
| { |
| // The tags we care about are "loc" and "lastmod", nothing else. |
| if (localName.equals("loc")) |
| { |
| // "loc" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else if (localName.equals("lastmod")) |
| { |
| // "lastmod" tag |
| return new XMLStringParsingContext(theStream,namespace,localName,qName,atts); |
| } |
| else |
| { |
| // Skip everything else. |
| return super.beginTag(namespace,localName,qName,atts); |
| } |
| } |
| |
| /** Convert the individual sub-fields of the item context into their final forms */ |
| @Override |
| protected void endTag() |
| throws ManifoldCFException |
| { |
| XMLParsingContext theContext = theStream.getContext(); |
| String theTag = theContext.getLocalname(); |
| if (theTag.equals("loc")) |
| { |
| linkField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else if (theTag.equals("lastmod")) |
| { |
| pubDateField = ((XMLStringParsingContext)theContext).getValue(); |
| } |
| else |
| { |
| super.endTag(); |
| } |
| } |
| |
| protected void tagCleanup() |
| throws ManifoldCFException |
| { |
| } |
| |
| /** Process the data accumulated for this item */ |
| public void process(String documentIdentifier, IProcessActivity activities, Filter filter) |
| throws ManifoldCFException |
| { |
| if (linkField != null && linkField.length() > 0) |
| { |
| Date origDateDate = null; |
| if (pubDateField != null && pubDateField.length() > 0) |
| origDateDate = DateParser.parseISO8601Date(pubDateField); |
| |
| Long origDate; |
| if (origDateDate != null) |
| origDate = new Long(origDateDate.getTime()); |
| else |
| origDate = null; |
| |
| String[] links = linkField.split(", "); |
| int l = 0; |
| while (l < links.length) |
| { |
| String rawURL = links[l++].trim(); |
| // Process the link |
| String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL); |
| if (newIdentifier != null) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+ |
| ((origDate==null)?"null":origDate.toString())); |
| if (filter.isLegalURL(newIdentifier)) |
| { |
| // It's a reference! Add it. |
| String[] dataNames = new String[]{"pubdate","source"}; |
| String[][] dataValues = new String[dataNames.length][]; |
| if (origDate != null) |
| dataValues[0] = new String[]{origDate.toString()}; |
| dataValues[1] = new String[]{documentIdentifier}; |
| |
| // Add document reference, including the data to pass down |
| activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate); |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded"); |
| } |
| } |
| else |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'"); |
| } |
| } |
| } |
| } |
| } |
| |
| |
| /** Get the maximum number of documents to amalgamate together into one batch, for this connector. |
| *@return the maximum number. 0 indicates "unlimited". |
| */ |
| public int getMaxDocumentRequest() |
| { |
| // RSS and the web in general do not batch well. Multiple chunks have no advantage over one-at-a-time requests. |
| return 1; |
| } |
| |
| // Protected methods and classes |
| |
| /** Code to check if data is interesting, based on response code and content type. |
| */ |
| protected boolean isContentInteresting(IFingerprintActivity activities, String contentType) |
| throws ServiceInterruption, ManifoldCFException |
| { |
| // Look at the content type and decide if it's a kind we want. This is defined |
| // as something we think we can either ingest, or extract links from. |
| |
| // For now, we're only going to attempt to extract links from html. This will change eventually. |
| // But the check here is just what the content type is. |
| if (contentType == null) |
| return false; |
| |
| int pos = contentType.indexOf(";"); |
| if (pos != -1) |
| contentType = contentType.substring(0,pos); |
| contentType = contentType.trim(); |
| |
| return activities.checkMimeTypeIndexable(contentType); |
| } |
| |
| /** Code to check if an already-fetched document should be ingested. |
| */ |
| protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier) |
| throws ServiceInterruption, ManifoldCFException |
| { |
| if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false) |
| return false; |
| |
| if (activities.checkURLIndexable(documentIdentifier) == false) |
| return false; |
| |
| // Check if it's a recognized content type |
| String contentType = cache.getContentType(documentIdentifier); |
| |
| // Some sites have multiple content types. We just look at the LAST one in that case. |
| if (contentType != null) |
| { |
| String[] contentTypes = contentType.split(","); |
| if (contentTypes.length > 0) |
| contentType = contentTypes[contentTypes.length-1].trim(); |
| else |
| contentType = null; |
| } |
| |
| if (contentType == null) |
| return false; |
| |
| int pos = contentType.indexOf(";"); |
| if (pos != -1) |
| contentType = contentType.substring(0,pos); |
| contentType = contentType.trim(); |
| |
| return activities.checkMimeTypeIndexable(contentType); |
| } |
| |
| /** Given the current parameters, find the correct throttled fetcher object |
| * (or create one if not there). |
| */ |
| protected ThrottledFetcher getFetcher() |
| { |
| synchronized (fetcherMap) |
| { |
| ThrottledFetcher tf = fetcherMap.get(throttleGroupName); |
| if (tf == null) |
| { |
| tf = new ThrottledFetcher(); |
| fetcherMap.put(throttleGroupName,tf); |
| } |
| return tf; |
| } |
| } |
| |
| /** Read a string as a sequence of individual expressions, urls, etc. |
| */ |
| protected static ArrayList stringToArray(String input) |
| { |
| ArrayList list = new ArrayList(); |
| try |
| { |
| java.io.Reader str = new java.io.StringReader(input); |
| try |
| { |
| java.io.BufferedReader is = new java.io.BufferedReader(str); |
| try |
| { |
| while (true) |
| { |
| String nextString = is.readLine(); |
| if (nextString == null) |
| break; |
| if (nextString.length() == 0) |
| continue; |
| nextString.trim(); |
| if (nextString.startsWith("#")) |
| continue; |
| list.add(nextString); |
| } |
| } |
| finally |
| { |
| is.close(); |
| } |
| } |
| finally |
| { |
| str.close(); |
| } |
| } |
| catch (java.io.IOException e) |
| { |
| // Eat the exception and exit. |
| } |
| return list; |
| } |
| |
| /** Compile all regexp entries in the passed in list, and add them to the output |
| * list. |
| */ |
| protected static void compileList(ArrayList output, ArrayList input) |
| throws ManifoldCFException |
| { |
| int i = 0; |
| while (i < input.size()) |
| { |
| String inputString = (String)input.get(i++); |
| try |
| { |
| output.add(Pattern.compile(inputString)); |
| } |
| catch (PatternSyntaxException e) |
| { |
| throw new ManifoldCFException("Mapping regular expression '"+inputString+"' is illegal: "+e.getMessage(),e); |
| } |
| } |
| } |
| |
| /** Given the current parameters, find the correct robots object (or create |
| * one if none found). |
| */ |
| protected Robots getRobots(ThrottledFetcher fetcher) |
| { |
| synchronized (robotsMap) |
| { |
| Robots r = (Robots)robotsMap.get(throttleGroupName); |
| if (r == null) |
| { |
| r = new Robots(fetcher); |
| robotsMap.put(throttleGroupName,r); |
| } |
| return r; |
| } |
| } |
| |
| // Protected classes |
| |
| /** The throttle specification class. Each server name is a different bin in this model. |
| */ |
| protected static class ThrottleSpec implements IThrottleSpec |
| { |
| protected final int maxOpenConnectionsPerServer; |
| protected final long minimumMillisecondsPerFetchPerServer; |
| protected final double minimumMillisecondsPerBytePerServer; |
| |
| public ThrottleSpec(int maxOpenConnectionsPerServer, long minimumMillisecondsPerFetchPerServer, |
| double minimumMillisecondsPerBytePerServer) |
| { |
| this.maxOpenConnectionsPerServer = maxOpenConnectionsPerServer; |
| this.minimumMillisecondsPerFetchPerServer = minimumMillisecondsPerFetchPerServer; |
| this.minimumMillisecondsPerBytePerServer = minimumMillisecondsPerBytePerServer; |
| } |
| |
| /** Given a bin name, find the max open connections to use for that bin. |
| *@return Integer.MAX_VALUE if no limit found. |
| */ |
| public int getMaxOpenConnections(String binName) |
| { |
| return maxOpenConnectionsPerServer; |
| } |
| |
| /** Look up minimum milliseconds per byte for a bin. |
| *@return 0.0 if no limit found. |
| */ |
| public double getMinimumMillisecondsPerByte(String binName) |
| { |
| return minimumMillisecondsPerBytePerServer; |
| } |
| |
| /** Look up minimum milliseconds for a fetch for a bin. |
| *@return 0 if no limit found. |
| */ |
| public long getMinimumMillisecondsPerFetch(String binName) |
| { |
| return minimumMillisecondsPerFetchPerServer; |
| } |
| } |
| |
| /** Name/value class */ |
| protected static class NameValue |
| { |
| protected String name; |
| protected String value; |
| |
| public NameValue(String name, String value) |
| { |
| this.name = name; |
| this.value = value; |
| } |
| |
| public String getName() |
| { |
| return name; |
| } |
| |
| public String getValue() |
| { |
| return value; |
| } |
| } |
| |
| /** Evaluator token. |
| */ |
| protected static class EvaluatorToken |
| { |
| public final static int TYPE_GROUP = 0; |
| public final static int TYPE_TEXT = 1; |
| public final static int TYPE_COMMA = 2; |
| |
| public final static int GROUPSTYLE_NONE = 0; |
| public final static int GROUPSTYLE_LOWER = 1; |
| public final static int GROUPSTYLE_UPPER = 2; |
| public final static int GROUPSTYLE_MIXED = 3; |
| |
| protected int type; |
| protected int groupNumber = -1; |
| protected int groupStyle = GROUPSTYLE_NONE; |
| protected String textValue = null; |
| |
| public EvaluatorToken() |
| { |
| type = TYPE_COMMA; |
| } |
| |
| public EvaluatorToken(int groupNumber, int groupStyle) |
| { |
| type = TYPE_GROUP; |
| this.groupNumber = groupNumber; |
| this.groupStyle = groupStyle; |
| } |
| |
| public EvaluatorToken(String text) |
| { |
| type = TYPE_TEXT; |
| this.textValue = text; |
| } |
| |
| public int getType() |
| { |
| return type; |
| } |
| |
| public int getGroupNumber() |
| { |
| return groupNumber; |
| } |
| |
| public int getGroupStyle() |
| { |
| return groupStyle; |
| } |
| |
| public String getTextValue() |
| { |
| return textValue; |
| } |
| |
| } |
| |
| |
| /** Token stream. |
| */ |
| protected static class EvaluatorTokenStream |
| { |
| protected String text; |
| protected int pos; |
| protected EvaluatorToken token = null; |
| |
| /** Constructor. |
| */ |
| public EvaluatorTokenStream(String text) |
| { |
| this.text = text; |
| this.pos = 0; |
| } |
| |
| /** Get current token. |
| */ |
| public EvaluatorToken peek() |
| throws ManifoldCFException |
| { |
| if (token == null) |
| { |
| token = nextToken(); |
| } |
| return token; |
| } |
| |
| /** Go on to next token. |
| */ |
| public void advance() |
| { |
| token = null; |
| } |
| |
| protected EvaluatorToken nextToken() |
| throws ManifoldCFException |
| { |
| char x; |
| // Fetch the next token |
| while (true) |
| { |
| if (pos == text.length()) |
| return null; |
| x = text.charAt(pos); |
| if (x > ' ') |
| break; |
| pos++; |
| } |
| |
| StringBuilder sb; |
| |
| if (x == '"') |
| { |
| // Parse text |
| pos++; |
| sb = new StringBuilder(); |
| while (true) |
| { |
| if (pos == text.length()) |
| break; |
| x = text.charAt(pos); |
| pos++; |
| if (x == '"') |
| { |
| break; |
| } |
| if (x == '\\') |
| { |
| if (pos == text.length()) |
| break; |
| x = text.charAt(pos++); |
| } |
| sb.append(x); |
| } |
| |
| return new EvaluatorToken(sb.toString()); |
| } |
| |
| if (x == ',') |
| { |
| pos++; |
| return new EvaluatorToken(); |
| } |
| |
| // Eat number at beginning |
| sb = new StringBuilder(); |
| while (true) |
| { |
| if (pos == text.length()) |
| break; |
| x = text.charAt(pos); |
| if (x >= '0' && x <= '9') |
| { |
| sb.append(x); |
| pos++; |
| continue; |
| } |
| break; |
| } |
| String numberValue = sb.toString(); |
| int groupNumber = 0; |
| if (numberValue.length() > 0) |
| groupNumber = new Integer(numberValue).intValue(); |
| // Save the next char position |
| int modifierPos = pos; |
| // Go to the end of the word |
| while (true) |
| { |
| if (pos == text.length()) |
| break; |
| x = text.charAt(pos); |
| if (x == ',' || x >= '0' && x <= '9' || x <= ' ' && x >= 0) |
| break; |
| pos++; |
| } |
| |
| int style = EvaluatorToken.GROUPSTYLE_NONE; |
| if (modifierPos != pos) |
| { |
| String modifier = text.substring(modifierPos,pos); |
| if (modifier.startsWith("u")) |
| style = EvaluatorToken.GROUPSTYLE_UPPER; |
| else if (modifier.startsWith("l")) |
| style = EvaluatorToken.GROUPSTYLE_LOWER; |
| else if (modifier.startsWith("m")) |
| style = EvaluatorToken.GROUPSTYLE_MIXED; |
| else |
| throw new ManifoldCFException("Unknown style: "+modifier); |
| } |
| return new EvaluatorToken(groupNumber,style); |
| } |
| } |
| |
| /** Class representing a URL regular expression match, for the purposes of determining canonicalization policy */ |
| protected static class CanonicalizationPolicy |
| { |
| protected Pattern matchPattern; |
| protected boolean reorder; |
| protected boolean removeJavaSession; |
| protected boolean removeAspSession; |
| protected boolean removePhpSession; |
| protected boolean removeBVSession; |
| |
| public CanonicalizationPolicy(Pattern matchPattern, boolean reorder, boolean removeJavaSession, boolean removeAspSession, |
| boolean removePhpSession, boolean removeBVSession) |
| { |
| this.matchPattern = matchPattern; |
| this.reorder = reorder; |
| this.removeJavaSession = removeJavaSession; |
| this.removeAspSession = removeAspSession; |
| this.removePhpSession = removePhpSession; |
| this.removeBVSession = removeBVSession; |
| } |
| |
| public boolean checkMatch(String url) |
| { |
| Matcher matcher = matchPattern.matcher(url); |
| return matcher.find(); |
| } |
| |
| public boolean canReorder() |
| { |
| return reorder; |
| } |
| |
| public boolean canRemoveJavaSession() |
| { |
| return removeJavaSession; |
| } |
| |
| public boolean canRemoveAspSession() |
| { |
| return removeAspSession; |
| } |
| |
| public boolean canRemovePhpSession() |
| { |
| return removePhpSession; |
| } |
| |
| public boolean canRemoveBvSession() |
| { |
| return removeBVSession; |
| } |
| |
| } |
| |
| /** Class representing a list of canonicalization rules */ |
| protected static class CanonicalizationPolicies |
| { |
| protected ArrayList rules = new ArrayList(); |
| |
| public CanonicalizationPolicies() |
| { |
| } |
| |
| public void addRule(CanonicalizationPolicy rule) |
| { |
| rules.add(rule); |
| } |
| |
| public CanonicalizationPolicy findMatch(String url) |
| { |
| int i = 0; |
| while (i < rules.size()) |
| { |
| CanonicalizationPolicy rule = (CanonicalizationPolicy)rules.get(i++); |
| if (rule.checkMatch(url)) |
| return rule; |
| } |
| return null; |
| } |
| } |
| |
| /** Class representing a mapping rule */ |
| protected static class MappingRule |
| { |
| protected Pattern matchPattern; |
| protected String evalExpression; |
| |
| public MappingRule(Pattern matchPattern, String evalExpression) |
| { |
| this.matchPattern = matchPattern; |
| this.evalExpression = evalExpression; |
| } |
| |
| public boolean checkMatch(String url) |
| { |
| Matcher matcher = matchPattern.matcher(url); |
| return matcher.matches(); |
| } |
| |
| public String map(String url) |
| throws ManifoldCFException |
| { |
| // Create a matcher, and attempt to do a match |
| Matcher matcher = matchPattern.matcher(url); |
| if (!matcher.matches()) |
| { |
| return null; |
| } |
| |
| // A match! Now, interpret the output expression |
| if (evalExpression == null || evalExpression.length() == 0) |
| return url; |
| |
| StringBuilder sb = new StringBuilder(); |
| EvaluatorTokenStream et = new EvaluatorTokenStream(evalExpression); |
| |
| while (true) |
| { |
| EvaluatorToken t = et.peek(); |
| if (t == null) |
| break; |
| switch (t.getType()) |
| { |
| case EvaluatorToken.TYPE_COMMA: |
| et.advance(); |
| break; |
| case EvaluatorToken.TYPE_GROUP: |
| et.advance(); |
| String groupValue = matcher.group(t.getGroupNumber()); |
| switch (t.getGroupStyle()) |
| { |
| case EvaluatorToken.GROUPSTYLE_NONE: |
| sb.append(groupValue); |
| break; |
| case EvaluatorToken.GROUPSTYLE_LOWER: |
| sb.append(groupValue.toLowerCase()); |
| break; |
| case EvaluatorToken.GROUPSTYLE_UPPER: |
| sb.append(groupValue.toUpperCase()); |
| break; |
| case EvaluatorToken.GROUPSTYLE_MIXED: |
| if (groupValue.length() > 0) |
| { |
| sb.append(groupValue.substring(0,1).toUpperCase()); |
| sb.append(groupValue.substring(1).toLowerCase()); |
| } |
| break; |
| default: |
| throw new ManifoldCFException("Illegal group style"); |
| } |
| break; |
| case EvaluatorToken.TYPE_TEXT: |
| et.advance(); |
| sb.append(t.getTextValue()); |
| break; |
| default: |
| throw new ManifoldCFException("Illegal token type"); |
| } |
| } |
| return sb.toString(); |
| } |
| |
| } |
| |
| /** Class that represents all mappings */ |
| protected static class MappingRules |
| { |
| protected ArrayList mappings = new ArrayList(); |
| |
| public MappingRules() |
| { |
| } |
| |
| public void add(MappingRule rule) |
| { |
| mappings.add(rule); |
| } |
| |
| public boolean isMatch(String url) |
| { |
| if (mappings.size() == 0) |
| return true; |
| int i = 0; |
| while (i < mappings.size()) |
| { |
| MappingRule p = (MappingRule)mappings.get(i); |
| if (p.checkMatch(url)) |
| return true; |
| i++; |
| } |
| return false; |
| } |
| |
| public String map(String url) |
| throws ManifoldCFException |
| { |
| if (mappings.size() == 0) |
| return url; |
| int i = 0; |
| while (i < mappings.size()) |
| { |
| MappingRule p = (MappingRule)mappings.get(i); |
| String rval = p.map(url); |
| if (rval != null) |
| return rval; |
| i++; |
| } |
| return null; |
| } |
| } |
| |
| /** Class that handles parsing and interpretation of the document specification. |
| * Note that I believe it to be faster to do this once, gathering all the data, than to scan the document specification multiple times. |
| * Therefore, this class contains the *entire* interpreted set of data from a document specification. |
| */ |
| protected static class Filter |
| { |
| protected MappingRules mappings = new MappingRules(); |
| protected HashMap seeds = null; |
| protected Integer defaultRescanInterval = null; |
| protected Integer minimumRescanInterval = null; |
| protected Integer badFeedRescanInterval = null; |
| protected int dechromedContentMode = DECHROMED_NONE; |
| protected int chromedContentMode = CHROMED_USE; |
| protected int feedTimeoutValue = 60000; |
| protected ArrayList metadata = new ArrayList(); |
| protected HashMap acls = new HashMap(); |
| protected CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies(); |
| /** The arraylist of include patterns */ |
| protected ArrayList includePatterns = new ArrayList(); |
| /** The arraylist of exclude patterns */ |
| protected ArrayList excludePatterns = new ArrayList(); |
| |
| /** Constructor. */ |
| public Filter(DocumentSpecification spec, boolean warnOnBadSeed) |
| throws ManifoldCFException |
| { |
| String excludes = ""; |
| |
| // To save allocation, preallocate the seeds map assuming that it will require 1.5x the number of nodes in the spec |
| int initialSize = spec.getChildCount(); |
| if (initialSize == 0) |
| initialSize = 1; |
| seeds = new HashMap((initialSize * 3) >> 1); |
| |
| int i = 0; |
| |
| // First pass. Find all of the rules (which are necessary to canonicalize the seeds, etc.) |
| while (i < spec.getChildCount()) |
| { |
| SpecificationNode n = spec.getChild(i++); |
| if (n.getType().equals(RSSConfig.NODE_MAP)) |
| { |
| String match = n.getAttributeValue(RSSConfig.ATTR_MATCH); |
| String map = n.getAttributeValue(RSSConfig.ATTR_MAP); |
| if (match != null && match.length() > 0) |
| { |
| Pattern p; |
| try |
| { |
| p = Pattern.compile(match); |
| } |
| catch (java.util.regex.PatternSyntaxException e) |
| { |
| throw new ManifoldCFException("Regular expression '"+match+"' is illegal: "+e.getMessage(),e); |
| } |
| if (map == null) |
| map = ""; |
| mappings.add(new MappingRule(p,map)); |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_EXCLUDES)) |
| { |
| excludes = n.getValue(); |
| if (excludes == null) |
| excludes = ""; |
| } |
| else if (n.getType().equals(RSSConfig.NODE_URLSPEC)) |
| { |
| String urlRegexp = n.getAttributeValue(RSSConfig.ATTR_REGEXP); |
| if (urlRegexp == null) |
| urlRegexp = ""; |
| String reorder = n.getAttributeValue(RSSConfig.ATTR_REORDER); |
| boolean reorderValue; |
| if (reorder == null) |
| reorderValue = false; |
| else |
| { |
| if (reorder.equals(RSSConfig.VALUE_YES)) |
| reorderValue = true; |
| else |
| reorderValue = false; |
| } |
| |
| String javaSession = n.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL); |
| boolean javaSessionValue; |
| if (javaSession == null) |
| javaSessionValue = false; |
| else |
| { |
| if (javaSession.equals(RSSConfig.VALUE_YES)) |
| javaSessionValue = true; |
| else |
| javaSessionValue = false; |
| } |
| |
| String aspSession = n.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL); |
| boolean aspSessionValue; |
| if (aspSession == null) |
| aspSessionValue = false; |
| else |
| { |
| if (aspSession.equals(RSSConfig.VALUE_YES)) |
| aspSessionValue = true; |
| else |
| aspSessionValue = false; |
| } |
| |
| String phpSession = n.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL); |
| boolean phpSessionValue; |
| if (phpSession == null) |
| phpSessionValue = false; |
| else |
| { |
| if (phpSession.equals(RSSConfig.VALUE_YES)) |
| phpSessionValue = true; |
| else |
| phpSessionValue = false; |
| } |
| |
| String bvSession = n.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL); |
| boolean bvSessionValue; |
| if (bvSession == null) |
| bvSessionValue = false; |
| else |
| { |
| if (bvSession.equals(RSSConfig.VALUE_YES)) |
| bvSessionValue = true; |
| else |
| bvSessionValue = false; |
| } |
| try |
| { |
| canonicalizationPolicies.addRule(new CanonicalizationPolicy(Pattern.compile(urlRegexp),reorderValue,javaSessionValue,aspSessionValue, |
| phpSessionValue, bvSessionValue)); |
| } |
| catch (java.util.regex.PatternSyntaxException e) |
| { |
| throw new ManifoldCFException("Canonicalization regular expression '"+urlRegexp+"' is illegal: "+e.getMessage(),e); |
| } |
| } |
| } |
| |
| compileList(excludePatterns,stringToArray(excludes)); |
| |
| // Second pass. Do the rest of the work, |
| i = 0; |
| while (i < spec.getChildCount()) |
| { |
| SpecificationNode n = spec.getChild(i++); |
| if (n.getType().equals(RSSConfig.NODE_FEED)) |
| { |
| String rssURL = n.getAttributeValue(RSSConfig.ATTR_URL); |
| if (rssURL != null && rssURL.length() > 0) |
| { |
| String canonicalURL = makeDocumentIdentifier(canonicalizationPolicies,null,rssURL); |
| if (canonicalURL != null) |
| { |
| seeds.put(canonicalURL,canonicalURL); |
| } |
| else |
| { |
| if (warnOnBadSeed) |
| Logging.connectors.warn("RSS: Illegal seed feed '"+rssURL+"'"); |
| } |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_METADATA)) |
| { |
| String name = n.getAttributeValue(RSSConfig.ATTR_NAME); |
| String value = n.getAttributeValue(RSSConfig.ATTR_VALUE); |
| if (name != null && name.length() > 0 && value != null && value.length() > 0) |
| metadata.add(new NameValue(name,value)); |
| } |
| else if (n.getType().equals(RSSConfig.NODE_ACCESS)) |
| { |
| String token = n.getAttributeValue(RSSConfig.ATTR_TOKEN); |
| acls.put(token,token); |
| } |
| else if (n.getType().equals(RSSConfig.NODE_FEEDRESCAN)) |
| { |
| String interval = n.getAttributeValue(RSSConfig.ATTR_VALUE); |
| if (interval != null && interval.length() > 0) |
| { |
| try |
| { |
| defaultRescanInterval = new Integer(interval); |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_MINFEEDRESCAN)) |
| { |
| String interval = n.getAttributeValue(RSSConfig.ATTR_VALUE); |
| if (interval != null && interval.length() > 0) |
| { |
| try |
| { |
| minimumRescanInterval = new Integer(interval); |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_BADFEEDRESCAN)) |
| { |
| String interval = n.getAttributeValue(RSSConfig.ATTR_VALUE); |
| if (interval != null && interval.length() > 0) |
| { |
| try |
| { |
| badFeedRescanInterval = new Integer(interval); |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_FEEDTIMEOUT)) |
| { |
| String value = n.getAttributeValue(RSSConfig.ATTR_VALUE); |
| if (value != null && value.length() > 0) |
| { |
| try |
| { |
| feedTimeoutValue= Integer.parseInt(value) * 1000; |
| } |
| catch (NumberFormatException e) |
| { |
| throw new ManifoldCFException("Bad number: "+e.getMessage(),e); |
| } |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_DECHROMEDMODE)) |
| { |
| String mode = n.getAttributeValue(RSSConfig.ATTR_MODE); |
| if (mode != null && mode.length() > 0) |
| { |
| if (mode.equals(RSSConfig.VALUE_NONE)) |
| dechromedContentMode = DECHROMED_NONE; |
| else if (mode.equals(RSSConfig.VALUE_DESCRIPTION)) |
| dechromedContentMode = DECHROMED_DESCRIPTION; |
| else if (mode.equals(RSSConfig.VALUE_CONTENT)) |
| dechromedContentMode = DECHROMED_CONTENT; |
| } |
| } |
| else if (n.getType().equals(RSSConfig.NODE_CHROMEDMODE)) |
| { |
| String mode = n.getAttributeValue(RSSConfig.ATTR_MODE); |
| if (mode != null && mode.length() > 0) |
| { |
| if (mode.equals(RSSConfig.VALUE_USE)) |
| chromedContentMode = CHROMED_USE; |
| else if (mode.equals(RSSConfig.VALUE_SKIP)) |
| chromedContentMode = CHROMED_SKIP; |
| else if (mode.equals(RSSConfig.VALUE_METADATA)) |
| chromedContentMode = CHROMED_METADATA_ONLY; |
| } |
| } |
| } |
| } |
| |
| /** Check if document is a seed */ |
| public boolean isSeed(String canonicalUrl) |
| { |
| return seeds.get(canonicalUrl) != null; |
| } |
| |
| /** Iterate over all canonicalized seeds */ |
| public Iterator getSeeds() |
| { |
| return seeds.keySet().iterator(); |
| } |
| |
| /** Get the specified metadata */ |
| public ArrayList getMetadata() |
| { |
| return metadata; |
| } |
| |
| /** Get the acls */ |
| public String[] getAcls() |
| { |
| String[] rval = new String[acls.size()]; |
| Iterator iter = acls.keySet().iterator(); |
| int i = 0; |
| while (iter.hasNext()) |
| { |
| rval[i++] = (String)iter.next(); |
| } |
| return rval; |
| } |
| |
| /** Get the feed timeout value */ |
| public int getFeedTimeoutValue() |
| { |
| return feedTimeoutValue; |
| } |
| |
| /** Get the dechromed content mode */ |
| public int getDechromedContentMode() |
| { |
| return dechromedContentMode; |
| } |
| |
| /** Get the chromed content mode */ |
| public int getChromedContentMode() |
| { |
| return chromedContentMode; |
| } |
| |
| /** Get the next time (by default) a feed should be scanned */ |
| public Long getDefaultRescanTime(long currentTime) |
| { |
| if (defaultRescanInterval == null) |
| return null; |
| return new Long(defaultRescanInterval.intValue() * 60000L + currentTime); |
| } |
| |
| /** Get the minimum next time a feed should be scanned */ |
| public Long getMinimumRescanTime(long currentTime) |
| { |
| if (minimumRescanInterval == null) |
| return null; |
| return new Long(minimumRescanInterval.intValue() * 60000L + currentTime); |
| } |
| |
| /** Get the next time a "bad feed" should be rescanned */ |
| public Long getBadFeedRescanTime(long currentTime) |
| { |
| if (badFeedRescanInterval == null) |
| return null; |
| return new Long(badFeedRescanInterval.intValue() * 60000L + currentTime); |
| } |
| |
| /** Check for legality of a url. |
| * @return true if the passed-in url is either a seed, or a legal url, according to this |
| * filter. |
| */ |
| public boolean isLegalURL(String url) |
| { |
| if (seeds.get(url) != null) |
| return true; |
| if (mappings.isMatch(url) == false) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Url '"+url+"' is illegal because it did not match a mapping rule"); |
| return false; |
| } |
| // Now make sure it's not in the exclude list. |
| int i = 0; |
| while (i < excludePatterns.size()) |
| { |
| Pattern p = (Pattern)excludePatterns.get(i); |
| Matcher m = p.matcher(url); |
| if (m.find()) |
| { |
| if (Logging.connectors.isDebugEnabled()) |
| Logging.connectors.debug("RSS: Url '"+url+"' is illegal because exclude pattern '"+p.toString()+"' matched it"); |
| return false; |
| } |
| i++; |
| } |
| |
| return true; |
| } |
| |
| /** Scan patterns and return the one that matches first. |
| * @return null if the url doesn't match or should not be ingested, or the new string if it does. |
| */ |
| public String mapDocumentURL(String url) |
| throws ManifoldCFException |
| { |
| if (seeds.get(url) != null) |
| return null; |
| return mappings.map(url); |
| } |
| |
| /** Get canonicalization policies */ |
| public CanonicalizationPolicies getCanonicalizationPolicies() |
| { |
| return canonicalizationPolicies; |
| } |
| } |
| |
| } |
| |
| |