| /* $Id: IIncrementalIngester.java 988245 2010-08-23 18:39:35Z kwright $ */ |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.manifoldcf.agents.interfaces; |
| |
| import org.apache.manifoldcf.core.interfaces.*; |
| import java.io.*; |
| import java.util.*; |
| |
| /** This interface describes the incremental ingestion API. |
| * |
| * Clients are expected to use this API as follows: |
| * |
| * 1) Fetch a document's version through the API. |
| * 2) Decide, based on that version, whether the document should be ingested. |
| * 3) If the decision is to ingest, call the ingest method in the API. |
| * |
| * The module described by this interface keeps track of what has been sent where, together with the |
| * version of each document so indexed. The space over which this bookkeeping takes place is defined by |
| * the individual output connection - that is, the output connection appears to "remember" which |
| * documents were handed to it. |
| * |
| * A secondary purpose of the module is to map between the internal key of a document (an identifier |
| * hash, plus the name of an identifier space) and the way the document is identified in the output |
| * space (the name of an output connection, plus a URI considered local to that output connection space). |
| */ |
| public interface IIncrementalIngester |
| { |
|   // Interface fields are implicitly public, static and final. |
|   String _rcsid = "@(#)$Id: IIncrementalIngester.java 988245 2010-08-23 18:39:35Z kwright $"; |
| |
|   // ---- Lifecycle ---- |
| |
|   /** Install the incremental ingestion manager. |
|   */ |
|   void install() |
|     throws ManifoldCFException; |
| |
|   /** Uninstall the incremental ingestion manager. |
|   */ |
|   void deinstall() |
|     throws ManifoldCFException; |
| |
|   /** Flush all knowledge of what was ingested before. |
|   */ |
|   void clearAll() |
|     throws ManifoldCFException; |
| |
|   // ---- Output description and indexability checks ---- |
| |
|   /** Get an output version string for a document. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param spec is the output specification. |
|   *@return the description string. |
|   */ |
|   String getOutputDescription( |
|     String outputConnectionName, |
|     OutputSpecification spec) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Check whether a mime type is indexable. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param outputDescription is the output description string. |
|   *@param mimeType is the mime type to check. |
|   *@return true if the mime type is indexable. |
|   */ |
|   boolean checkMimeTypeIndexable( |
|     String outputConnectionName, |
|     String outputDescription, |
|     String mimeType) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Check whether a file is indexable. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param outputDescription is the output description string. |
|   *@param localFile is the local file to check. |
|   *@return true if the local file is indexable. |
|   */ |
|   boolean checkDocumentIndexable( |
|     String outputConnectionName, |
|     String outputDescription, |
|     File localFile) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Pre-determine whether a document's length is indexable by this connector. Participating repository |
|   * connectors use this method to help filter out documents that are too long to be indexable. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param outputDescription is the output description string. |
|   *@param length is the length of the document. |
|   *@return true if the file is indexable. |
|   */ |
|   boolean checkLengthIndexable( |
|     String outputConnectionName, |
|     String outputDescription, |
|     long length) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Pre-determine whether a document's URL is indexable by this connector. Participating repository |
|   * connectors use this method to help filter out documents that are not indexable. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param outputDescription is the output description string. |
|   *@param url is the url of the document. |
|   *@return true if the file is indexable. |
|   */ |
|   boolean checkURLIndexable( |
|     String outputConnectionName, |
|     String outputDescription, |
|     String url) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   // ---- Recording and ingesting ---- |
| |
|   /** Record a document version without ingesting the document. |
|   * This method exists to keep track of the frequency at which ingestion "attempts" take place. |
|   * ServiceInterruption is thrown if this action must be rescheduled. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hashed document identifier. |
|   *@param documentVersion is the document version. |
|   *@param recordTime is the time at which the recording took place, in milliseconds since epoch. |
|   *@param activities is the object used in case a document needs to be removed from the output index as |
|   * the result of this operation. |
|   */ |
|   void documentRecord( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash, |
|     String documentVersion, |
|     long recordTime, |
|     IOutputActivity activities) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Ingest a document. |
|   * The document is ingested and the ingestion is noted. On a repeat ingestion of the same document this |
|   * method also REMOVES ALL OLD METADATA, so that on completion the index contains only the metadata |
|   * described by the RepositoryDocument object passed to this method. |
|   * ServiceInterruption is thrown if the document ingestion must be rescheduled. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hashed document identifier. |
|   *@param documentVersion is the document version. |
|   *@param outputVersion is the output version string constructed from the output specification by the |
|   * output connector. |
|   *@param authorityName is the name of the authority associated with the document, if any. |
|   *@param data is the document data. The data is closed after ingestion is complete. |
|   *@param ingestTime is the time at which the ingestion took place, in milliseconds since epoch. |
|   *@param documentURI is the URI of the document, which will be used as the key of the document in the index. |
|   *@param activities is an object providing a set of methods that the implementer can use to perform |
|   * the operation. |
|   *@return true if the ingest was ok, false if the ingest is illegal (and should not be repeated). |
|   */ |
|   boolean documentIngest( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash, |
|     String documentVersion, |
|     String outputVersion, |
|     String authorityName, |
|     RepositoryDocument data, |
|     long ingestTime, |
|     String documentURI, |
|     IOutputActivity activities) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Ingest a document, supplying a forced parameter version. |
|   * The document is ingested and the ingestion is noted. On a repeat ingestion of the same document this |
|   * method also REMOVES ALL OLD METADATA, so that on completion the index contains only the metadata |
|   * described by the RepositoryDocument object passed to this method. |
|   * ServiceInterruption is thrown if the document ingestion must be rescheduled. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hashed document identifier. |
|   *@param documentVersion is the document version. |
|   *@param outputVersion is the output version string constructed from the output specification by the |
|   * output connector. |
|   *@param parameterVersion is the forced parameter version. |
|   *@param authorityName is the name of the authority associated with the document, if any. |
|   *@param data is the document data. The data is closed after ingestion is complete. |
|   *@param ingestTime is the time at which the ingestion took place, in milliseconds since epoch. |
|   *@param documentURI is the URI of the document, which will be used as the key of the document in the index. |
|   *@param activities is an object providing a set of methods that the implementer can use to perform |
|   * the operation. |
|   *@return true if the ingest was ok, false if the ingest is illegal (and should not be repeated). |
|   */ |
|   boolean documentIngest( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash, |
|     String documentVersion, |
|     String outputVersion, |
|     String parameterVersion, |
|     String authorityName, |
|     RepositoryDocument data, |
|     long ingestTime, |
|     String documentURI, |
|     IOutputActivity activities) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   // ---- Check notes ---- |
| |
|   /** Note the fact that we checked a set of documents (and found that they did not need to be ingested, |
|   * because the versions agreed). |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
|   *@param identifierHashes are the set of document identifier hashes. |
|   *@param checkTime is the time at which the check took place, in milliseconds since epoch. |
|   */ |
|   void documentCheckMultiple( |
|     String outputConnectionName, |
|     String[] identifierClasses, |
|     String[] identifierHashes, |
|     long checkTime) |
|     throws ManifoldCFException; |
| |
|   /** Note the fact that we checked a document (and found that it did not need to be ingested, because |
|   * the versions agreed). |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hashed document identifier. |
|   *@param checkTime is the time at which the check took place, in milliseconds since epoch. |
|   */ |
|   void documentCheck( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash, |
|     long checkTime) |
|     throws ManifoldCFException; |
| |
|   // ---- Deletion ---- |
| |
|   /** Delete multiple documents from the search engine index, across several output connections. |
|   *@param outputConnectionNames are the names of the output connections associated with this action. |
|   *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
|   *@param identifierHashes is the array of document identifier hashes of the documents. |
|   *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
|   */ |
|   void documentDeleteMultiple( |
|     String[] outputConnectionNames, |
|     String[] identifierClasses, |
|     String[] identifierHashes, |
|     IOutputRemoveActivity activities) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Delete multiple documents from the search engine index, for a single output connection. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
|   *@param identifierHashes is the array of document identifier hashes of the documents. |
|   *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
|   */ |
|   void documentDeleteMultiple( |
|     String outputConnectionName, |
|     String[] identifierClasses, |
|     String[] identifierHashes, |
|     IOutputRemoveActivity activities) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   /** Delete a document from the search engine index. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hash of the id of the document. |
|   *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
|   */ |
|   void documentDelete( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash, |
|     IOutputRemoveActivity activities) |
|     throws ManifoldCFException, ServiceInterruption; |
| |
|   // ---- Ingestion status lookup ---- |
| |
|   /** Look up ingestion data for a SET of documents, across several output connections. |
|   *@param outputConnectionNames are the names of the output connections associated with this action. |
|   *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
|   *@param identifierHashes is the array of document identifier hashes to look up. |
|   *@return the array of document data. Null will come back for any identifier that doesn't |
|   * exist in the index. |
|   */ |
|   DocumentIngestStatus[] getDocumentIngestDataMultiple( |
|     String[] outputConnectionNames, |
|     String[] identifierClasses, |
|     String[] identifierHashes) |
|     throws ManifoldCFException; |
| |
|   /** Look up ingestion data for a SET of documents, for a single output connection. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
|   *@param identifierHashes is the array of document identifier hashes to look up. |
|   *@return the array of document data. Null will come back for any identifier that doesn't |
|   * exist in the index. |
|   */ |
|   DocumentIngestStatus[] getDocumentIngestDataMultiple( |
|     String outputConnectionName, |
|     String[] identifierClasses, |
|     String[] identifierHashes) |
|     throws ManifoldCFException; |
| |
|   /** Look up ingestion data for a document. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hash of the id of the document. |
|   *@return the current document's ingestion data, or null if the document is not currently ingested. |
|   */ |
|   DocumentIngestStatus getDocumentIngestData( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash) |
|     throws ManifoldCFException; |
| |
|   // ---- Update intervals ---- |
| |
|   /** Calculate the average time interval between changes for a set of documents. |
|   * This is based on the data gathered for the documents. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
|   *@param identifierHashes is the hashes of the ids of the documents. |
|   *@return the array of intervals, each being the number of milliseconds between changes, or 0 if this |
|   * cannot be calculated. |
|   */ |
|   long[] getDocumentUpdateIntervalMultiple( |
|     String outputConnectionName, |
|     String[] identifierClasses, |
|     String[] identifierHashes) |
|     throws ManifoldCFException; |
| |
|   /** Calculate the average time interval between changes for a document. |
|   * This is based on the data gathered for the document. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
|   *@param identifierHash is the hash of the id of the document. |
|   *@return the number of milliseconds between changes, or 0 if this cannot be calculated. |
|   */ |
|   long getDocumentUpdateInterval( |
|     String outputConnectionName, |
|     String identifierClass, |
|     String identifierHash) |
|     throws ManifoldCFException; |
| |
|   // ---- Reset ---- |
| |
|   /** Reset all documents belonging to a specific output connection, because we've got information that |
|   * that system has been reconfigured. This will force all such documents to be reindexed the next time |
|   * they are checked. |
|   *@param outputConnectionName is the name of the output connection associated with this action. |
|   */ |
|   void resetOutputConnection(String outputConnectionName) |
|     throws ManifoldCFException; |
| |
| } |