| /* $Id: IIncrementalIngester.java 988245 2010-08-23 18:39:35Z kwright $ */ |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.manifoldcf.agents.interfaces; |
| |
| import org.apache.manifoldcf.core.interfaces.*; |
| import java.io.*; |
| import java.util.*; |
| |
| /** This interface describes the incremental ingestion API. |
| * SOME NOTES: |
| * The expected client flow for this API is to: |
| * |
| * 1) Use the API to fetch a document's version. |
| * 2) Base a decision whether to ingest based on that version. |
| * 3) If the decision to ingest occurs, then the ingest method in the API is |
| * called. |
| * |
| * The module described by this interface is responsible for keeping track of what has been sent where, and also the corresponding version of |
| * each document so indexed. The space over which this takes place is defined by the individual output connection - that is, the output connection |
| * seems to "remember" what documents were handed to it. |
| * |
| * A secondary purpose of this module is to provide a mapping between the key by which a document is described internally (by an |
| * identifier hash, plus the name of an identifier space), and the way the document is identified in the output space (by the name of an |
| * output connection, plus a URI which is considered local to that output connection space). |
| * |
| */ |
| public interface IIncrementalIngester |
| { |
| public static final String _rcsid = "@(#)$Id: IIncrementalIngester.java 988245 2010-08-23 18:39:35Z kwright $"; |
| |
| /** Install the incremental ingestion manager. |
| */ |
| public void install() |
| throws ManifoldCFException; |
| |
| /** Uninstall the incremental ingestion manager. |
| */ |
| public void deinstall() |
| throws ManifoldCFException; |
| |
| /** Flush all knowledge of what was ingested before. |
| */ |
| public void clearAll() |
| throws ManifoldCFException; |
| |
| |
| /** From a pipeline specification, get the name of the output connection that will be indexed last |
| * in the pipeline. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@return the last indexed output connection name. |
| */ |
| public String getLastIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic); |
| |
| /** From a pipeline specification, get the name of the output connection that will be indexed first |
| * in the pipeline. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@return the first indexed output connection name. |
| */ |
| public String getFirstIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic); |
| |
| /** Get an output version string for a document. |
| *@param outputConnectionName is the name of the output connection associated with this action. |
| *@param spec is the output specification. |
| *@return the description string. |
| */ |
| public VersionContext getOutputDescription(String outputConnectionName, Specification spec) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Get transformation version string for a document. |
| *@param transformationConnectionName is the names of the transformation connection associated with this action. |
| *@param spec is the transformation specification. |
| *@return the description string. |
| */ |
| public VersionContext getTransformationDescription(String transformationConnectionName, Specification spec) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Check if a mime type is indexable. |
| *@param pipelineSpecification is the pipeline specification. |
| *@param mimeType is the mime type to check. |
| *@param activity are the activities available to this method. |
| *@return true if the mimeType is indexable. |
| */ |
| public boolean checkMimeTypeIndexable( |
| IPipelineSpecification pipelineSpecification, |
| String mimeType, |
| IOutputCheckActivity activity) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Check if a file is indexable. |
| *@param pipelineSpecification is the pipeline specification. |
| *@param localFile is the local file to check. |
| *@param activity are the activities available to this method. |
| *@return true if the local file is indexable. |
| */ |
| public boolean checkDocumentIndexable( |
| IPipelineSpecification pipelineSpecification, |
| File localFile, |
| IOutputCheckActivity activity) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Pre-determine whether a document's length is indexable by this connector. This method is used by participating repository connectors |
| * to help filter out documents that are too long to be indexable. |
| *@param pipelineSpecification is the pipeline specification. |
| *@param length is the length of the document. |
| *@param activity are the activities available to this method. |
| *@return true if the file is indexable. |
| */ |
| public boolean checkLengthIndexable( |
| IPipelineSpecification pipelineSpecification, |
| long length, |
| IOutputCheckActivity activity) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Pre-determine whether a document's URL is indexable by this connector. This method is used by participating repository connectors |
| * to help filter out documents that not indexable. |
| *@param pipelineSpecification is the pipeline specification. |
| *@param url is the url of the document. |
| *@param activity are the activities available to this method. |
| *@return true if the file is indexable. |
| */ |
| public boolean checkURLIndexable( |
| IPipelineSpecification pipelineSpecification, |
| String url, |
| IOutputCheckActivity activity) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Determine whether we need to fetch or refetch a document. |
| * Pass in information including the pipeline specification with existing version info, plus new document and parameter version strings. |
| * If no outputs need to be updated, then this method will return false. If any outputs need updating, then true is returned. |
| *@param pipelineSpecificationWithVersions is the pipeline specification including new version info for all transformation and output |
| * connections. |
| *@param newDocumentVersion is the newly-determined document version. |
| *@param newParameterVersion is the newly-determined parameter version. |
| *@param newAuthorityNameString is the newly-determined authority name. |
| *@return true if the document needs to be refetched. |
| */ |
| public boolean checkFetchDocument( |
| IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, |
| String newDocumentVersion, |
| String newParameterVersion, |
| String newAuthorityNameString); |
| |
| /** Record a document version, but don't ingest it. |
| * The purpose of this method is to update document version information without reindexing the document. |
| *@param pipelineSpecificationBasic is the basic pipeline specification needed. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hashed document identifier. |
| *@param componentHash is the hashed component identifier, if any. |
| *@param documentVersion is the document version. |
| *@param recordTime is the time at which the recording took place, in milliseconds since epoch. |
| */ |
| public void documentRecord( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String identifierClass, String identifierHash, String componentHash, |
| String documentVersion, long recordTime) |
| throws ManifoldCFException; |
| |
| /** Remove a document from specified indexes, just as if an empty document |
| * was indexed, and record the necessary version information. |
| * This method is conceptually similar to documentIngest(), but does not actually take |
| * a document or allow it to be transformed. If there is a document already |
| * indexed, it is removed from the index. |
| *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hashed document identifier. |
| *@param componentHash is the hashed component identifier, if any. |
| *@param documentVersion is the document version. |
| *@param parameterVersion is the version string for the forced parameters. |
| *@param authorityName is the name of the authority associated with the document, if any. |
| *@param recordTime is the time at which the recording took place, in milliseconds since epoch. |
| *@param activities is an object providing a set of methods that the implementer can use to perform the operation. |
| */ |
| public void documentNoData( |
| IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, |
| String identifierClass, String identifierHash, String componentHash, |
| String documentVersion, |
| String parameterVersion, |
| String authorityName, |
| long recordTime, |
| IOutputActivity activities) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Ingest a document. |
| * This ingests the document, and notes it. If this is a repeat ingestion of the document, this |
| * method also REMOVES ALL OLD METADATA. When complete, the index will contain only the metadata |
| * described by the RepositoryDocument object passed to this method. |
| * ServiceInterruption is thrown if the document ingestion must be rescheduled. |
| *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hashed document identifier. |
| *@param componentHash is the hashed component identifier, if any. |
| *@param documentVersion is the document version. |
| *@param parameterVersion is the version string for the forced parameters. |
| *@param authorityName is the name of the authority associated with the document, if any. |
| *@param data is the document data. The data is closed after ingestion is complete. |
| *@param ingestTime is the time at which the ingestion took place, in milliseconds since epoch. |
| *@param documentURI is the URI of the document, which will be used as the key of the document in the index. |
| *@param activities is an object providing a set of methods that the implementer can use to perform the operation. |
| *@return true if the ingest was ok, false if the ingest is illegal (and should not be repeated). |
| *@throws IOException only if data stream throws an IOException. |
| */ |
| public boolean documentIngest( |
| IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, |
| String identifierClass, String identifierHash, String componentHash, |
| String documentVersion, |
| String parameterVersion, |
| String authorityName, |
| RepositoryDocument data, |
| long ingestTime, String documentURI, |
| IOutputActivity activities) |
| throws ManifoldCFException, ServiceInterruption, IOException; |
| |
| /** Remove a document component from the search engine index. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hash of the id of the document. |
| *@param componentHash is the hashed component identifier, if any. |
| *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
| */ |
| public void documentRemove( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String identifierClass, String identifierHash, String componentHash, |
| IOutputRemoveActivity activities) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Remove multiple document components from the search engine index. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@param identifierClasses are the names of the spaces in which the identifier hash should be interpreted. |
| *@param identifierHashes are the hashes of the ids of the documents. |
| *@param componentHash is the hashed component identifier, if any. |
| *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
| */ |
| public void documentRemoveMultiple( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String[] identifierClasses, String[] identifierHashes, String componentHash, |
| IOutputRemoveActivity activities) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Note the fact that we checked a document (and found that it did not need to be ingested, because the |
| * versions agreed). |
| *@param pipelineSpecificationBasic is a pipeline specification. |
| *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
| *@param identifierHashes are the set of document identifier hashes. |
| *@param checkTime is the time at which the check took place, in milliseconds since epoch. |
| */ |
| public void documentCheckMultiple( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String[] identifierClasses, String[] identifierHashes, |
| long checkTime) |
| throws ManifoldCFException; |
| |
| /** Note the fact that we checked a document (and found that it did not need to be ingested, because the |
| * versions agreed). |
| *@param pipelineSpecificationBasic is a basic pipeline specification. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hashed document identifier. |
| *@param checkTime is the time at which the check took place, in milliseconds since epoch. |
| */ |
| public void documentCheck( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String identifierClass, String identifierHash, |
| long checkTime) |
| throws ManifoldCFException; |
| |
| /** Delete multiple documents, and their components, from the search engine index. |
| *@param pipelineSpecificationBasics are the pipeline specifications associated with the documents. |
| *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
| *@param identifierHashes is tha array of document identifier hashes if the documents. |
| *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
| */ |
| public void documentDeleteMultiple( |
| IPipelineSpecificationBasic[] pipelineSpecificationBasics, |
| String[] identifierClasses, String[] identifierHashes, |
| IOutputRemoveActivity activities) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Delete multiple documents, and their components, from the search engine index. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
| *@param identifierHashes is tha array of document identifier hashes if the documents. |
| *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
| */ |
| public void documentDeleteMultiple( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String[] identifierClasses, String[] identifierHashes, |
| IOutputRemoveActivity activities) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Delete a document, and all its components, from the search engine index. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hash of the id of the document. |
| *@param activities is the object to use to log the details of the ingestion attempt. May be null. |
| */ |
| public void documentDelete( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String identifierClass, String identifierHash, |
| IOutputRemoveActivity activities) |
| throws ManifoldCFException, ServiceInterruption; |
| |
| /** Look up ingestion data for a set of documents. |
| *@param rval is a map of output key to document data, in no particular order, which will be loaded with all matching results. |
| *@param pipelineSpecificationBasics are the pipeline specifications corresponding to the identifier classes and hashes. |
| *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
| *@param identifierHashes is the array of document identifier hashes to look up. |
| */ |
| public void getPipelineDocumentIngestDataMultiple( |
| IngestStatuses rval, |
| IPipelineSpecificationBasic[] pipelineSpecificationBasics, |
| String[] identifierClasses, String[] identifierHashes) |
| throws ManifoldCFException; |
| |
| /** Look up ingestion data for a SET of documents. |
| *@param rval is a map of output key to document data, in no particular order, which will be loaded with all matching results. |
| *@param pipelineSpecificationBasic is the pipeline specification for all documents. |
| *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
| *@param identifierHashes is the array of document identifier hashes to look up. |
| */ |
| public void getPipelineDocumentIngestDataMultiple( |
| IngestStatuses rval, |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String[] identifierClasses, String[] identifierHashes) |
| throws ManifoldCFException; |
| |
| /** Look up ingestion data for a document. |
| *@param rval is a map of output key to document data, in no particular order, which will be loaded with all matching results. |
| *@param pipelineSpecificationBasic is the pipeline specification for the document. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hash of the id of the document. |
| */ |
| public void getPipelineDocumentIngestData( |
| IngestStatuses rval, |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String identifierClass, String identifierHash) |
| throws ManifoldCFException; |
| |
| /** Calculate the average time interval between changes for a document. |
| * This is based on the data gathered for the document. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. |
| *@param identifierHashes is the hashes of the ids of the documents. |
| *@return the number of milliseconds between changes, or 0 if this cannot be calculated. |
| */ |
| public long[] getDocumentUpdateIntervalMultiple( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String[] identifierClasses, String[] identifierHashes) |
| throws ManifoldCFException; |
| |
| /** Calculate the average time interval between changes for a document. |
| * This is based on the data gathered for the document. |
| *@param pipelineSpecificationBasic is the basic pipeline specification. |
| *@param identifierClass is the name of the space in which the identifier hash should be interpreted. |
| *@param identifierHash is the hash of the id of the document. |
| *@return the number of milliseconds between changes, or 0 if this cannot be calculated. |
| */ |
| public long getDocumentUpdateInterval( |
| IPipelineSpecificationBasic pipelineSpecificationBasic, |
| String identifierClass, String identifierHash) |
| throws ManifoldCFException; |
| |
| /** Reset all documents belonging to a specific output connection, because we've got information that |
| * that system has been reconfigured. This will force all such documents to be reindexed the next time |
| * they are checked. |
| *@param outputConnectionName is the name of the output connection associated with this action. |
| */ |
| public void resetOutputConnection(String outputConnectionName) |
| throws ManifoldCFException; |
| |
| /** Remove all knowledge of an output index from the system. This is appropriate |
| * when the output index no longer exists and you wish to delete the associated job. |
| *@param outputConnectionName is the name of the output connection associated with this action. |
| */ |
| public void removeOutputConnection(String outputConnectionName) |
| throws ManifoldCFException; |
| |
| } |