blob: 33834645db8f371f05cbbc8716294c78dfe1f47c [file] [log] [blame]
/* $Id: IJobManager.java 991295 2010-08-31 19:12:14Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.interfaces;
import org.apache.manifoldcf.core.interfaces.*;
import java.util.ArrayList;
/** This manager deals with jobs. Each job is associated with a repository connection, and has a number
* of scheduling options: starting every n hours/days/weeks/months, on specific dates, or "continuous" (which basically
* establishes a priority queue based on modification frequency).
* The job itself also specifies "seeds" (or starting points), which are the places that scanning begins.
* NOTE WELL: Every job is incremental. This means that the job will check for deletions among all the documents
* that it has scanned in the past, as part of the process of ingesting.
*/
public interface IJobManager
{
/** Revision-control identification string ($Id$ keyword expansion) for this source file. */
public static final String _rcsid = "@(#)$Id: IJobManager.java 991295 2010-08-31 19:12:14Z kwright $";
// Actions, for continuous crawling.
// These are the choices documented for the "action"/"actions" arguments of requeueDocument() and
// requeueDocumentMultiple(): what should be done with a document when its scheduled time arrives.
public static final int ACTION_RESCAN = 0;
public static final int ACTION_REMOVE = 1;
// Document states, for status reports.
// Distinct integer codes. NOTE(review): judging by the names alone, they distinguish documents that
// were never processed, were processed previously, or are out of scope -- confirm the exact meaning
// against the status-report code that produces these values.
public static final int DOCSTATE_NEVERPROCESSED = 0;
public static final int DOCSTATE_PREVIOUSLYPROCESSED = 1;
public static final int DOCSTATE_OUTOFSCOPE = 2;
// Document statuses, for status reports.
// Distinct integer codes, separate from the DOCSTATE_* values above.
// NOTE(review): the precise queue condition each status maps to is not visible in this interface;
// confirm against the status-report code before relying on the names.
public static final int DOCSTATUS_INACTIVE = 0;
public static final int DOCSTATUS_PROCESSING = 1;
public static final int DOCSTATUS_EXPIRING = 2;
public static final int DOCSTATUS_DELETING = 3;
public static final int DOCSTATUS_READYFORPROCESSING = 4;
public static final int DOCSTATUS_READYFOREXPIRATION = 5;
public static final int DOCSTATUS_WAITINGFORPROCESSING = 6;
public static final int DOCSTATUS_WAITINGFOREXPIRATION = 7;
public static final int DOCSTATUS_WAITINGFOREVER = 8;
public static final int DOCSTATUS_HOPCOUNTEXCEEDED = 9;
/** Install the job manager's tables.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void install()
throws ManifoldCFException;
/** Uninstall the job manager's tables.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void deinstall()
throws ManifoldCFException;
/** Export configuration.
*@param os is the output stream that receives the exported configuration.
*@throws java.io.IOException if an I/O error occurs.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void exportConfiguration(java.io.OutputStream os)
throws java.io.IOException, ManifoldCFException;
/** Import configuration.
*@param is is the input stream from which the configuration is read.
*@throws java.io.IOException if an I/O error occurs.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void importConfiguration(java.io.InputStream is)
throws java.io.IOException, ManifoldCFException;
/** Load a sorted list of job descriptions.
*@return the list, sorted by description.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IJobDescription[] getAllJobs()
throws ManifoldCFException;
/** Create a new job.
*@return the new job.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IJobDescription createJob()
throws ManifoldCFException;
/** Delete a job.
* This method will purge all the records belonging to the job from the database, as
* well as remove all documents indexed by the job from the index.
*@param id is the job's identifier.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void deleteJob(Long id)
throws ManifoldCFException;
/** Load a job for editing.
*@param id is the job's identifier.
*@return the job description, or null if the job doesn't exist.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IJobDescription load(Long id)
throws ManifoldCFException;
/** Load a job.
*@param id is the job's identifier.
*@param readOnly is true if a read-only object is desired.
*@return the job description, or null if the job doesn't exist.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IJobDescription load(Long id, boolean readOnly)
throws ManifoldCFException;
/** Save a job.
*@param jobDescription is the job description.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void save(IJobDescription jobDescription)
throws ManifoldCFException;
/** See if there's a reference to a connection name.
* NOTE(review): output and transformation connections have their own check methods
* (checkIfOutputReference(), checkIfTransformationReference()), so this one presumably covers
* repository connections -- confirm against the implementation.
*@param connectionName is the name of the connection.
*@return true if there is a reference, false otherwise.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean checkIfReference(String connectionName)
throws ManifoldCFException;
/** See if there's a reference to an output connection name.
*@param connectionName is the name of the output connection.
*@return true if there is a reference, false otherwise.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean checkIfOutputReference(String connectionName)
throws ManifoldCFException;
/** See if there's a reference to a transformation connection name.
*@param connectionName is the name of the transformation connection.
*@return true if there is a reference, false otherwise.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean checkIfTransformationReference(String connectionName)
throws ManifoldCFException;
/** Get the jobs associated with a given connection name.
* Note that the jobs are returned as job descriptions, not as bare job identifiers.
*@param connectionName is the name of the connection.
*@return the descriptions of the jobs associated with that connection.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IJobDescription[] findJobsForConnection(String connectionName)
throws ManifoldCFException;
/** Clear job seeding state.
*@param jobID is the job ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void clearJobSeedingState(Long jobID)
throws ManifoldCFException;
// These methods cover activities that require interaction with the job queue.
// The job queue is maintained underneath this interface, and all threads that perform
// job activities need to go through this layer.
/** Reset the job queue for an individual process ID.
* If a node was shut down in the middle of doing something, sufficient information should
* be around in the database to allow the node's activities to be cleaned up.
*@param processID is the process ID of the node we want to clean up after.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void cleanupProcessData(String processID)
throws ManifoldCFException;
/** Reset the job queue for all process IDs.
* If a node was shut down in the middle of doing something, sufficient information should
* be around in the database to allow the node's activities to be cleaned up.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void cleanupProcessData()
throws ManifoldCFException;
/** Prepare to start the entire cluster.
* If there are no other nodes alive, then at the time the first node comes up, we need to
* reset the job queue for ALL processes that had been running before. This method must
* be called in addition to cleanupProcessData().
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void prepareForClusterStart()
throws ManifoldCFException;
/** Reset as part of restoring document worker threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDocumentWorkerStatus(String processID)
throws ManifoldCFException;
/** Reset as part of restoring seeding threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetSeedingWorkerStatus(String processID)
throws ManifoldCFException;
/** Reset as part of restoring doc delete threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDocDeleteWorkerStatus(String processID)
throws ManifoldCFException;
/** Reset as part of restoring doc cleanup threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDocCleanupWorkerStatus(String processID)
throws ManifoldCFException;
/** Reset as part of restoring delete startup threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDeleteStartupWorkerStatus(String processID)
throws ManifoldCFException;
/** Reset as part of restoring notification threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetNotificationWorkerStatus(String processID)
throws ManifoldCFException;
/** Reset as part of restoring startup threads.
*@param processID is the current process ID.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetStartupWorkerStatus(String processID)
throws ManifoldCFException;
// These methods support the "set doc priority" thread
/** Clear all document priorities, in preparation for reprioritization of all previously-prioritized documents.
* This method is called to start the dynamic reprioritization cycle, which follows this
* method with explicit prioritization of all documents, piece-meal, using getNextNotYetProcessedReprioritizationDocuments(),
* and writeDocumentPriorities().
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void clearAllDocumentPriorities()
throws ManifoldCFException;
/** Get a list of not-yet-processed documents to reprioritize. Documents in all jobs will be
* returned by this method. Up to n document descriptions will be returned.
*@param processID is the process that requests the reprioritization documents.
*@param n is the maximum number of document descriptions desired.
*@return the document descriptions.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] getNextNotYetProcessedReprioritizationDocuments(String processID, int n)
throws ManifoldCFException;
/** Save a set of document priorities. In the case where a document was eligible to have its
* priority set, but it no longer is eligible, then the provided priority will not be written.
*@param descriptions are the document descriptions.
*@param priorities are the desired priorities (presumably one per entry of descriptions, in the
* same order -- confirm against the implementation).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void writeDocumentPriorities(DocumentDescription[] descriptions, IPriorityCalculator[] priorities)
throws ManifoldCFException;
// This method supports the "expiration" thread
/** Get up to the next n documents to be expired.
* This method marks the documents whose descriptions have been returned as "being processed", or active.
* The same marking is used as is used for documents that have been queued for worker threads. The model
* is thus identical.
*
*@param processID is the current process ID.
*@param n is the maximum number of records desired.
*@param currentTime is the current time.
*@return the documents to expire, packaged as a DocumentSetAndFlags object (not a bare array of
* document descriptions).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentSetAndFlags getExpiredDocuments(String processID, int n, long currentTime)
throws ManifoldCFException;
// This method supports the "queue stuffer" thread
/** Get up to the next n document(s) to be fetched and processed.
* This fetch returns records that contain the document identifier, plus all instructions
* pertaining to the document's handling (e.g. whether it should be refetched if the version
* has not changed).
* This method also marks the documents whose descriptions have been returned as "being processed".
*@param processID is the current process ID.
*@param n is the number of documents desired.
*@param currentTime is the current time; some fetches do not occur until a specific time.
*@param interval is the number of milliseconds that this set of documents should represent (for throttling).
*@param blockingDocuments is the place to record documents that were encountered, are eligible for reprioritization,
* but could not be queued due to throttling considerations.
*@param statistics are the current performance statistics per connection, which are used to balance the queue stuffing
* so that individual connections are not overwhelmed.
*@param scanRecord retains the bins from all documents encountered from the query, even those that were skipped due
* to being overcommitted.
*@return the array of document descriptions to fetch and process.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] getNextDocuments(String processID,
int n, long currentTime, long interval,
BlockingDocuments blockingDocuments, PerformanceStatistics statistics,
DepthStatistics scanRecord)
throws ManifoldCFException;
// These methods support the individual fetch/process threads.
/** Verify that a specific job is indeed still active. This is used to permit abort or pause to be relatively speedy.
* Implementation requirement: the query done within MUST be cached in order to not cause undue
* performance degradation.
*@param jobID is the job identifier.
*@return true if the job is in one of the "active" states.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean checkJobActive(Long jobID)
throws ManifoldCFException;
/** Verify if a job is still processing documents, or no longer has any outstanding active documents.
*@param jobID is the job identifier.
*@return NOTE(review): presumably true while the job still has outstanding active documents, and
* false once it has none -- confirm against the implementation.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean checkJobBusy(Long jobID)
throws ManifoldCFException;
/** Note completion of document processing by a job thread of a set of documents.
* This method causes the state of each document to be marked as "completed".
*@param documentDescriptions are the description objects for the documents that were processed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void markDocumentCompletedMultiple(DocumentDescription[] documentDescriptions)
throws ManifoldCFException;
/** Note completion of document processing by a job thread of a single document.
* This method causes the state of the document to be marked as "completed".
*@param documentDescription is the description object for the document that was processed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void markDocumentCompleted(DocumentDescription documentDescription)
throws ManifoldCFException;
/** Delete from queue as a result of processing of a set of active documents.
* Each document is expected to be in one of the active states: ACTIVE, ACTIVESEEDING,
* ACTIVENEEDSRESCAN, ACTIVESEEDINGNEEDSRESCAN. The RESCAN variants are interpreted
* as meaning that the document should not be deleted, but should instead be popped back on the queue for
* a repeat processing attempt.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescriptions are the set of description objects for the documents that were processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentDeletedMultiple(Long jobID, String[] legalLinkTypes, DocumentDescription[] documentDescriptions,
int hopcountMethod)
throws ManifoldCFException;
/** Delete from queue as a result of processing of a single active document.
* The document is expected to be in one of the active states: ACTIVE, ACTIVESEEDING,
* ACTIVENEEDSRESCAN, ACTIVESEEDINGNEEDSRESCAN. The RESCAN variants are interpreted
* as meaning that the document should not be deleted, but should instead be popped back on the queue for
* a repeat processing attempt.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescription is the description object for the document that was processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentDeleted(Long jobID, String[] legalLinkTypes, DocumentDescription documentDescription,
int hopcountMethod)
throws ManifoldCFException;
/** Mark hopcount removal from queue as a result of processing of a set of active documents.
* Each document is expected to be in one of the active states: ACTIVE, ACTIVESEEDING,
* ACTIVENEEDSRESCAN, ACTIVESEEDINGNEEDSRESCAN. The RESCAN variants are interpreted
* as meaning that the document should not be marked as removed, but should instead be popped back on the queue for
* a repeat processing attempt.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescriptions are the set of description objects for the documents that were processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentHopcountRemovalMultiple(Long jobID, String[] legalLinkTypes, DocumentDescription[] documentDescriptions,
int hopcountMethod)
throws ManifoldCFException;
/** Mark hopcount removal from queue as a result of processing of a single active document.
* The document is expected to be in one of the active states: ACTIVE, ACTIVESEEDING,
* ACTIVENEEDSRESCAN, ACTIVESEEDINGNEEDSRESCAN. The RESCAN variants are interpreted
* as meaning that the document should not be marked as removed, but should instead be popped back on the queue for
* a repeat processing attempt.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescription is the description object for the document that was processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentHopcountRemoval(Long jobID, String[] legalLinkTypes, DocumentDescription documentDescription,
int hopcountMethod)
throws ManifoldCFException;
/** Delete from queue as a result of expiration of a set of active documents.
* Each document is expected to be in one of the active states: ACTIVE, ACTIVESEEDING,
* ACTIVENEEDSRESCAN, ACTIVESEEDINGNEEDSRESCAN. Since the document expired,
* no special activity takes place as a result of the document being in a RESCAN state.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescriptions are the set of description objects for the documents that were processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentExpiredMultiple(Long jobID, String[] legalLinkTypes, DocumentDescription[] documentDescriptions,
int hopcountMethod)
throws ManifoldCFException;
/** Delete from queue as a result of expiration of a single active document.
* The document is expected to be in one of the active states: ACTIVE, ACTIVESEEDING,
* ACTIVENEEDSRESCAN, ACTIVESEEDINGNEEDSRESCAN. Since the document expired,
* no special activity takes place as a result of the document being in a RESCAN state.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescription is the description object for the document that was processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentExpired(Long jobID, String[] legalLinkTypes, DocumentDescription documentDescription,
int hopcountMethod)
throws ManifoldCFException;
/** Delete from queue as a result of cleaning up a set of unreachable documents.
* Each document is expected to be in the PURGATORY state. There is never any need to reprocess the
* document.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescriptions are the set of description objects for the documents that were processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentCleanedUpMultiple(Long jobID, String[] legalLinkTypes, DocumentDescription[] documentDescriptions,
int hopcountMethod)
throws ManifoldCFException;
/** Delete from queue as a result of cleaning up a single unreachable document.
* The document is expected to be in the PURGATORY state. There is never any need to reprocess the
* document.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param documentDescription is the description object for the document that was processed.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] markDocumentCleanedUp(Long jobID, String[] legalLinkTypes, DocumentDescription documentDescription,
int hopcountMethod)
throws ManifoldCFException;
/** Requeue a document set because of carrydown changes.
* This method is called when carrydown data is modified for a set of documents. The documents must be requeued for immediate reprocessing, even to the
* extent that if one is *already* being processed, it will need to be done over again.
*@param documentDescriptions is the set of description objects for the documents that have had their parent carrydown information changed.
*@param docPriorities are the document priorities to assign to the documents, if needed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void carrydownChangeDocumentMultiple(DocumentDescription[] documentDescriptions, IPriorityCalculator[] docPriorities)
throws ManifoldCFException;
/** Requeue a document because of carrydown changes.
* This method is called when carrydown data is modified for a document. The document must be requeued for immediate reprocessing, even to the
* extent that if it is *already* being processed, it will need to be done over again.
*@param documentDescription is the description object for the document that has had its parent carrydown information changed.
*@param docPriority is the document priority to assign to the document, if needed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void carrydownChangeDocument(DocumentDescription documentDescription, IPriorityCalculator docPriority)
throws ManifoldCFException;
/** Requeue a set of documents for further processing in the future.
* This method is called after documents are processed, when the job is a "continuous" one.
* It is essentially equivalent to noting that the document processing is complete, except the
* documents remain on the queue.
*@param documentDescriptions is the set of description objects for the documents that were processed.
*@param executeTimes are the times that the documents should be rescanned. Null indicates "never".
*@param actions are what should be done when the time arrives. Choices are ACTION_RESCAN or ACTION_REMOVE.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void requeueDocumentMultiple(DocumentDescription[] documentDescriptions, Long[] executeTimes,
int[] actions)
throws ManifoldCFException;
/** Requeue a single document for further processing in the future.
* This method is called after a document is processed, when the job is a "continuous" one.
* It is essentially equivalent to noting that the document processing is complete, except the
* document remains on the queue.
*@param documentDescription is the description object for the document that was processed.
*@param executeTime is the time that the document should be rescanned. Null indicates "never".
*@param action is what should be done when the time arrives. Choices are ACTION_RESCAN or ACTION_REMOVE.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void requeueDocument(DocumentDescription documentDescription, Long executeTime,
int action)
throws ManifoldCFException;
/** Reset documents for further processing in the future.
* This method is called after a service interruption is thrown.
* It is essentially equivalent to resetting the time for documents to be reprocessed.
*@param documentDescriptions is the set of description objects for the documents that were processed.
*@param executeTime is the time that the documents should be rescanned.
*@param action NOTE(review): not described in the original documentation; presumably one of
* ACTION_RESCAN or ACTION_REMOVE, as for requeueDocumentMultiple() -- confirm.
*@param failTime is the time beyond which hard failure should occur.
*@param failCount is the number of permitted failures before a hard error is signalled.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDocumentMultiple(DocumentDescription[] documentDescriptions, long executeTime,
int action, long failTime, int failCount)
throws ManifoldCFException;
/** Reset an active document back to its former state.
* This gets done when there's a service interruption and the document cannot be processed yet.
*@param documentDescription is the description object for the document that was processed.
*@param executeTime is the time that the document should be rescanned.
*@param action NOTE(review): not described in the original documentation; presumably one of
* ACTION_RESCAN or ACTION_REMOVE, as for requeueDocument() -- confirm.
*@param failTime is the time that the document should be considered to have failed, if it has not been
* successfully read until then.
*@param failCount is the number of permitted failures before a hard error is signalled.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDocument(DocumentDescription documentDescription, long executeTime, int action, long failTime,
int failCount)
throws ManifoldCFException;
/** Reset a set of deleting documents for further processing in the future.
* This method is called after some unknown number of the documents were deleted, but then an ingestion service interruption occurred.
* Note well: The logic here basically presumes that we cannot know whether the documents were indeed processed or not.
* If we knew for a fact that none of the documents had been handled, it would be possible to look at the document's
* current status and decide what the new status ought to be, based on a true rollback scenario. Such cases, however, are rare enough so that
* special logic is probably not worth it.
*@param documentDescriptions is the set of description objects for the documents that were processed.
*@param checkTime is the minimum time for the next cleaning attempt.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDeletingDocumentMultiple(DocumentDescription[] documentDescriptions, long checkTime)
throws ManifoldCFException;
/** Reset a deleting document back to its former state.
* This gets done when a deleting thread sees a service interruption, etc., from the ingestion system.
*@param documentDescription is the description object for the document that was being deleted.
*@param checkTime is the minimum time for the next cleaning attempt.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDeletingDocument(DocumentDescription documentDescription, long checkTime)
throws ManifoldCFException;
/** Reset a cleaning document back to its former state.
* This gets done when a cleaning thread sees a service interruption, etc., from the ingestion system.
*@param documentDescription is the description object for the document that was being cleaned.
*@param checkTime is the minimum time for the next cleaning attempt.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetCleaningDocument(DocumentDescription documentDescription, long checkTime)
throws ManifoldCFException;
/** Reset a set of cleaning documents for further processing in the future.
* This method is called after some unknown number of the documents were cleaned, but then an ingestion service interruption occurred.
* Note well: The logic here basically presumes that we cannot know whether the documents were indeed cleaned or not.
* If we knew for a fact that none of the documents had been handled, it would be possible to look at the document's
* current status and decide what the new status ought to be, based on a true rollback scenario. Such cases, however, are rare enough so that
* special logic is probably not worth it.
*@param documentDescriptions is the set of description objects for the documents that were being cleaned.
*@param checkTime is the minimum time for the next cleaning attempt.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetCleaningDocumentMultiple(DocumentDescription[] documentDescriptions, long checkTime)
throws ManifoldCFException;
/** Retry startup.
*@param jobStartRecord is the current job startup record.
*@param failTime is the new fail time (-1L if none).
*@param failRetryCount is the new fail retry count (-1 if none).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void retryStartup(JobStartRecord jobStartRecord, long failTime, int failRetryCount)
throws ManifoldCFException;
/** Retry seeding.
*@param jobSeedingRecord is the current job seeding record.
*@param failTime is the new fail time (-1L if none).
*@param failRetryCount is the new fail retry count (-1 if none).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void retrySeeding(JobSeedingRecord jobSeedingRecord, long failTime, int failRetryCount)
throws ManifoldCFException;
/** Retry notification.
*@param jobNotifyRecord is the current job notification record.
*@param failTime is the new fail time (-1L if none).
*@param failRetryCount is the new fail retry count (-1 if none).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void retryNotification(JobNotifyRecord jobNotifyRecord, long failTime, int failRetryCount)
throws ManifoldCFException;
/** Retry delete notification.
*@param jnr is the current job notification record.
*@param failTime is the new fail time (-1L if none).
*@param failCount is the new fail retry count (-1 if none).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void retryDeleteNotification(JobNotifyRecord jnr, long failTime, int failCount)
throws ManifoldCFException;
/** Add an initial set of documents to the queue.
* This method is called during job startup, when the queue is being loaded.
* A set of document references is passed to this method, which updates the status of each document
* in the specified job's queue, according to specific state rules.
*@param processID is the current process ID.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param docIDHashes are the hashes of the local document identifiers (primary key).
*@param docIDs are the local document identifiers.
*@param overrideSchedule is true if any existing document schedule should be overridden.
*@param hopcountMethod is either accurate, nodelete, or neverdelete.
*@param documentPriorities are the document priorities corresponding to the document identifiers.
*@param prereqEventNames are the events that must be completed before each document can be processed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void addDocumentsInitial(String processID,
Long jobID, String[] legalLinkTypes,
String[] docIDHashes, String[] docIDs, boolean overrideSchedule,
int hopcountMethod, IPriorityCalculator[] documentPriorities,
String[][] prereqEventNames)
throws ManifoldCFException;
/** Add an initial set of remaining documents to the queue.
* This method is called during job startup, when the queue is being loaded, to list documents that
* were NOT included by calling addDocumentsInitial(). Documents listed here are simply designed to
* enable the framework to get rid of old, invalid seeds. They are not queued for processing.
*@param processID is the current process ID.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param docIDHashes are the hash values of the local document identifiers.
*@param hopcountMethod is either accurate, nodelete, or neverdelete.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void addRemainingDocumentsInitial(String processID,
Long jobID, String[] legalLinkTypes,
String[] docIDHashes,
int hopcountMethod)
throws ManifoldCFException;
/** Signal that a seeding pass has been done.
* Call this method at the end of a seeding pass. It is used to perform the bookkeeping necessary to
* maintain the hopcount table.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param isPartial is set if the seeds provided are only a partial list. Some connectors cannot
* supply a full list of seeds on every seeding iteration; this acknowledges that limitation.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void doneDocumentsInitial(Long jobID, String[] legalLinkTypes, boolean isPartial,
int hopcountMethod)
throws ManifoldCFException;
/** Begin an event sequence.
*@param processID is the current process ID.
*@param eventName is the name of the event.
*@return true if the event could be created, or false if it's already there.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean beginEventSequence(String processID, String eventName)
throws ManifoldCFException;
/** Complete an event sequence.
*@param eventName is the name of the event.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void completeEventSequence(String eventName)
throws ManifoldCFException;
/** Check the specified hop counts against the limit as described.
* Note that the result is a per-document boolean, not a numeric distance.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param docIDHashes is the set of document hashes to find the hopcount for.
*@param linkType is the kind of link to find the hopcount for.
*@param limit is the limit against which each document's hopcount is tested (see the return value).
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return a vector of booleans corresponding to the documents requested. A true value is returned
* if the document is within the specified limit, false otherwise.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean[] findHopCounts(Long jobID, String[] legalLinkTypes, String[] docIDHashes, String linkType, int limit,
int hopcountMethod)
throws ManifoldCFException;
/** Get all the current seeds.
* Returns the seed document identifier hashes for a job.
*@param jobID is the job identifier.
*@return the document identifier hashes that are currently considered to be seeds.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public String[] getAllSeeds(Long jobID)
throws ManifoldCFException;
/** Add a document to the queue.
* This method is called during document processing, when a document reference is discovered.
* The document reference is passed to this method, which updates the status of the document
* in the specified job's queue, according to specific state rules.
*@param processID is the current process ID.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param docIDHash is the local document identifier hash value.
*@param docID is the local document identifier.
*@param parentIdentifierHash is the optional parent identifier hash value for this document. Pass null if none.
* MUST be present in the case of carrydown information.
*@param relationshipType is the optional link type between this document and its parent. Pass null if there
* is no relationship with a parent.
*@param hopcountMethod is either accurate, nodelete, or neverdelete.
*@param dataNames are the names of the data to carry down to the child from this parent.
*@param dataValues are the values to carry down to the child from this parent, corresponding to dataNames above. If CharacterInput objects are passed in here,
* it is the caller's responsibility to clean these up.
*@param priority is the desired document priority for the document.
*@param prereqEventNames are the events that must be completed before the document can be processed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void addDocument(String processID,
Long jobID, String[] legalLinkTypes,
String docIDHash, String docID,
String parentIdentifierHash,
String relationshipType,
int hopcountMethod, String[] dataNames, Object[][] dataValues,
IPriorityCalculator priority, String[] prereqEventNames)
throws ManifoldCFException;
/** Add documents to the queue in bulk.
* This method is called during document processing, when a set of document references are discovered.
* The document references are passed to this method, which updates the status of the document(s)
* in the specified job's queue, according to specific state rules.
*@param processID is the current process ID.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param docIDHashes are the hashes of the local document identifiers.
*@param docIDs are the local document identifiers.
*@param parentIdentifierHash is the optional parent identifier hash of these documents. Pass null if none.
* MUST be present in the case of carrydown information.
*@param relationshipType is the optional link type between these documents and their parent. Pass null if there
* is no relationship with a parent.
*@param hopcountMethod is either accurate, nodelete, or neverdelete.
*@param dataNames are the names of the data to carry down to the child from this parent.
*@param dataValues are the values to carry down to the child from this parent, corresponding to dataNames above. If CharacterInput objects are passed in here,
* it is the caller's responsibility to clean these up.
*@param priorities are the desired document priorities for the documents.
*@param prereqEventNames are the events that must be completed before each document can be processed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void addDocuments(String processID,
Long jobID, String[] legalLinkTypes,
String[] docIDHashes, String[] docIDs,
String parentIdentifierHash,
String relationshipType,
int hopcountMethod, String[][] dataNames, Object[][][] dataValues,
IPriorityCalculator[] priorities,
String[][] prereqEventNames)
throws ManifoldCFException;
/** Complete adding child documents to the queue, for a set of documents.
* This method is called at the end of document processing, to help the hopcount tracking engine do its bookkeeping.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param parentIdentifierHashes are the hashes of the document identifiers for whom child link extraction just took place.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@return the set of documents for which carrydown data was changed by this operation. These documents are likely
* to be requeued as a result of the change.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] finishDocuments(Long jobID, String[] legalLinkTypes,
String[] parentIdentifierHashes, int hopcountMethod)
throws ManifoldCFException;
/** Undo the addition of child documents to the queue, for a set of documents.
* This method is called at the end of document processing, to back out any incomplete additions to the queue, and restore
* the status quo ante prior to the incomplete additions. Call this method instead of finishDocuments() if the
* addition of documents was not completed.
*@param jobID is the job identifier.
*@param legalLinkTypes is the set of legal link types that this connector generates.
*@param parentIdentifierHashes are the hashes of the document identifiers for whom child link extraction just took place.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void revertDocuments(Long jobID, String[] legalLinkTypes,
String[] parentIdentifierHashes)
throws ManifoldCFException;
/** Retrieve specific parent data for a given document, as strings.
*@param jobID is the job identifier.
*@param docIDHash is the hash of the document identifier.
*@param dataName is the kind of data to retrieve.
*@return the unique data values.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public String[] retrieveParentData(Long jobID, String docIDHash, String dataName)
throws ManifoldCFException;
/** Retrieve specific parent data for a given document, as CharacterInput objects.
*@param jobID is the job identifier.
*@param docIDHash is the document identifier hash value.
*@param dataName is the kind of data to retrieve.
*@return the unique data values, as CharacterInput objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public CharacterInput[] retrieveParentDataAsFiles(Long jobID, String docIDHash, String dataName)
throws ManifoldCFException;
// These methods support the job threads (which start jobs and end jobs)
// There is one thread that starts jobs. It simply looks for jobs which are ready to
// start, and changes their state accordingly.
// There is also a pool of threads that end jobs. These threads wait for a job that
// looks like it is done, and do completion processing if it is.
/** Manually start a job. The specified job will be run REGARDLESS of the timed windows, and
* will not cease until complete. If the job is already running, this operation will assure that
* the job does not pause when its window ends. The job can be manually paused, or manually aborted.
*@param jobID is the ID of the job to start.
*@param requestMinimum is true if a minimal job run is requested.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void manualStart(Long jobID, boolean requestMinimum)
throws ManifoldCFException;
/** Manually start a job. The specified job will be run REGARDLESS of the timed windows, and
* will not cease until complete. If the job is already running, this operation will assure that
* the job does not pause when its window ends. The job can be manually paused, or manually aborted.
* This overload does not take a requestMinimum flag.
*@param jobID is the ID of the job to start.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void manualStart(Long jobID)
throws ManifoldCFException;
/** Manually abort a running job. The job will be permanently stopped, and will not run again until
* automatically started based on schedule, or manually started.
*@param jobID is the job to abort.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void manualAbort(Long jobID)
throws ManifoldCFException;
/** Manually restart a running job. The job will be stopped and restarted. Any schedule affinity will be lost,
* until the job finishes on its own.
*@param jobID is the job to abort and restart.
*@param requestMinimum is true if a minimal job run is requested.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void manualAbortRestart(Long jobID, boolean requestMinimum)
throws ManifoldCFException;
/** Manually restart a running job. The job will be stopped and restarted. Any schedule affinity will be lost,
* until the job finishes on its own.
* This overload does not take a requestMinimum flag.
*@param jobID is the job to abort and restart.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void manualAbortRestart(Long jobID)
throws ManifoldCFException;
/** Pause a job.
*@param jobID is the job identifier to pause.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void pauseJob(Long jobID)
throws ManifoldCFException;
/** Restart a paused job.
*@param jobID is the job identifier to restart.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void restartJob(Long jobID)
throws ManifoldCFException;
/** Reset job schedule. This re-evaluates whether the job should be started now. This method would typically
* be called after a job's scheduling window has been changed.
*@param jobID is the job identifier.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetJobSchedule(Long jobID)
throws ManifoldCFException;
// These methods are called by automatic processes
/** Start jobs based on schedule.
* This method marks all the appropriate jobs as "in progress", which is all that should be
* needed to start them.
*@param currentTime is the current time in milliseconds since epoch.
*@param unwaitList is filled in with the set of job id's that were resumed (Long's).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void startJobs(long currentTime, ArrayList unwaitList)
throws ManifoldCFException;
/** Put active or paused jobs in wait state, if they've exceeded their window.
*@param currentTime is the current time in milliseconds since epoch.
*@param waitList is filled in with the set of job id's that were put into a wait state (Long's).
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void waitJobs(long currentTime, ArrayList waitList)
throws ManifoldCFException;
/** Get the list of jobs that are ready for seeding.
*@param processID is the current process ID.
*@param currentTime is the current time in milliseconds since epoch.
*@return jobs that are active and are running in adaptive mode. These will be seeded
* based on what the connector says should be added to the queue.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobSeedingRecord[] getJobsReadyForSeeding(String processID, long currentTime)
throws ManifoldCFException;
/** Reset a seeding job back to "active" state.
*@param jobID is the job id.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetSeedJob(Long jobID)
throws ManifoldCFException;
/** Get the list of jobs that are ready for delete cleanup.
*@param processID is the current process ID.
*@return jobs that were in the "readyfordelete" state.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobDeleteRecord[] getJobsReadyForDeleteCleanup(String processID)
throws ManifoldCFException;
/** Get the list of jobs that are ready for startup.
*@param processID is the current process ID.
*@return jobs that were in the "readyforstartup" state. These will be marked as being in the "starting up" state.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStartRecord[] getJobsReadyForStartup(String processID)
throws ManifoldCFException;
/** Find the list of jobs that need to have their connectors notified of job completion.
*@param processID is the current process ID.
*@return records for the jobs that need their output connectors notified in order to become inactive.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobNotifyRecord[] getJobsReadyForInactivity(String processID)
throws ManifoldCFException;
/** Find the list of jobs that need to have their connectors notified of job deletion.
*@param processID is the current process ID.
*@return records for the jobs that need their output connectors notified in order to be removed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobNotifyRecord[] getJobsReadyForDelete(String processID)
throws ManifoldCFException;
/** Inactivate a job, from the notification state.
*@param jobID is the ID of the job to inactivate.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void inactivateJob(Long jobID)
throws ManifoldCFException;
/** Remove a job, from the notification state.
*@param jobID is the ID of the job to remove.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void removeJob(Long jobID)
throws ManifoldCFException;
/** Reset a job starting for delete back to "ready for delete"
* state.
*@param jobID is the job id.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetStartDeleteJob(Long jobID)
throws ManifoldCFException;
/** Reset a job that is notifying back to "ready for notify"
* state.
*@param jobID is the job id.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetNotifyJob(Long jobID)
throws ManifoldCFException;
/** Reset a job that is delete notifying back to "ready for delete notify"
* state.
*@param jobID is the job id.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetDeleteNotifyJob(Long jobID)
throws ManifoldCFException;
/** Reset a starting job back to "ready for startup" state.
*@param jobID is the job id.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetStartupJob(Long jobID)
throws ManifoldCFException;
/** Prepare for a delete scan.
*@param jobID is the job id.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void prepareDeleteScan(Long jobID)
throws ManifoldCFException;
/** Prepare a job to be run.
* This method is called regardless of the details of the job; what differs is only the flags that are passed in.
* The code inside will determine the appropriate procedures.
* (This method replaces prepareFullScan() and prepareIncrementalScan(). )
*@param jobID is the job id.
*@param legalLinkTypes are the link types allowed for the job.
*@param hopcountMethod describes how to handle deletions for hopcount purposes.
*@param connectorModel is the model used by the connector for the job.
*@param continuousJob is true if the job is a continuous one.
*@param fromBeginningOfTime is true if the job is running starting from time 0.
*@param requestMinimum is true if the minimal amount of work is requested for the job run.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void prepareJobScan(Long jobID, String[] legalLinkTypes, int hopcountMethod,
int connectorModel, boolean continuousJob, boolean fromBeginningOfTime,
boolean requestMinimum)
throws ManifoldCFException;
/** Note job delete started.
*@param jobID is the job id.
*@param startTime is the job start time.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteJobDeleteStarted(Long jobID, long startTime)
throws ManifoldCFException;
/** Note job started.
*@param jobID is the job id.
*@param startTime is the job start time.
*@param seedingVersion is the seeding version to record with the job start.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteJobStarted(Long jobID, long startTime, String seedingVersion)
throws ManifoldCFException;
/** Note job seeded.
*@param jobID is the job id.
*@param seedingVersion is the seeding version string to record.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteJobSeeded(Long jobID, String seedingVersion)
throws ManifoldCFException;
/** Note the deregistration of a connector used by the specified connections.
* This method will be called when the connector is deregistered. Jobs that use these connections
* must therefore enter appropriate states.
*@param connectionNames is the set of connection names.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteConnectorDeregistration(String[] connectionNames)
throws ManifoldCFException;
/** Note the registration of a connector used by the specified connections.
* This method will be called when a connector is registered, on which the specified
* connections depend.
*@param connectionNames is the set of connection names.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteConnectorRegistration(String[] connectionNames)
throws ManifoldCFException;
/** Note a change in connection configuration.
* This method will be called whenever a connection's configuration is modified, or when an external repository change
* is signalled.
*@param connectionName is the name of the connection that changed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteConnectionChange(String connectionName)
throws ManifoldCFException;
/** Note the deregistration of an output connector used by the specified connections.
* This method will be called when the connector is deregistered. Jobs that use these connections
* must therefore enter appropriate states.
*@param connectionNames is the set of connection names.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteOutputConnectorDeregistration(String[] connectionNames)
throws ManifoldCFException;
/** Note the registration of an output connector used by the specified connections.
* This method will be called when a connector is registered, on which the specified
* connections depend.
*@param connectionNames is the set of connection names.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteOutputConnectorRegistration(String[] connectionNames)
throws ManifoldCFException;
/** Note a change in output connection configuration.
* This method will be called whenever a connection's configuration is modified, or when an external output target change
* is signalled.
*@param connectionName is the name of the output connection that changed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteOutputConnectionChange(String connectionName)
throws ManifoldCFException;
/** Note the deregistration of a transformation connector used by the specified connections.
* This method will be called when the connector is deregistered. Jobs that use these connections
* must therefore enter appropriate states.
*@param connectionNames is the set of connection names.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteTransformationConnectorDeregistration(String[] connectionNames)
throws ManifoldCFException;
/** Note the registration of a transformation connector used by the specified connections.
* This method will be called when a connector is registered, on which the specified
* connections depend.
*@param connectionNames is the set of connection names.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteTransformationConnectorRegistration(String[] connectionNames)
throws ManifoldCFException;
/** Note a change in transformation connection configuration.
* This method will be called whenever a connection's configuration is modified.
*@param connectionName is the name of the transformation connection that changed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void noteTransformationConnectionChange(String connectionName)
throws ManifoldCFException;
/** Assess jobs marked to be in need of assessment for connector status changes.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void assessMarkedJobs()
throws ManifoldCFException;
/** Delete jobs in need of being deleted (which are marked "ready for delete").
* This method is meant to be called periodically to perform delete processing on jobs.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void deleteJobsReadyForDelete()
throws ManifoldCFException;
/** Get list of deletable document descriptions. This list will take into account
* multiple jobs that may own the same document.
*@param processID is the current process ID.
*@param n is the maximum number of documents to return.
*@param currentTime is the current time; some fetches do not occur until a specific time.
*@return the document descriptions for these documents.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentDescription[] getNextDeletableDocuments(String processID,
int n, long currentTime)
throws ManifoldCFException;
/** Get list of cleanable document descriptions. This list will take into account
* multiple jobs that may own the same document.
*@param processID is the current process ID.
*@param n is the maximum number of documents to return.
*@param currentTime is the current time; some fetches do not occur until a specific time.
*@return the cleanable documents, packaged as a DocumentSetAndFlags object.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public DocumentSetAndFlags getNextCleanableDocuments(String processID,
int n, long currentTime)
throws ManifoldCFException;
/** Delete ingested document identifiers (as part of deleting the owning job).
* The number of identifiers specified is guaranteed to be less than the maxInClauseCount
* for the database.
*@param identifiers is the set of document identifiers.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void deleteIngestedDocumentIdentifiers(DocumentDescription[] identifiers)
throws ManifoldCFException;
/** Abort a running job due to a fatal error condition.
*@param jobID is the job to abort.
*@param errorText is the error text.
*@return true if this is the first abort for the job.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public boolean errorAbort(Long jobID, String errorText)
throws ManifoldCFException;
/** Complete the sequence that stops jobs, either for abort, pause, or because of a scheduling
* window. The logic will move the job to its next state (INACTIVE, PAUSED, ACTIVEWAIT),
* and will record the jobs that have been so modified.
*@param timestamp is the current time in milliseconds since epoch.
*@param modifiedJobs is filled in with the set of IJobDescription objects that were stopped.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void finishJobStops(long timestamp, ArrayList modifiedJobs)
throws ManifoldCFException;
/** Complete the sequence that resumes jobs, either from a pause or from a scheduling window
* wait. The logic will restore the job to an active state (many possibilities depending on
* connector status), and will record the jobs that have been so modified.
*@param timestamp is the current time in milliseconds since epoch.
*@param modifiedJobs is filled in with the set of IJobDescription objects that were resumed.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void finishJobResumes(long timestamp, ArrayList modifiedJobs)
throws ManifoldCFException;
/** Put all eligible jobs in the "shutting down" state.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void finishJobs()
throws ManifoldCFException;
/** Reset eligible jobs either back to the "inactive" state, or make them active again. The
* latter will occur if the cleanup phase of the job generated more pending documents.
*
* This method is used to pick up all jobs in the shutting down state
* whose purgatory or being-cleaned records have been all processed.
*
*@param currentTime is the current time in milliseconds since epoch.
*@param resetJobs is filled in with the set of IJobDescription objects that were reset.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public void resetJobs(long currentTime, ArrayList resetJobs)
throws ManifoldCFException;
// Status reports
/** Get the status of a job.
* This overload takes no includeCounts or maxCount parameters.
*@param jobID is the job ID.
*@return the status object for the specified job.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus getStatus(Long jobID)
throws ManifoldCFException;
/** Get a list of all jobs, and their status information.
* This overload takes no includeCounts or maxCount parameters.
*@return an ordered array of job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getAllStatus()
throws ManifoldCFException;
/** Get a list of running jobs. This is for status reporting.
* This overload takes no includeCounts or maxCount parameters.
*@return an array of the job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getRunningJobs()
throws ManifoldCFException;
/** Get a list of completed jobs, and their statistics.
* This overload takes no includeCounts or maxCount parameters.
*@return an array of the job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getFinishedJobs()
throws ManifoldCFException;
/** Get the status of a job, optionally including document counts.
*@param jobID is the job ID.
*@param includeCounts is true if document counts should be included.
*@return the status object for the specified job.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus getStatus(Long jobID, boolean includeCounts)
throws ManifoldCFException;
/** Get a list of all jobs, and their status information, optionally including document counts.
*@param includeCounts is true if document counts should be included.
*@return an ordered array of job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getAllStatus(boolean includeCounts)
throws ManifoldCFException;
/** Get a list of running jobs, optionally including document counts. This is for status reporting.
*@param includeCounts is true if document counts should be included.
*@return an array of the job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getRunningJobs(boolean includeCounts)
throws ManifoldCFException;
/** Get a list of completed jobs, and their statistics, optionally including document counts.
*@param includeCounts is true if document counts should be included.
*@return an array of the job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getFinishedJobs(boolean includeCounts)
throws ManifoldCFException;
/** Get the status of a job, optionally including document counts up to a maximum.
*@param jobID is the job ID.
*@param includeCounts is true if document counts should be included.
*@param maxCount is the maximum number of documents we want to count for each status.
*@return the status object for the specified job.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus getStatus(Long jobID, boolean includeCounts, int maxCount)
throws ManifoldCFException;
/** Get a list of all jobs, and their status information, optionally including document counts up to a maximum.
*@param includeCounts is true if document counts should be included.
*@param maxCount is the maximum number of documents we want to count for each status.
*@return an ordered array of job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getAllStatus(boolean includeCounts, int maxCount)
throws ManifoldCFException;
/** Get a list of running jobs, optionally including document counts up to a maximum. This is for status reporting.
*@param includeCounts is true if document counts should be included.
*@param maxCount is the maximum number of documents we want to count for each status.
*@return an array of the job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getRunningJobs(boolean includeCounts, int maxCount)
throws ManifoldCFException;
/** Get a list of completed jobs, and their statistics, optionally including document counts up to a maximum.
*@param includeCounts is true if document counts should be included.
*@param maxCount is the maximum number of documents we want to count for each status.
*@return an array of the job status objects.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public JobStatus[] getFinishedJobs(boolean includeCounts, int maxCount)
throws ManifoldCFException;
// The following commands generate reports based on the queue.
/** Run a 'document status' report.
*@param connectionName is the name of the connection.
*@param filterCriteria are the criteria used to limit the records considered for the report.
*@param sortOrder is the specified sort order of the final report.
*@param startRow is the first row to include.
*@param rowCount is the number of rows to include.
*@return the results, with the following columns: identifier, job, state, status, scheduled, action, retrycount, retrylimit. The "scheduled" column and the
* "retrylimit" column are long values representing a time; all other values will be user-friendly strings.
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IResultSet genDocumentStatus(String connectionName, StatusFilterCriteria filterCriteria, SortOrder sortOrder,
int startRow, int rowCount)
throws ManifoldCFException;
/** Run a 'queue status' report.
*@param connectionName is the name of the connection.
*@param filterCriteria are the criteria used to limit the records considered for the report.
*@param sortOrder is the specified sort order of the final report.
*@param idBucketDescription is the bucket description for generating the identifier class.
*@param startRow is the first row to include.
*@param rowCount is the number of rows to include.
*@return the results, with the following columns: idbucket, inactive, processing, expiring, deleting,
* processready, expireready, processwaiting, expirewaiting
*@throws ManifoldCFException if the operation cannot be completed.
*/
public IResultSet genQueueStatus(String connectionName, StatusFilterCriteria filterCriteria, SortOrder sortOrder,
BucketDescription idBucketDescription, int startRow, int rowCount)
throws ManifoldCFException;
}