/* blob: 331db8b8c4b4ed6fb627bce603d75529778b72f0 [file] [log] [blame] */
/* $Id: IntrinsicLink.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.jobs;
import java.util.*;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
/** This class manages the table that keeps track of intrinsic relationships between documents.
*
* <br><br>
* <b>intrinsiclink</b>
* <table border="1" cellpadding="3" cellspacing="0">
* <tr class="TableHeadingColor">
* <th>Field</th><th>Type</th><th>Description&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</th>
* <tr><td>jobid</td><td>BIGINT</td><td>Reference:jobs.id</td></tr>
* <tr><td>linktype</td><td>VARCHAR(255)</td><td></td></tr>
* <tr><td>parentidhash</td><td>VARCHAR(40)</td><td></td></tr>
* <tr><td>childidhash</td><td>VARCHAR(40)</td><td></td></tr>
* <tr><td>isnew</td><td>CHAR(1)</td><td></td></tr>
* <tr><td>processid</td><td>VARCHAR(16)</td><td></td></tr>
* </table>
* <br><br>
*
*/
public class IntrinsicLink extends org.apache.manifoldcf.core.database.BaseTable
{
public static final String _rcsid = "@(#)$Id: IntrinsicLink.java 988245 2010-08-23 18:39:35Z kwright $";
// "new" field values
/** The standard value for this field. Means that the link existed prior to this scan, and no new link
* was found yet. */
protected static final int LINKSTATUS_BASE = 0;
/** This value means that the link is brand-new; it did not exist before this pass. */
protected static final int LINKSTATUS_NEW = 1;
/** This value means that the link existed before, and has been found during this scan. */
protected static final int LINKSTATUS_EXISTING = 2;
// Field names
public static final String jobIDField = "jobid";
public static final String linkTypeField = "linktype";
public static final String parentIDHashField = "parentidhash";
public static final String childIDHashField = "childidhash";
public static final String newField = "isnew";
public static final String processIDField = "processid";
// Map from string character to link status; the inverse of statusToString().
protected static Map linkstatusMap;
static
{
linkstatusMap = new HashMap();
linkstatusMap.put("B",new Integer(LINKSTATUS_BASE));
linkstatusMap.put("N",new Integer(LINKSTATUS_NEW));
linkstatusMap.put("E",new Integer(LINKSTATUS_EXISTING));
}
/** Constructor.
*@param database is the database handle.
*/
public IntrinsicLink(IDBInterface database)
throws ManifoldCFException
{
super(database,"intrinsiclink");
}
/** Install or upgrade.
*@param jobsTable is the name of the jobs table, referenced by the jobid foreign key.
*@param jobsColumn is the name of the jobs table column that the jobid foreign key points at.
*/
public void install(String jobsTable, String jobsColumn)
throws ManifoldCFException
{
// Creating a unique index as part of upgrading could well fail, so we must have the ability to fix things up and retry if that happens.
while (true)
{
// Schema
Map existing = getTableSchema(null,null);
if (existing == null)
{
// Table does not yet exist; create it from scratch.
HashMap map = new HashMap();
map.put(jobIDField,new ColumnDescription("BIGINT",false,false,jobsTable,jobsColumn,false));
map.put(linkTypeField,new ColumnDescription("VARCHAR(255)",false,true,null,null,false));
map.put(parentIDHashField,new ColumnDescription("VARCHAR(40)",false,false,null,null,false));
map.put(childIDHashField,new ColumnDescription("VARCHAR(40)",false,true,null,null,false));
map.put(newField,new ColumnDescription("CHAR(1)",false,true,null,null,false));
map.put(processIDField,new ColumnDescription("VARCHAR(16)",false,true,null,null,false));
performCreate(map,null);
}
else
{
// Perform upgrade, if needed.
// Older schemas lack the processid column; add it when absent.
if (existing.get(processIDField) == null)
{
Map insertMap = new HashMap();
insertMap.put(processIDField,new ColumnDescription("VARCHAR(16)",false,true,null,null,false));
performAlter(insertMap,null,null,null);
}
}
// Indexes.  Walk the existing indexes: cross off each required one that is already present,
// and drop any unrecognized index (except the primary key).
IndexDescription uniqueIndex = new IndexDescription(true,new String[]{jobIDField,parentIDHashField,linkTypeField,childIDHashField});
IndexDescription jobChildNewIndex = new IndexDescription(false,new String[]{jobIDField,childIDHashField,newField});
IndexDescription newIndex = new IndexDescription(false,new String[]{newField,processIDField});
Map indexes = getTableIndexes(null,null);
Iterator iter = indexes.keySet().iterator();
while (iter.hasNext())
{
String indexName = (String)iter.next();
IndexDescription id = (IndexDescription)indexes.get(indexName);
if (uniqueIndex != null && id.equals(uniqueIndex))
uniqueIndex = null;
else if (jobChildNewIndex != null && id.equals(jobChildNewIndex))
jobChildNewIndex = null;
else if (newIndex != null && id.equals(newIndex))
newIndex = null;
else if (indexName.indexOf("_pkey") == -1)
// This index shouldn't be here; drop it
performRemoveIndex(indexName);
}
// Create the indexes we are still missing
if (jobChildNewIndex != null)
performAddIndex(null,jobChildNewIndex);
if (newIndex != null)
performAddIndex(null,newIndex);
if (uniqueIndex != null)
performAddIndex(null,uniqueIndex);
// All done
break;
}
}
/** Uninstall: drop the table (and its indexes with it).
*/
public void deinstall()
throws ManifoldCFException
{
performDrop(null);
}
/** Analyze job tables that need analysis.
*/
public void analyzeTables()
throws ManifoldCFException
{
long startTime = System.currentTimeMillis();
Logging.perf.debug("Beginning to analyze intrinsiclink table");
analyzeTable();
Logging.perf.debug("Done analyzing intrinsiclink table in "+new Long(System.currentTimeMillis()-startTime)+" ms");
}
/** Delete an owner (and clean up the corresponding hopcount rows).
*@param jobID is the job identifier whose link rows are to be deleted.
*/
public void deleteOwner(Long jobID)
throws ManifoldCFException
{
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID)});
performDelete("WHERE "+query,list,null);
noteModifications(0,0,1);
}
/** Reset, at startup time.  Since links can only be added in a transactionally safe way by processing
* of documents, and cached records of hopcount are updated only when requested, it is safest to simply
* move any "new" or "new existing" links back to base state on startup.  Then, the next time that page
* is processed, the links will be updated properly.
*@param processID is the process to restart.
*/
public void restart(String processID)
throws ManifoldCFException
{
HashMap map = new HashMap();
map.put(newField,statusToString(LINKSTATUS_BASE));
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new MultiClause(newField,new Object[]{
statusToString(LINKSTATUS_NEW),
statusToString(LINKSTATUS_EXISTING)}),
new UnitaryClause(processIDField,processID)});
performUpdate(map,"WHERE "+query,list,null);
}
/** Clean up after all process IDs.
* Same as restart(processID), but resets "new"/"existing" links regardless of owning process.
*/
public void restart()
throws ManifoldCFException
{
HashMap map = new HashMap();
map.put(newField,statusToString(LINKSTATUS_BASE));
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new MultiClause(newField,new Object[]{
statusToString(LINKSTATUS_NEW),
statusToString(LINKSTATUS_EXISTING)})});
performUpdate(map,"WHERE "+query,list,null);
}
/** Clean up after a cluster-wide restart.  No cluster-scoped state is kept by this table,
* so this is deliberately a no-op.
*/
public void restartCluster()
throws ManifoldCFException
{
// Does nothing
}
/** Record references from a source document to target documents.  These references will be marked as either "new" or "existing".
* In the table, the source document is stored in the child column and each target in the parent column.
*@param jobID is the job identifier.
*@param sourceDocumentIDHash is the hash of the source (child) document identifier.
*@param targetDocumentIDHashes are the hashes of the target (parent) document identifiers; duplicates are tolerated.
*@param linkType is the type of all the links being recorded.
*@param processID is the process recording the links.
*@return the target document ID's that are considered "new".
*/
public String[] recordReferences(Long jobID, String sourceDocumentIDHash,
String[] targetDocumentIDHashes, String linkType, String processID)
throws ManifoldCFException
{
HashMap duplicateRemoval = new HashMap();
int maxClause = maxClausePerformExistsCheck(jobID,linkType,sourceDocumentIDHash);
ArrayList list = new ArrayList();
int i = 0;
int k = 0;
// Keep track of the document identifiers that have been seen vs. those that were unseen.
HashMap presentMap = new HashMap();
// Batch the existence checks: accumulate up to maxClause unique target hashes per query.
while (k < targetDocumentIDHashes.length)
{
String targetDocumentIDHash = targetDocumentIDHashes[k++];
if (duplicateRemoval.get(targetDocumentIDHash) != null)
continue;
duplicateRemoval.put(targetDocumentIDHash,targetDocumentIDHash);
if (i == maxClause)
{
// Do the query and record the results
performExistsCheck(presentMap,jobID,linkType,sourceDocumentIDHash,list);
i = 0;
list.clear();
}
list.add(targetDocumentIDHash);
i++;
}
if (i > 0)
performExistsCheck(presentMap,jobID,linkType,sourceDocumentIDHash,list);
// Go through the list again, and based on the results above, decide to do either an insert or
// an update.
// We have to count these by hand, in case there are duplicates in the array.
int count = 0;
Iterator iter = duplicateRemoval.keySet().iterator();
while (iter.hasNext())
{
String targetDocumentIDHash = (String)iter.next();
if (presentMap.get(targetDocumentIDHash) == null)
count++;
}
String[] newReferences = new String[count];
int j = 0;
// Note: May be able to make this more efficient if we update things in batches...
iter = duplicateRemoval.keySet().iterator();
while (iter.hasNext())
{
String targetDocumentIDHash = (String)iter.next();
if (presentMap.get(targetDocumentIDHash) == null)
{
// Link row not present yet: insert it in the "new" state.
newReferences[j++] = targetDocumentIDHash;
HashMap map = new HashMap();
map.put(jobIDField,jobID);
map.put(parentIDHashField,targetDocumentIDHash);
map.put(childIDHashField,sourceDocumentIDHash);
map.put(linkTypeField,linkType);
map.put(newField,statusToString(LINKSTATUS_NEW));
map.put(processIDField,processID);
performInsert(map,null);
noteModifications(1,0,0);
}
else
{
// Link row already exists: mark it "existing" and claim it for this process.
HashMap map = new HashMap();
map.put(newField,statusToString(LINKSTATUS_EXISTING));
map.put(processIDField,processID);
ArrayList updateList = new ArrayList();
String query = buildConjunctionClause(updateList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(parentIDHashField,targetDocumentIDHash),
new UnitaryClause(linkTypeField,linkType),
new UnitaryClause(childIDHashField,sourceDocumentIDHash)});
performUpdate(map,"WHERE "+query,updateList,null);
noteModifications(0,1,0);
}
}
return newReferences;
}
/** Calculate the max clauses for the exists check
*/
protected int maxClausePerformExistsCheck(Long jobID, String linkType, String childIDHash)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(linkTypeField,linkType),
new UnitaryClause(childIDHashField,childIDHash)});
}
/** Do the exists check, in batch.
* Matching rows are locked (SELECT ... FOR UPDATE) and their parent hashes recorded in presentMap,
* keyed (and valued) by the parent hash. */
protected void performExistsCheck(Map presentMap, Long jobID, String linkType, String childIDHash, ArrayList list)
throws ManifoldCFException
{
ArrayList newList = new ArrayList();
String query = buildConjunctionClause(newList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(parentIDHashField,list),
new UnitaryClause(linkTypeField,linkType),
new UnitaryClause(childIDHashField,childIDHash)});
IResultSet result = performQuery("SELECT "+parentIDHashField+" FROM "+getTableName()+" WHERE "+query+" FOR UPDATE",newList,null,null);
int i = 0;
while (i < result.getRowCount())
{
IResultRow row = result.getRow(i++);
String parentIDHash = (String)row.getValue(parentIDHashField);
presentMap.put(parentIDHash,parentIDHash);
}
}
/** Remove all links that mention a specific set of documents, as described by a join.
*@param jobID is the job identifier.
*@param joinTableName is the name of the table to join against.
*@param joinTableIDColumn is the join table column matched against this table's child document hash.
*@param joinTableJobColumn is the join table column holding the job identifier.
*@param joinTableCriteria is additional SQL criteria applied to the join table rows.
*@param joinTableParams are the query parameters for joinTableCriteria.
*/
public void removeDocumentLinks(Long jobID,
String joinTableName,
String joinTableIDColumn, String joinTableJobColumn,
String joinTableCriteria, ArrayList joinTableParams)
throws ManifoldCFException
{
// Delete matches for childIDHashField
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList list = new ArrayList();
sb.append("EXISTS(SELECT 'x' FROM ").append(joinTableName).append(" WHERE ")
.append(buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(joinTableJobColumn,jobID),
new JoinClause(joinTableIDColumn,getTableName()+"."+childIDHashField)})).append(" AND ");
sb.append(joinTableCriteria);
list.addAll(joinTableParams);
sb.append(")");
performDelete(sb.toString(),list,null);
noteModifications(0,0,1);
// DON'T delete ParentID matches; we need to leave those around for bookkeeping to
// be correct.  See CONNECTORS-501.
}
/** Remove all links that mention a specific set of documents.
*@param jobID is the job identifier.
*@param documentIDHashes are the hashes of the documents whose (child-side) links are removed.
*/
public void removeDocumentLinks(Long jobID,
String[] documentIDHashes)
throws ManifoldCFException
{
// Batch the deletes into chunks of at most maxClause hashes per query.
int maxClause = maxClausePerformRemoveDocumentLinks(jobID);
ArrayList list = new ArrayList();
int i = 0;
int k = 0;
while (i < documentIDHashes.length)
{
if (k == maxClause)
{
performRemoveDocumentLinks(list,jobID);
list.clear();
k = 0;
}
list.add(documentIDHashes[i++]);
k++;
}
if (k > 0)
performRemoveDocumentLinks(list,jobID);
noteModifications(0,0,documentIDHashes.length);
}
/** Calculate the maximum number of document hashes per removeDocumentLinks query. */
protected int maxClausePerformRemoveDocumentLinks(Long jobID)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID)});
}
/** Delete the rows for one batch of child document hashes, for the given job. */
protected void performRemoveDocumentLinks(ArrayList list, Long jobID)
throws ManifoldCFException
{
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList thisList = new ArrayList();
sb.append(buildConjunctionClause(thisList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(childIDHashField,list)}));
performDelete(sb.toString(),thisList,null);
// DON'T do parentID matches; we need to leave those around.  See CONNECTORS-501.
}
/** Remove all target links of the specified source documents that are not marked as "new" or "existing", and
* return the others to their base state.
*@param jobID is the job identifier.
*@param commonNewExpression is an optional additional SQL conjunct (may be null, in which case it is skipped).
*@param commonNewParams are the query parameters for commonNewExpression.
*@param sourceDocumentIDHashes are the hashes of the source (child) documents.
*/
public void removeLinks(Long jobID,
String commonNewExpression, ArrayList commonNewParams,
String[] sourceDocumentIDHashes)
throws ManifoldCFException
{
// Batch the deletes into chunks of at most maxClause hashes per query.
int maxClause = maxClausePerformRemoveLinks(jobID);
ArrayList list = new ArrayList();
int i = 0;
int k = 0;
while (i < sourceDocumentIDHashes.length)
{
if (k == maxClause)
{
performRemoveLinks(list,jobID,commonNewExpression,commonNewParams);
list.clear();
k = 0;
}
list.add(sourceDocumentIDHashes[i++]);
k++;
}
if (k > 0)
performRemoveLinks(list,jobID,commonNewExpression,commonNewParams);
noteModifications(0,0,sourceDocumentIDHashes.length);
}
/** Calculate the maximum number of document hashes per removeLinks query. */
protected int maxClausePerformRemoveLinks(Long jobID)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID)});
}
/** Delete the rows for one batch of child document hashes, optionally further restricted
* by commonNewExpression (when non-null). */
protected void performRemoveLinks(ArrayList list, Long jobID, String commonNewExpression,
ArrayList commonNewParams)
throws ManifoldCFException
{
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList thisList = new ArrayList();
sb.append(buildConjunctionClause(thisList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(childIDHashField,list)}));
if (commonNewExpression != null)
{
sb.append(" AND ").append(commonNewExpression);
thisList.addAll(commonNewParams);
}
performDelete(sb.toString(),thisList,null);
}
/** Return all target links of the specified source documents to their base state.
*@param jobID is the job identifier.
*@param sourceDocumentIDHashes are the hashes of the source (child) documents.
*/
public void restoreLinks(Long jobID, String[] sourceDocumentIDHashes)
throws ManifoldCFException
{
// Batch the updates into chunks of at most maxClause hashes per query.
int maxClause = maxClausesPerformRestoreLinks(jobID);
ArrayList list = new ArrayList();
int i = 0;
int k = 0;
while (i < sourceDocumentIDHashes.length)
{
if (k == maxClause)
{
performRestoreLinks(jobID,list);
list.clear();
k = 0;
}
list.add(sourceDocumentIDHashes[i++]);
k++;
}
if (k > 0)
performRestoreLinks(jobID,list);
noteModifications(0,sourceDocumentIDHashes.length,0);
}
/** Calculate the maximum number of document hashes per restoreLinks query. */
protected int maxClausesPerformRestoreLinks(Long jobID)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID)});
}
/** Set "new"/"existing" links back to base state and clear the process id,
* for one batch of child document hashes. */
protected void performRestoreLinks(Long jobID, ArrayList list)
throws ManifoldCFException
{
HashMap map = new HashMap();
map.put(newField,statusToString(LINKSTATUS_BASE));
map.put(processIDField,null);
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList newList = new ArrayList();
sb.append(buildConjunctionClause(newList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(childIDHashField,list)})).append(" AND ")
.append(newField).append(" IN (?,?)");
newList.add(statusToString(LINKSTATUS_EXISTING));
newList.add(statusToString(LINKSTATUS_NEW));
performUpdate(map,sb.toString(),newList,null);
}
/** Get document's children.
*@param jobID is the job identifier.
*@param parentIDHash is the hash of the parent document identifier.
*@return rows that contain the children.  Column names are 'linktype','childidentifier'.
*/
public IResultSet getDocumentChildren(Long jobID, String parentIDHash)
throws ManifoldCFException
{
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(parentIDHashField,parentIDHash)});
return performQuery("SELECT "+linkTypeField+" AS linktype,"+childIDHashField+" AS childidentifier FROM "+
getTableName()+" WHERE "+query,list,null,null);
}
/** Get document's parents.
*@param jobID is the job identifier.
*@param childIDHash is the hash of the child document identifier.
*@return a set of document identifier hashes that constitute parents of the specified identifier.
*/
public String[] getDocumentUniqueParents(Long jobID, String childIDHash)
throws ManifoldCFException
{
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(childIDHashField,childIDHash)});
IResultSet set = performQuery("SELECT DISTINCT "+parentIDHashField+" FROM "+
getTableName()+" WHERE "+query,list,null,null);
String[] rval = new String[set.getRowCount()];
int i = 0;
while (i < rval.length)
{
IResultRow row = set.getRow(i);
rval[i++] = (String)row.getValue(parentIDHashField);
}
return rval;
}
/** Convert string to link status.
* Note: throws NullPointerException if the string is not one of "B", "N", or "E". */
public static int stringToStatus(String status)
{
Integer value = (Integer)linkstatusMap.get(status);
return value.intValue();
}
/** Convert link status to string.  Returns null for an unknown status value. */
public static String statusToString(int status)
{
switch (status)
{
case LINKSTATUS_BASE:
return "B";
case LINKSTATUS_NEW:
return "N";
case LINKSTATUS_EXISTING:
return "E";
default:
return null;
}
}
// This class filters an ordered resultset to return only the duplicates.
// It assumes the result set is ordered by (jobid,linktype,parentidhash,childidhash),
// so that duplicate tuples arrive on adjacent rows.
protected static class DuplicateFinder implements ILimitChecker
{
// Key fields of the previously-seen row; null prevJobID means "no row seen yet".
protected Long prevJobID = null;
protected String prevLinkType = null;
protected String prevParentIDHash = null;
protected String prevChildIDHash = null;
public DuplicateFinder()
{
}
/** See if this class can be legitimately compared against another of
* the same type.
*@return true if comparisons will ever return "true".
*/
public boolean doesCompareWork()
{
return false;
}
/** Create a duplicate of this class instance.  All current state should be preserved.
* NOTE: Since doesCompareWork() returns false, queries using this limit checker cannot
* be cached, and therefore duplicate() is never called from the query executor.
*@return the duplicate.
*/
public ILimitChecker duplicate()
{
DuplicateFinder df = new DuplicateFinder();
df.prevJobID = prevJobID;
df.prevLinkType = prevLinkType;
df.prevParentIDHash = prevParentIDHash;
df.prevChildIDHash = prevChildIDHash;
return df;
}
/** Find the hashcode for this class.  This will only ever be used if
* doesCompareWork() returns true.
*@return the hashcode.
*/
public int hashCode()
{
return 0;
}
/** Compare two objects and see if equal.  This will only ever be used
* if doesCompareWork() returns true.
*@param object is the object to compare against.
*@return true if equal.
*/
public boolean equals(Object object)
{
return false;
}
/** See if a result row should be included in the final result set.
* Returns true only when the row's key tuple equals the previous row's (i.e. it is a duplicate).
*@param row is the result row to check.
*@return true if it should be included, false otherwise.
*/
public boolean checkInclude(IResultRow row)
throws ManifoldCFException
{
Long jobID = (Long)row.getValue(jobIDField);
String linkType = (String)row.getValue(linkTypeField);
String parentIDHash = (String)row.getValue(parentIDHashField);
if (parentIDHash == null)
parentIDHash = "";
String childIDHash = (String)row.getValue(childIDHashField);
if (childIDHash == null)
childIDHash = "";
// If this is a duplicate, we want to keep it!
if (prevJobID != null && jobID.equals(prevJobID) && linkType.equals(prevLinkType) && parentIDHash.equals(prevParentIDHash) && childIDHash.equals(prevChildIDHash))
return true;
prevJobID = jobID;
prevLinkType = linkType;
prevParentIDHash = parentIDHash;
prevChildIDHash = childIDHash;
return false;
}
/** See if we should examine another row.
*@return true if we need to keep going, or false if we are done.
*/
public boolean checkContinue()
throws ManifoldCFException
{
return true;
}
}
}