blob: 4187de138887d88be6b9c72316728b4b57b491fe [file] [log] [blame]
package org.apache.manifoldcf.crawler.connectors.confluence;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.text.DateFormat;
import java.text.MessageFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang.StringUtils;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
import org.apache.manifoldcf.core.interfaces.IPasswordMapperActivity;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
import org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector;
import org.apache.manifoldcf.crawler.connectors.confluence.client.ConfluenceClient;
import org.apache.manifoldcf.crawler.connectors.confluence.model.Attachment;
import org.apache.manifoldcf.crawler.connectors.confluence.model.ConfluenceResponse;
import org.apache.manifoldcf.crawler.connectors.confluence.model.Page;
import org.apache.manifoldcf.crawler.connectors.confluence.util.ConfluenceUtil;
import org.apache.manifoldcf.crawler.interfaces.IExistingVersions;
import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
import org.apache.manifoldcf.crawler.interfaces.ISeedingActivity;
import org.apache.manifoldcf.crawler.system.Logging;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
* <p>
* Confluence Repository Connector class
* </p>
* <p>
* ManifoldCF Repository connector to deal with Confluence documents
* </p>
*
* @author Antonio David Perez Morales <adperezmorales@gmail.com>
*
*/
public class ConfluenceRepositoryConnector extends BaseRepositoryConnector {
/** Activity name returned by getActivitiesList() and recorded for each processed document. */
protected final static String ACTIVITY_READ = "read document";
/** Deny access token for default authority */
private final static String defaultAuthorityDenyToken = GLOBAL_DENY_TOKEN;
/*
* Prefix added to the server configuration parameter names when they are
* exchanged with the configuration forms and templates
*/
private static final String PARAMETER_PREFIX = "confluence_";
/* Configuration tabs (message keys resolved through Messages.getString) */
private static final String CONF_SERVER_TAB_PROPERTY = "ConfluenceRepositoryConnector.Server";
/* Specification tabs (message keys resolved through Messages.getString) */
private static final String CONF_SPACES_TAB_PROPERTY = "ConfluenceRepositoryConnector.Spaces";
private static final String CONF_PAGES_TAB_PROPERTY = "ConfluenceRepositoryConnector.Pages";
// Resource names (HTML pages and JavaScript) rendered through Velocity
// Template names for Confluence configuration
/**
* Forward to the javascript to check the configuration parameters
*/
private static final String EDIT_CONFIG_HEADER_FORWARD = "editConfiguration_conf.js";
/**
* Server tab template
*/
private static final String EDIT_CONFIG_FORWARD_SERVER = "editConfiguration_conf_server.html";
/**
* Forward to the HTML template to view the configuration parameters
*/
private static final String VIEW_CONFIG_FORWARD = "viewConfiguration_conf.html";
// Template names for Confluence job specification
/**
* Forward to the javascript to check the specification parameters for the
* job
*/
private static final String EDIT_SPEC_HEADER_FORWARD = "editSpecification_conf.js";
/**
* Forward to the template to edit the spaces for the job
*/
private static final String EDIT_SPEC_FORWARD_SPACES = "editSpecification_confSpaces.html";
/**
* Forward to the template to edit the pages configuration for the job
*/
private static final String EDIT_SPEC_FORWARD_CONF_PAGES = "editSpecification_confPages.html";
/**
* Forward to the template to view the specification parameters for the job
*/
private static final String VIEW_SPEC_FORWARD = "viewSpecification_conf.html";
/** Time (ms since epoch) at which the client was last created; -1 while no client session is tracked. */
protected long lastSessionFetch = -1L;
/** Age in milliseconds after which poll() closes and discards the client. */
protected static final long timeToRelease = 300000L;
/** Delay in milliseconds before a retry, used when building a ServiceInterruption for a server that appears down. */
protected final static long interruptionRetryTime = 5L * 60L * 1000L;
/* SLF4J logger; the connector also logs through Logging.connectors */
private Logger logger = LoggerFactory
.getLogger(ConfluenceRepositoryConnector.class);
/* Confluence instance parameters, populated by connect() and cleared by disconnect() */
protected String protocol = null;
protected String host = null;
protected String port = null;
protected String path = null;
protected String username = null;
protected String password = null;
/* Client used for every Confluence request; null while not connected */
protected ConfluenceClient confluenceClient = null;
/**
* <p>
* Default constructor
* </p>
*/
public ConfluenceRepositoryConnector() {
super();
}
/**
 * Replaces the Confluence client used by this connector. Intended mainly
 * for tests that need to inject a prepared client.
 *
 * @param confluenceClient
 *            the client to use for subsequent requests
 */
public void setConfluenceClient(final ConfluenceClient confluenceClient) {
    this.confluenceClient = confluenceClient;
}
/**
 * Lists the activity names this connector records.
 *
 * @return an array holding only {@code ACTIVITY_READ}
 */
@Override
public String[] getActivitiesList() {
    final String[] activityNames = { ACTIVITY_READ };
    return activityNames;
}
/**
 * Returns the bin names for a document. Every document is assigned to a
 * single bin named after the configured host.
 *
 * @param documentIdentifier
 *            the document identifier (not used)
 * @return a one-element array containing the configured host
 */
@Override
public String[] getBinNames(String documentIdentifier) {
    final String[] bins = { host };
    return bins;
}
/**
 * Close the connection. Call this before discarding the connection.
 * <p>
 * Closes the Confluence client (if any) so its resources are released,
 * resets the session timestamp so that a later {@code poll()} does not try
 * to close an already discarded client, and clears the connection
 * parameters.
 * </p>
 *
 * @throws ManifoldCFException
 *             declared by the connector contract
 */
@Override
public void disconnect() throws ManifoldCFException {
    if (confluenceClient != null) {
        // Release the client's resources instead of just dropping the reference
        confluenceClient.close();
        confluenceClient = null;
    }
    // No client is alive any more, so there is no session for poll() to expire
    lastSessionFetch = -1L;
    protocol = null;
    host = null;
    port = null;
    path = null;
    username = null;
    password = null;
}
/**
 * Stores the connection parameters and tries to create the Confluence
 * client.
 * <p>
 * A failure to initialize the client is logged and not propagated; the
 * client is then created lazily by the methods that need it.
 * </p>
 *
 * @param configParams
 *            the connection configuration
 */
@Override
public void connect(ConfigParams configParams) {
    super.connect(configParams);
    protocol = params.getParameter(ConfluenceConfiguration.Server.PROTOCOL);
    host = params.getParameter(ConfluenceConfiguration.Server.HOST);
    port = params.getParameter(ConfluenceConfiguration.Server.PORT);
    path = params.getParameter(ConfluenceConfiguration.Server.PATH);
    username = params.getParameter(ConfluenceConfiguration.Server.USERNAME);
    password = params
            .getObfuscatedParameter(ConfluenceConfiguration.Server.PASSWORD);
    try {
        initConfluenceClient();
    } catch (ManifoldCFException e) {
        // Pass the exception as the last argument so the stack trace goes
        // to the log instead of being printed to stderr.
        logger.debug(
                "Not possible to initialize Confluence client. Reason: {}",
                e.getMessage(), e);
    }
}
/**
 * Verifies that the Confluence instance can be reached, creating the
 * client first when necessary.
 *
 * @return the result of {@code super.check()} when the client reports the
 *         instance as reachable, otherwise a message describing the failure
 * @throws ManifoldCFException
 *             declared by the connector contract; failures raised while
 *             checking are reported through the returned message
 */
@Override
public String check() throws ManifoldCFException {
    try {
        if (!isConnected()) {
            initConfluenceClient();
        }
        final boolean reachable = confluenceClient.check();
        if (!reachable) {
            throw new ManifoldCFException(
                    "Confluence instance could not be reached");
        }
        return super.check();
    } catch (ServiceInterruption interruption) {
        return "Connection temporarily failed: " + interruption.getMessage();
    } catch (Exception failure) {
        // Covers ManifoldCFException as well: the reported text is the same
        return "Connection failed: " + failure.getMessage();
    }
}
/**
 * Creates the Confluence client from the configured parameters. Does
 * nothing when a client already exists.
 * <p>
 * Protocol and host are mandatory. When no port is configured, 80 is used
 * for the "http" protocol and 443 for anything else.
 * </p>
 *
 * @throws ManifoldCFException
 *             if the protocol or host is missing, or the port is not a
 *             valid integer
 */
protected void initConfluenceClient() throws ManifoldCFException {
    if (confluenceClient != null) {
        return;
    }
    if (StringUtils.isEmpty(protocol)) {
        throw new ManifoldCFException("Parameter "
                + ConfluenceConfiguration.Server.PROTOCOL
                + " required but not set");
    }
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Confluence protocol = '" + protocol
                + "'");
    }
    if (StringUtils.isEmpty(host)) {
        throw new ManifoldCFException("Parameter "
                + ConfluenceConfiguration.Server.HOST
                + " required but not set");
    }
    if (Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Confluence host = '" + host + "'");
        Logging.connectors.debug("Confluence port = '" + port + "'");
        // The path is optional, so it is logged but not validated
        Logging.connectors.debug("Confluence path = '" + path + "'");
        Logging.connectors.debug("Confluence username = '" + username
                + "'");
        // Parenthesized so the null test applies to the password itself and
        // not to the concatenated string; the password value is never logged.
        Logging.connectors.debug("Confluence password "
                + (password != null ? "set" : "not set"));
    }
    int portInt;
    if (port != null && port.length() > 0) {
        try {
            portInt = Integer.parseInt(port);
        } catch (NumberFormatException e) {
            throw new ManifoldCFException("Bad number: "
                    + e.getMessage(), e);
        }
    } else if (protocol.toLowerCase(Locale.ROOT).equals("http")) {
        portInt = 80;
    } else {
        portInt = 443;
    }
    /* Generating a client to perform Confluence requests */
    confluenceClient = new ConfluenceClient(protocol, host, portInt,
            path, username, password);
    // Remember when the client was created so poll() can expire it
    lastSessionFetch = System.currentTimeMillis();
}
/**
 * Tells whether this connector instance should be counted as connected,
 * which is the case whenever a Confluence client is present.
 *
 * @return true if the connector instance is actually connected.
 */
@Override
public boolean isConnected() {
    final boolean hasClient = confluenceClient != null;
    return hasClient;
}
/**
 * Releases the Confluence client once it is older than
 * {@code timeToRelease} milliseconds.
 * <p>
 * The client may already be gone while the session timestamp is still set
 * (for instance after {@code disconnect()}), so it is only closed when
 * present.
 * </p>
 *
 * @throws ManifoldCFException
 *             declared by the connector contract
 */
@Override
public void poll() throws ManifoldCFException {
    if (lastSessionFetch == -1L) {
        return;
    }
    long currentTime = System.currentTimeMillis();
    if (currentTime >= lastSessionFetch + timeToRelease) {
        if (confluenceClient != null) {
            confluenceClient.close();
            confluenceClient = null;
        }
        lastSessionFetch = -1L;
    }
}
/**
 * Returns the maximum number of documents requested at once; this
 * connector uses the value provided by the superclass unchanged.
 *
 * @return the superclass value
 */
@Override
public int getMaxDocumentRequest() {
    final int inherited = super.getMaxDocumentRequest();
    return inherited;
}
/**
 * Lists the relationship types recognized by this connector. None are
 * recognized, so the array is always empty.
 *
 * @return an empty array
 */
@Override
public String[] getRelationshipTypes() {
    return new String[0];
}
/**
 * Copies the server configuration into the given template map, under keys
 * made of {@code PARAMETER_PREFIX} plus the parameter name. Missing
 * parameters are replaced by their default values; a stored password is
 * replaced by the key obtained from the password mapper.
 *
 * @param serverMap
 *            the map to fill
 * @param mapper
 *            maps the stored password to a key
 * @param parameters
 *            the connection configuration to read from
 */
private void fillInServerConfigurationMap(Map<String, String> serverMap,
        IPasswordMapperActivity mapper, ConfigParams parameters) {
    final String protocolValue = parameters
            .getParameter(ConfluenceConfiguration.Server.PROTOCOL);
    final String hostValue = parameters
            .getParameter(ConfluenceConfiguration.Server.HOST);
    final String portValue = parameters
            .getParameter(ConfluenceConfiguration.Server.PORT);
    final String pathValue = parameters
            .getParameter(ConfluenceConfiguration.Server.PATH);
    final String usernameValue = parameters
            .getParameter(ConfluenceConfiguration.Server.USERNAME);
    final String storedPassword = parameters
            .getObfuscatedParameter(ConfluenceConfiguration.Server.PASSWORD);

    // The password is never exposed directly: a present value is mapped to a key
    final String passwordValue = storedPassword == null
            ? ConfluenceConfiguration.Server.PASSWORD_DEFAULT_VALUE
            : mapper.mapPasswordToKey(storedPassword);

    serverMap.put(PARAMETER_PREFIX + ConfluenceConfiguration.Server.PROTOCOL,
            protocolValue == null
                    ? ConfluenceConfiguration.Server.PROTOCOL_DEFAULT_VALUE
                    : protocolValue);
    serverMap.put(PARAMETER_PREFIX + ConfluenceConfiguration.Server.HOST,
            hostValue == null
                    ? ConfluenceConfiguration.Server.HOST_DEFAULT_VALUE
                    : hostValue);
    serverMap.put(PARAMETER_PREFIX + ConfluenceConfiguration.Server.PORT,
            portValue == null
                    ? ConfluenceConfiguration.Server.PORT_DEFAULT_VALUE
                    : portValue);
    serverMap.put(PARAMETER_PREFIX + ConfluenceConfiguration.Server.PATH,
            pathValue == null
                    ? ConfluenceConfiguration.Server.PATH_DEFAULT_VALUE
                    : pathValue);
    serverMap.put(PARAMETER_PREFIX + ConfluenceConfiguration.Server.USERNAME,
            usernameValue == null
                    ? ConfluenceConfiguration.Server.USERNAME_DEFAULT_VALUE
                    : usernameValue);
    serverMap.put(PARAMETER_PREFIX + ConfluenceConfiguration.Server.PASSWORD,
            passwordValue);
}
/**
 * Renders the read-only view of the connection configuration using the
 * {@code VIEW_CONFIG_FORWARD} template.
 *
 * @param threadContext
 *            the thread context (not used)
 * @param out
 *            the output to render to; also used as the password mapper
 * @param locale
 *            the locale used to select the template
 * @param parameters
 *            the connection configuration
 */
@Override
public void viewConfiguration(IThreadContext threadContext,
        IHTTPOutput out, Locale locale, ConfigParams parameters)
        throws ManifoldCFException, IOException {
    final Map<String, String> templateValues = new HashMap<String, String>();
    fillInServerConfigurationMap(templateValues, out, parameters);
    Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIG_FORWARD,
            templateValues, true);
}
/**
 * Registers the Server tab and renders the JavaScript header shared by all
 * configuration tabs.
 *
 * @param threadContext
 *            the thread context (not used)
 * @param out
 *            the output to render to; also used as the password mapper
 * @param locale
 *            the locale used for the tab name and the template
 * @param parameters
 *            the connection configuration
 * @param tabsArray
 *            the list of tab names, to which the Server tab is appended
 */
@Override
public void outputConfigurationHeader(IThreadContext threadContext,
        IHTTPOutput out, Locale locale, ConfigParams parameters,
        List<String> tabsArray) throws ManifoldCFException, IOException {
    final String serverTab = Messages.getString(locale, CONF_SERVER_TAB_PROPERTY);
    tabsArray.add(serverTab);

    final Map<String, String> templateValues = new HashMap<String, String>();
    fillInServerConfigurationMap(templateValues, out, parameters);

    // A single Velocity template provides the JavaScript for every tab
    Messages.outputResourceWithVelocity(out, locale,
            EDIT_CONFIG_HEADER_FORWARD, templateValues, true);
}
/**
 * Renders the body of the configuration editor (the Server tab template).
 *
 * @param threadContext
 *            the thread context (not used)
 * @param out
 *            the output to render to; also used as the password mapper
 * @param locale
 *            the locale used to select the template
 * @param parameters
 *            the connection configuration
 * @param tabName
 *            the name of the selected tab, exposed to the template as
 *            "TabName"
 */
@Override
public void outputConfigurationBody(IThreadContext threadContext,
        IHTTPOutput out, Locale locale, ConfigParams parameters,
        String tabName) throws ManifoldCFException, IOException {
    final Map<String, String> templateValues = new HashMap<String, String>();
    templateValues.put("TabName", tabName);
    fillInServerConfigurationMap(templateValues, out, parameters);
    Messages.outputResourceWithVelocity(out, locale,
            EDIT_CONFIG_FORWARD_SERVER, templateValues, true);
}
/**
 * Handles the post of the connection configuration form: every posted
 * server parameter is copied into the configuration. Parameters absent
 * from the post are left untouched. The password is posted as a key and
 * converted back to the actual password before being stored obfuscated.
 *
 * @param threadContext
 *            the thread context (not used)
 * @param variableContext
 *            the posted form values
 * @param parameters
 *            the configuration to update
 * @return always null, which signals that processing was successful
 */
@Override
public String processConfigurationPost(IThreadContext threadContext,
        IPostParameters variableContext, ConfigParams parameters)
        throws ManifoldCFException {
    // Plain (non-secret) parameters, handled in the same order as the form
    final String[] plainKeys = {
            ConfluenceConfiguration.Server.PROTOCOL,
            ConfluenceConfiguration.Server.HOST,
            ConfluenceConfiguration.Server.PORT,
            ConfluenceConfiguration.Server.PATH,
            ConfluenceConfiguration.Server.USERNAME };
    for (String key : plainKeys) {
        final String posted = variableContext
                .getParameter(PARAMETER_PREFIX + key);
        if (posted != null) {
            parameters.setParameter(key, posted);
        }
    }

    final String postedPasswordKey = variableContext
            .getParameter(PARAMETER_PREFIX
                    + ConfluenceConfiguration.Server.PASSWORD);
    if (postedPasswordKey != null) {
        parameters.setObfuscatedParameter(
                ConfluenceConfiguration.Server.PASSWORD,
                variableContext.mapKeyToPassword(postedPasswordKey));
    }

    /* null means process configuration has been successful */
    return null;
}
/**
 * <p>
 * Fill the configured spaces into the map, under the upper-cased name of
 * the spaces specification node
 * </p>
 *
 * @param newMap
 *            the template map to fill
 * @param cs
 *            the parsed job specification
 */
private void fillInConfSpacesSpecificationMap(Map<String, Object> newMap,
        ConfluenceSpecification cs) {
    // Locale.ROOT keeps the key independent of the JVM default locale
    newMap.put(ConfluenceConfiguration.Specification.SPACES
            .toUpperCase(Locale.ROOT), cs.getSpaces());
}
/**
 * <p>
 * Fill the pages configuration (whether attachments are processed) into
 * the map, under the upper-cased attribute key
 * </p>
 *
 * @param newMap
 *            the template map to fill
 * @param cs
 *            the parsed job specification
 */
private void fillInConfPagesSpecificationMap(Map<String, Object> newMap,
        ConfluenceSpecification cs) {
    // Locale.ROOT keeps the key independent of the JVM default locale
    newMap.put(
            ConfluenceConfiguration.Specification.PROCESS_ATTACHMENTS_ATTRIBUTE_KEY
                    .toUpperCase(Locale.ROOT), cs.isProcessAttachments());
}
/**
 * Renders the read-only view of the job specification (configured spaces
 * and pages settings) using the {@code VIEW_SPEC_FORWARD} template.
 *
 * @param out
 *            the output to render to
 * @param locale
 *            the locale used to select the template
 * @param ds
 *            the job specification
 * @param connectionSequenceNumber
 *            sequence number exposed to the template as "SeqNum"
 */
@Override
public void viewSpecification(IHTTPOutput out, Locale locale,
        Specification ds, int connectionSequenceNumber)
        throws ManifoldCFException, IOException {
    final Map<String, Object> templateValues = new HashMap<String, Object>();
    templateValues.put("SeqNum", Integer.toString(connectionSequenceNumber));

    final ConfluenceSpecification parsed = ConfluenceSpecification.from(ds);
    fillInConfSpacesSpecificationMap(templateValues, parsed);
    fillInConfPagesSpecificationMap(templateValues, parsed);

    Messages.outputResourceWithVelocity(out, locale, VIEW_SPEC_FORWARD,
            templateValues);
}
/**
 * Handles the post of the job specification form.
 * <p>
 * When the spaces table was posted (a "spacescount" value is present) the
 * stored spaces node is rebuilt from the posted rows: rows flagged
 * "Delete" are skipped, and a new row is appended when the "Add" operation
 * was requested. The pages node is always rebuilt, and the process
 * attachments attribute is stored only when a non-empty value was posted.
 * </p>
 *
 * @param variableContext
 *            the posted form values
 * @param locale
 *            the locale (not used)
 * @param ds
 *            the job specification to update
 * @param connectionSequenceNumber
 *            sequence number used to build the form field prefix
 * @return always null
 */
@Override
public String processSpecificationPost(IPostParameters variableContext,
        Locale locale, Specification ds, int connectionSequenceNumber)
        throws ManifoldCFException {
    final String prefix = "s" + connectionSequenceNumber + "_";

    final String postedRowCount = variableContext.getParameter(prefix
            + "spacescount");
    if (postedRowCount != null) {
        // Remove every previously stored spaces node; the index only
        // advances when nothing was removed at the current position
        for (int idx = 0; idx < ds.getChildCount();) {
            final SpecificationNode child = ds.getChild(idx);
            if (child.getType().equals(
                    ConfluenceConfiguration.Specification.SPACES)) {
                ds.removeChild(idx);
            } else {
                idx++;
            }
        }

        final SpecificationNode spacesNode = new SpecificationNode(
                ConfluenceConfiguration.Specification.SPACES);
        ds.addChild(ds.getChildCount(), spacesNode);

        final int rowCount = Integer.parseInt(postedRowCount);
        for (int row = 0; row < rowCount; row++) {
            final String rowSuffix = "_" + Integer.toString(row);
            final String rowOperation = variableContext.getParameter(prefix
                    + "spaceop" + rowSuffix);
            if ("Delete".equals(rowOperation)) {
                // Deleted rows are simply not carried over
                continue;
            }
            final String rowSpaceKey = variableContext.getParameter(prefix
                    + "space" + rowSuffix);
            final SpecificationNode rowNode = new SpecificationNode(
                    ConfluenceConfiguration.Specification.SPACE);
            rowNode.setAttribute(
                    ConfluenceConfiguration.Specification.SPACE_KEY_ATTRIBUTE,
                    rowSpaceKey);
            spacesNode.addChild(spacesNode.getChildCount(), rowNode);
        }

        final String tableOperation = variableContext.getParameter(prefix
                + "spaceop");
        if ("Add".equals(tableOperation)) {
            final String addedSpaceKey = variableContext
                    .getParameter(prefix + "space");
            final SpecificationNode addedNode = new SpecificationNode(
                    ConfluenceConfiguration.Specification.SPACE);
            addedNode.setAttribute(
                    ConfluenceConfiguration.Specification.SPACE_KEY_ATTRIBUTE,
                    addedSpaceKey);
            spacesNode.addChild(spacesNode.getChildCount(), addedNode);
        }
    }

    /* Rebuild the pages configuration from scratch */
    for (int idx = 0; idx < ds.getChildCount();) {
        final SpecificationNode child = ds.getChild(idx);
        if (child.getType().equals(
                ConfluenceConfiguration.Specification.PAGES)) {
            ds.removeChild(idx);
        } else {
            idx++;
        }
    }
    final SpecificationNode pagesNode = new SpecificationNode(
            ConfluenceConfiguration.Specification.PAGES);
    ds.addChild(ds.getChildCount(), pagesNode);

    final String postedProcessAttachments = variableContext
            .getParameter(prefix
                    + ConfluenceConfiguration.Specification.PROCESS_ATTACHMENTS_ATTRIBUTE_KEY);
    if (postedProcessAttachments != null
            && postedProcessAttachments.length() > 0) {
        pagesNode.setAttribute(
                ConfluenceConfiguration.Specification.PROCESS_ATTACHMENTS_ATTRIBUTE_KEY,
                postedProcessAttachments);
    }
    return null;
}
/**
 * Renders the body of the job specification editor: the spaces template
 * followed by the pages template, both fed with the same values.
 *
 * @param out
 *            the output to render to
 * @param locale
 *            the locale used to select the templates
 * @param ds
 *            the job specification
 * @param connectionSequenceNumber
 *            sequence number exposed to the templates as "SeqNum"
 * @param actualSequenceNumber
 *            sequence number exposed to the templates as "SelectedNum"
 * @param tabName
 *            the selected tab, exposed to the templates as "TabName"
 */
@Override
public void outputSpecificationBody(IHTTPOutput out, Locale locale,
        Specification ds, int connectionSequenceNumber,
        int actualSequenceNumber, String tabName)
        throws ManifoldCFException, IOException {
    final Map<String, Object> templateValues = new HashMap<String, Object>();
    templateValues.put("TabName", tabName);
    templateValues.put("SeqNum", Integer.toString(connectionSequenceNumber));
    templateValues.put("SelectedNum", Integer.toString(actualSequenceNumber));

    final ConfluenceSpecification parsed = ConfluenceSpecification.from(ds);
    fillInConfSpacesSpecificationMap(templateValues, parsed);
    fillInConfPagesSpecificationMap(templateValues, parsed);

    // Spaces tab first, then the pages tab
    Messages.outputResourceWithVelocity(out, locale,
            EDIT_SPEC_FORWARD_SPACES, templateValues);
    Messages.outputResourceWithVelocity(out, locale,
            EDIT_SPEC_FORWARD_CONF_PAGES, templateValues);
}
/**
 * Registers the Spaces and Pages tabs and renders the JavaScript header of
 * the job specification editor.
 *
 * @param out
 *            the output to render to
 * @param locale
 *            the locale used for the tab names and the template
 * @param ds
 *            the job specification (not used)
 * @param connectionSequenceNumber
 *            sequence number exposed to the template as "SeqNum"
 * @param tabsArray
 *            the list of tab names, to which both tabs are appended
 */
@Override
public void outputSpecificationHeader(IHTTPOutput out, Locale locale,
        Specification ds, int connectionSequenceNumber,
        List<String> tabsArray) throws ManifoldCFException, IOException {
    final String spacesTab = Messages.getString(locale, CONF_SPACES_TAB_PROPERTY);
    tabsArray.add(spacesTab);
    final String pagesTab = Messages.getString(locale, CONF_PAGES_TAB_PROPERTY);
    tabsArray.add(pagesTab);

    final Map<String, Object> templateValues = new HashMap<String, Object>();
    templateValues.put("SeqNum", Integer.toString(connectionSequenceNumber));
    Messages.outputResourceWithVelocity(out, locale,
            EDIT_SPEC_HEADER_FORWARD, templateValues);
}
/**
 * Adds the seed documents of the job: the pages of every configured space,
 * or of all spaces when none is configured.
 * <p>
 * Delta seeding is deliberately not used: Confluence can only be queried
 * with a start offset and a limit, not by date or in a stable order, so an
 * offset remembered from a previous run could skip documents once an
 * already indexed document has been deleted.
 * </p>
 *
 * @param activities
 *            receives the seed document identifiers
 * @param spec
 *            the job specification
 * @param lastSeedVersion
 *            the version string returned by the previous seeding run
 * @param seedTime
 *            the seeding time
 * @param jobMode
 *            the job mode
 * @return an empty version string
 * @throws ManifoldCFException
 *             if the client cannot be initialized or seeding fails with a
 *             framework error
 * @throws ServiceInterruption
 *             if Confluence appears to be down
 */
public String addSeedDocuments(ISeedingActivity activities,
        Specification spec, String lastSeedVersion, long seedTime,
        int jobMode) throws ManifoldCFException, ServiceInterruption {
    if (!isConnected()) {
        initConfluenceClient();
    }
    try {
        ConfluenceSpecification confluenceSpecification = ConfluenceSpecification
                .from(spec);
        List<String> spaceKeys = confluenceSpecification.getSpaces();
        if (spaceKeys.isEmpty()) {
            logger.info("No spaces configured. Processing all spaces");
            addSeedDocumentsForSpace(Optional.<String> absent(),
                    activities, confluenceSpecification, lastSeedVersion,
                    seedTime, jobMode);
        } else {
            for (String space : spaceKeys) {
                logger.info("Processing configured space {}", space);
                addSeedDocumentsForSpace(Optional.<String> of(space),
                        activities, confluenceSpecification,
                        lastSeedVersion, seedTime, jobMode);
            }
        }
        return "";
    } catch (ManifoldCFException e) {
        // Framework errors (including interruptions) must reach the caller
        // unchanged instead of being reported as a server outage
        throw e;
    } catch (ServiceInterruption e) {
        // Already a retry request: do not wrap it a second time
        throw e;
    } catch (Exception e) {
        handleConfluenceDownException(e, "seeding");
        // Not reached: the handler always throws
        return null;
    }
}
/**
 * <p>
 * Add seed documents for a given optional space, fetching the pages in
 * batches until the response reports the last batch. When attachment
 * processing is enabled the attachments of every page are seeded too.
 * </p>
 *
 * @param space
 *            the space to seed, or absent to seed all spaces
 * @param activities
 *            receives the seed document identifiers
 * @param confluenceSpec
 *            the parsed job specification
 * @param lastSeedVersion
 *            the previous seed version (not used)
 * @param seedTime
 *            the seeding time (not used)
 * @param jobMode
 *            the job mode (not used)
 * @throws ServiceInterruption
 *             if Confluence appears to be down
 * @throws ManifoldCFException
 *             if the framework reports an error while seeding
 */
private void addSeedDocumentsForSpace(Optional<String> space,
        ISeedingActivity activities,
        ConfluenceSpecification confluenceSpec, String lastSeedVersion,
        long seedTime, int jobMode) throws ManifoldCFException,
        ServiceInterruption {
    long lastStart = 0;
    long defaultSize = 50;
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        String spaceDesc = space.isPresent() ? "space with key "
                + space.get() : "all the spaces";
        Logging.connectors.debug(MessageFormat.format(
                "Starting from {0} and size {1} for {2}", new Object[] {
                        lastStart, defaultSize, spaceDesc }));
    }
    try {
        boolean isLast = true;
        do {
            final ConfluenceResponse<Page> response = confluenceClient.getPages(
                    (int) lastStart, (int) defaultSize, space);
            int count = 0;
            for (Page page : response.getResults()) {
                activities.addSeedDocument(page.getId());
                if (confluenceSpec.isProcessAttachments()) {
                    processSeedAttachments(page, activities);
                }
                count++;
            }
            if (Logging.connectors != null
                    && Logging.connectors.isDebugEnabled())
                Logging.connectors.debug(MessageFormat.format(
                        "Fetched and added {0} seed documents",
                        new Object[] { Integer.valueOf(count) }));
            // The next batch starts right after the documents just read
            lastStart += count;
            isLast = response.isLast();
            if (Logging.connectors != null
                    && Logging.connectors.isDebugEnabled())
                Logging.connectors.debug(MessageFormat.format(
                        "New start {0} and size {1}", new Object[] {
                                lastStart, defaultSize }));
        } while (!isLast);
    } catch (ManifoldCFException e) {
        // Framework errors (including interruptions) must reach the caller
        // unchanged instead of being reported as a server outage
        throw e;
    } catch (ServiceInterruption e) {
        // Already a retry request: do not wrap it a second time
        throw e;
    } catch (Exception e) {
        handleConfluenceDownException(e, "seeding");
    }
}
/**
 * <p>
 * Process seed attachments for the given page, fetching them in batches
 * until the response reports the last batch. Each attachment is seeded
 * under an identifier built from the attachment id and the page id.
 * </p>
 *
 * @param page
 *            the page whose attachments are seeded
 * @param activities
 *            receives the seed document identifiers
 * @throws ServiceInterruption
 *             if Confluence appears to be down
 * @throws ManifoldCFException
 *             if the framework reports an error while seeding
 */
private void processSeedAttachments(Page page, ISeedingActivity activities)
        throws ManifoldCFException, ServiceInterruption {
    long lastStart = 0;
    long defaultSize = 50;
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        // MessageFormat needs indexed placeholders; "{}" makes it throw
        // IllegalArgumentException
        Logging.connectors
                .debug(MessageFormat
                        .format("Processing page {0} attachments starting from {1} and size {2}",
                                new Object[] { page.getId(), lastStart,
                                        defaultSize }));
    }
    try {
        boolean isLast = true;
        do {
            final ConfluenceResponse<Attachment> response = confluenceClient
                    .getPageAttachments(page.getId(), (int) lastStart,
                            (int) defaultSize);
            int count = 0;
            for (Page resultPage : response.getResults()) {
                activities.addSeedDocument(ConfluenceUtil
                        .generateRepositoryDocumentIdentifier(
                                resultPage.getId(), page.getId()));
                count++;
            }
            if (Logging.connectors != null
                    && Logging.connectors.isDebugEnabled())
                Logging.connectors
                        .debug(MessageFormat
                                .format("Fetched and added {0} seed document attachments for page {1}",
                                        new Object[] { Integer.valueOf(count),
                                                page.getId() }));
            // The next batch starts right after the attachments just read
            lastStart += count;
            isLast = response.isLast();
            if (Logging.connectors != null
                    && Logging.connectors.isDebugEnabled())
                Logging.connectors.debug(MessageFormat.format(
                        "New start {0} and size {1}", new Object[] {
                                lastStart, defaultSize }));
        } while (!isLast);
    } catch (ManifoldCFException e) {
        // Framework errors (including interruptions) must reach the caller
        // unchanged instead of being reported as a server outage
        throw e;
    } catch (ServiceInterruption e) {
        // Already a retry request: do not wrap it a second time
        throw e;
    } catch (Exception e) {
        handleConfluenceDownException(e, "seeding");
    }
}
/**
 * Reports a failure as a server outage: logs a warning and always throws a
 * {@code ServiceInterruption} asking for a retry after
 * {@code interruptionRetryTime} milliseconds.
 *
 * @param e
 *            the failure that was caught
 * @param context
 *            short description of what was being done, used in the message
 * @throws ManifoldCFException
 *             declared for callers; not thrown by this method itself
 * @throws ServiceInterruption
 *             always
 */
protected static void handleConfluenceDownException(Exception e,
        String context) throws ManifoldCFException, ServiceInterruption {
    final long now = System.currentTimeMillis();
    // Server doesn't appear to be up. Try for a brief time then give up.
    final StringBuilder text = new StringBuilder("Server appears down during ");
    text.append(context).append(": ").append(e.getMessage());
    final String message = text.toString();
    Logging.connectors.warn(message, e);
    final long retryAt = now + interruptionRetryTime;
    throw new ServiceInterruption(message, e, retryAt, -1L, 3, true);
}
/**
 * Processes the given documents: each identifier is handled either as an
 * attachment or as a page, and one activity record is written per
 * document.
 * <p>
 * The activity record is built from values collected while processing, so
 * a failure is recorded with its own result code instead of causing a
 * second failure while logging. Framework exceptions and service
 * interruptions are propagated unchanged so that the framework can apply
 * its normal abort/retry handling.
 * </p>
 *
 * @param documentIdentifiers
 *            the identifiers of the documents to process
 * @param statuses
 *            gives access to the currently indexed version strings
 * @param spec
 *            the job specification (not used)
 * @param activities
 *            the framework callbacks used to index and record activity
 * @param jobMode
 *            the job mode (not used)
 * @param usesDefaultAuthority
 *            whether the default authority is used (not used)
 * @throws ManifoldCFException
 *             on a framework or repository error
 * @throws ServiceInterruption
 *             when processing should be retried later
 */
@Override
public void processDocuments(String[] documentIdentifiers,
        IExistingVersions statuses, Specification spec,
        IProcessActivity activities, int jobMode,
        boolean usesDefaultAuthority) throws ManifoldCFException,
        ServiceInterruption {
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled())
        Logging.connectors
                .debug("Process Confluence documents: Inside processDocuments");
    for (int i = 0; i < documentIdentifiers.length; i++) {
        String pageId = documentIdentifiers[i];
        String version = statuses.getIndexedVersionString(pageId);
        long startTime = System.currentTimeMillis();
        String errorCode = "OK";
        String errorDesc = StringUtils.EMPTY;
        // Stays null when the document could not be processed at all
        Long fileSize = null;
        boolean doLog = true;
        try {
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                Logging.connectors
                        .debug("Confluence: Processing document identifier '"
                                + pageId + "'");
            }
            /* Ensure Confluence client is connected */
            if (!isConnected()) {
                initConfluenceClient();
            }
            ProcessResult pResult;
            if (ConfluenceUtil.isAttachment(pageId)) {
                pResult = processPageAsAttachment(pageId, version,
                        activities, doLog);
            } else {
                pResult = processPage(pageId, version, activities, doLog,
                        Maps.<String, String> newHashMap());
            }
            if (pResult != null) {
                fileSize = Long.valueOf(pResult.fileSize);
                if (pResult.errorCode != null && !pResult.errorCode.isEmpty()) {
                    errorCode = pResult.errorCode;
                    errorDesc = pResult.errorDescription;
                }
            }
        } catch (ManifoldCFException e) {
            if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
                // An interrupted thread is not a document result
                doLog = false;
            } else {
                errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                errorDesc = e.getMessage();
            }
            throw e;
        } catch (ServiceInterruption e) {
            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = e.getMessage();
            throw e;
        } catch (IOException ioe) {
            errorCode = ioe.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = ioe.getMessage();
            handleIOException(ioe);
        } catch (Exception e) {
            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = e.getMessage();
            handleException(e);
        } finally {
            if (doLog) {
                activities.recordActivity(Long.valueOf(startTime),
                        ACTIVITY_READ, fileSize, pageId, errorCode,
                        errorDesc, null);
            }
        }
    }
}
/**
 * <p>
 * Fetches the page with the given identifier and processes it
 * </p>
 *
 * @param pageId
 *            The identifier of the page, also used as the ManifoldCF
 *            document identifier
 * @param version
 *            The version of the page
 * @param activities
 *            the framework callbacks
 * @param doLog
 *            passed through to the internal processing
 * @param extraProperties
 *            additional fields to add to the indexed document
 * @return the outcome of processing the page
 * @throws ManifoldCFException
 * @throws IOException
 * @throws ServiceInterruption
 */
private ProcessResult processPage(String pageId, String version,
        IProcessActivity activities, boolean doLog,
        Map<String, String> extraProperties) throws ManifoldCFException,
        ServiceInterruption, IOException {
    return processPageInternal(confluenceClient.getPage(pageId), pageId,
            version, activities, doLog, extraProperties);
}
/**
 * <p>
 * Fetches the attachment referenced by the given identifier and processes
 * it, adding the identifier of the owning page as the "attachedBy" field
 * </p>
 *
 * @param pageId
 *            The document identifier of the attachment, from which the
 *            attachment id and the page id are extracted
 * @param version
 *            The version of the page
 * @param activities
 *            the framework callbacks
 * @param doLog
 *            passed through to the internal processing
 * @return the outcome of processing the attachment
 * @throws ManifoldCFException
 * @throws IOException
 * @throws ServiceInterruption
 */
private ProcessResult processPageAsAttachment(String pageId, String version,
        IProcessActivity activities, boolean doLog)
        throws ManifoldCFException, ServiceInterruption, IOException {
    // First element: attachment id, second element: id of the owning page
    final String[] idParts = ConfluenceUtil.getAttachmentAndPageId(pageId);
    final Attachment fetched = confluenceClient.getAttachment(idParts[0]);

    final Map<String, String> attachmentFields = new HashMap<String, String>();
    attachmentFields.put("attachedBy", idParts[1]);

    return processPageInternal(fetched, pageId, version, activities, doLog,
            attachmentFields);
}
/**
 * <p>
 * Processes an already fetched page or attachment: deletes it when it has
 * no content, retains it when its version did not change, excludes it when
 * the framework rejects its length, mime type, date or URL, and otherwise
 * ingests it
 * </p>
 *
 * @param page
 *            The fetched page or attachment
 * @param manifoldDocumentIdentifier
 *            The identifier under which the document is known to
 *            ManifoldCF; used for every framework call about the document
 * @param version
 *            The version of the page
 * @param activities
 *            the framework callbacks
 * @param doLog
 *            not used by this method
 * @param extraProperties
 *            additional fields to add to the indexed document
 * @return the size of the document together with a result code and
 *         description; code and description are null after a successful
 *         ingestion
 * @throws ManifoldCFException
 * @throws IOException
 * @throws ServiceInterruption
 */
private ProcessResult processPageInternal(Page page,
        String manifoldDocumentIdentifier, String version,
        IProcessActivity activities, boolean doLog,
        Map<String, String> extraProperties) throws ManifoldCFException,
        ServiceInterruption, IOException {
    /* Remove page if it has no content */
    /*
     * Page does not have content if there was an error trying to get the
     * page
     */
    if (!page.hasContent()) {
        activities.deleteDocument(manifoldDocumentIdentifier);
        return new ProcessResult(page.getLength(), "DELETED", "");
    }
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Confluence: This content exists: "
                + page.getId());
    }
    Date createdDate = page.getCreatedDate();
    Date lastModified = page.getLastModifiedDate();
    /*
     * The formatted modification date is the version string. Without a
     * modification date an empty version is used instead of failing.
     * NOTE(review): an empty version string is assumed to make the
     * framework always re-index the document - confirm.
     */
    String lastVersion = lastModified != null
            ? DateFormat.getDateTimeInstance().format(lastModified)
            : StringUtils.EMPTY;
    /*
     * Retain page in Manifold because it has not changed from last time
     * This is needed to keep the identifier in Manifold data, because by
     * default if a document is not retained nor ingested, it will be
     * deleted by the framework
     */
    if (!activities.checkDocumentNeedsReindexing(manifoldDocumentIdentifier, lastVersion)) {
        return new ProcessResult(page.getLength(), "RETAINED", "");
    }
    /*
     * Exclusions are reported under the ManifoldCF identifier: for
     * attachments it differs from the Confluence id of the attachment
     */
    if (!activities.checkLengthIndexable(page.getLength())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        String errorCode = IProcessActivity.EXCLUDED_LENGTH;
        String errorDesc = "Excluding document because of length (" + page.getLength() + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkMimeTypeIndexable(page.getMediaType())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        String errorCode = IProcessActivity.EXCLUDED_MIMETYPE;
        String errorDesc = "Excluding document because of mime type (" + page.getMediaType() + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkDateIndexable(lastModified)) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        String errorCode = IProcessActivity.EXCLUDED_DATE;
        String errorDesc = "Excluding document because of date (" + lastModified + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkURLIndexable(page.getWebUrl())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        String errorCode = IProcessActivity.EXCLUDED_URL;
        String errorDesc = "Excluding document because of URL ('" + page.getWebUrl() + "')";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    /* Add repository document information */
    RepositoryDocument rd = new RepositoryDocument();
    rd.setMimeType(page.getMediaType());
    if (createdDate != null)
        rd.setCreatedDate(createdDate);
    if (lastModified != null)
        rd.setModifiedDate(lastModified);
    rd.setIndexingDate(new Date());
    /* Adding Page Metadata */
    Map<String, Object> pageMetadata = page.getMetadataAsMap();
    for (Entry<String, Object> entry : pageMetadata.entrySet()) {
        if (entry.getValue() instanceof List) {
            List<?> list = (List<?>) entry.getValue();
            rd.addField(entry.getKey(), list.toArray(new String[list.size()]));
        } else {
            rd.addField(entry.getKey(), entry.getValue().toString());
        }
    }
    /* Adding extra properties */
    for (Entry<String, String> entry : extraProperties.entrySet()) {
        rd.addField(entry.getKey(), entry.getValue());
    }
    String documentURI = page.getWebUrl();
    /* Set repository document ACLs */
    rd.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,
            new String[] { page.getSpace() });
    rd.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,
            new String[] { defaultAuthorityDenyToken });
    rd.setBinary(page.getContentStream(), page.getLength());
    rd.addField("size", String.valueOf(page.getLength()));
    /* Ingest document */
    activities.ingestDocumentWithException(manifoldDocumentIdentifier,
            lastVersion, documentURI, rd);
    return new ProcessResult(page.getLength(), null, null);
}
/**
 * <p>
 * Translates an IO exception into a framework exception. An interrupted IO
 * operation that is not a socket timeout is reported as an interruption;
 * anything else is logged and reported as a service interruption so that
 * the document is tried again later
 * </p>
 *
 * @param e
 *            The Exception
 * @throws ManifoldCFException
 *             when the IO operation was interrupted
 * @throws ServiceInterruption
 *             for every other IO failure
 */
private static void handleIOException(IOException e)
        throws ManifoldCFException, ServiceInterruption {
    final boolean socketTimeout = e instanceof java.net.SocketTimeoutException;
    if (e instanceof InterruptedIOException && !socketTimeout) {
        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
                ManifoldCFException.INTERRUPTED);
    }
    Logging.connectors.warn("IO exception: " + e.getMessage(), e);
    final long now = System.currentTimeMillis();
    // Retry in five minutes, and keep retrying for up to three hours
    final long retryAt = now + 300000L;
    final long giveUpAt = now + 3 * 60 * 60000L;
    throw new ServiceInterruption("IO exception: " + e.getMessage(), e,
            retryAt, giveUpAt, -1, false);
}
/**
 * <p>
 * Logs a general exception as a warning and rethrows it wrapped in a
 * {@code ManifoldCFException} flagged as a repository connection error
 * </p>
 * <p>
 * This method never returns normally
 * </p>
 *
 * @param e
 *            The exception to log and wrap; it is kept as the cause
 * @throws ManifoldCFException
 *             always, with code {@code REPOSITORY_CONNECTION_ERROR}
 */
private static void handleException(Exception e) throws ManifoldCFException {
  // The same text is used for the log entry and for the rethrown exception.
  final String message = "Exception: " + e.getMessage();
  Logging.connectors.warn(message, e);
  throw new ManifoldCFException(message, e,
      ManifoldCFException.REPOSITORY_CONNECTION_ERROR);
}
private class ProcessResult{
private long fileSize;
private String errorCode;
private String errorDescription;
private ProcessResult(long fileSize, String errorCode, String errorDescription){
this.fileSize = fileSize;
this.errorCode = errorCode;
this.errorDescription = errorDescription;
}
}
/**
 * <p>
 * Object-form view of the job specification: the configured space keys
 * and the flag telling whether attachments should be processed
 * </p>
 * <p>
 * Instances are built with {@link #from(Specification)}
 * </p>
 *
 * @author Antonio David Perez Morales <adperezmorales@gmail.com>
 *
 */
private static class ConfluenceSpecification {
  private List<String> spaces;
  private Boolean processAttachments = false;

  /**
   * <p>
   * Parses the given specification by walking its top-level children:
   * every node of type SPACES contributes the keys of its SPACE children,
   * and every node of type PAGES sets the process-attachments flag (when
   * several are present, the last one wins)
   * </p>
   *
   * @param spec
   *            the specification to parse
   * @return a new {@code ConfluenceSpecification} reflecting {@code spec}
   */
  public static ConfluenceSpecification from(Specification spec) {
    ConfluenceSpecification parsed = new ConfluenceSpecification();
    parsed.spaces = Lists.newArrayList();

    final int topLevelCount = spec.getChildCount();
    for (int idx = 0; idx < topLevelCount; idx++) {
      SpecificationNode node = spec.getChild(idx);

      if (node.getType().equals(ConfluenceConfiguration.Specification.SPACES)) {
        collectSpaceKeys(node, parsed.spaces);
        continue;
      }

      if (node.getType().equals(ConfluenceConfiguration.Specification.PAGES)) {
        String flag = node.getAttributeValue(
            ConfluenceConfiguration.Specification.PROCESS_ATTACHMENTS_ATTRIBUTE_KEY);
        // Boolean.valueOf(String) yields FALSE for null or any text other
        // than "true" (ignoring case).
        parsed.processAttachments = Boolean.valueOf(flag);
      }
    }
    return parsed;
  }

  /**
   * <p>
   * Appends the space key attribute of every SPACE child of the given
   * node to {@code target}, in child order
   * </p>
   *
   * @param spacesNode
   *            a node of type SPACES
   * @param target
   *            the list receiving the space keys
   */
  private static void collectSpaceKeys(SpecificationNode spacesNode, List<String> target) {
    final int childCount = spacesNode.getChildCount();
    for (int idx = 0; idx < childCount; idx++) {
      SpecificationNode child = spacesNode.getChild(idx);
      if (!child.getType().equals(ConfluenceConfiguration.Specification.SPACE)) {
        continue;
      }
      String spaceKey = child.getAttributeValue(
          ConfluenceConfiguration.Specification.SPACE_KEY_ATTRIBUTE);
      target.add(spaceKey);
    }
  }

  /**
   * <p>
   * Tells whether attachments should be processed
   * </p>
   *
   * @return a {@code Boolean} indicating if the attachments should be
   *         processed or not; {@code false} unless a PAGES node set it
   */
  public Boolean isProcessAttachments() {
    return this.processAttachments;
  }

  /**
   * <p>
   * Returns the configured space keys. After {@link #from(Specification)}
   * the list is empty when no space was configured, which is documented
   * as meaning that all spaces should be processed
   * </p>
   *
   * @return a {@code List<String>} of configured spaces (the internal
   *         list itself, not a copy)
   */
  public List<String> getSpaces() {
    return this.spaces;
  }
}
}