blob: ccbdb1a2de2e6cfbb65480f9e30ebddc8cf25a13 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import java.io.File;
import java.io.InputStream;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.loader.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.security.AuthorizationContext;
import org.apache.solr.security.PermissionNameProvider;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.tika.config.TikaConfig;
/**
 * Request handler for rich documents (PDF, Word, or any other file format that Tika handles)
 * whose text has to be extracted from the document before it can be processed further.
 *
 * <p>Two optional init args are read in {@link #inform(SolrCore)}:
 *
 * <ul>
 *   <li>{@value #CONFIG_LOCATION}: location of a Tika configuration file. An absolute path is
 *       read directly from the file system; any other value is opened through the core's
 *       resource loader. When absent, the {@code solr-default-tika-config.xml} resource is
 *       loaded from the resource loader's class loader.
 *   <li>{@value #PARSE_CONTEXT_CONFIG}: location of a parse context configuration, opened
 *       through the core's resource loader. When absent, an empty {@code ParseContextConfig}
 *       is used.
 * </ul>
 */
public class ExtractingRequestHandler extends ContentStreamHandlerBase
    implements SolrCoreAware, PermissionNameProvider {

  /** Init arg naming the parse context configuration resource. */
  public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";

  /** Init arg naming the Tika configuration file or resource. */
  public static final String CONFIG_LOCATION = "tika.config";

  /** Classpath resource used when {@link #CONFIG_LOCATION} is not configured. */
  private static final String DEFAULT_TIKA_CONFIG_RESOURCE = "solr-default-tika-config.xml";

  /** Tika configuration; assigned in {@link #inform(SolrCore)}. */
  protected TikaConfig config;

  /** Parse context configuration; assigned in {@link #inform(SolrCore)}. */
  protected ParseContextConfig parseContextConfig;

  /** Content handler factory produced by {@link #createFactory()} during {@link #inform(SolrCore)}. */
  protected SolrContentHandlerFactory factory;

  /**
   * Reports the permission required to use this handler.
   *
   * @param request the authorization context of the incoming request (not inspected)
   * @return always {@link PermissionNameProvider.Name#READ_PERM}
   */
  @Override
  public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) {
    // NOTE(review): this handler describes itself as "Add/Update Rich document", yet it only
    // asks for the read permission. Confirm this is intentional before changing it, since a
    // different value would alter authorization behavior for existing deployments.
    return PermissionNameProvider.Name.READ_PERM;
  }

  /**
   * Stores the handler's init args by delegating to the superclass.
   *
   * @param args the init args declared for this handler
   */
  @Override
  public void init(@SuppressWarnings({"rawtypes"}) NamedList args) {
    super.init(args);
  }

  /**
   * Loads the Tika configuration and the parse context configuration for the given core, then
   * creates the content handler factory.
   *
   * @param core the core this handler belongs to; its resource loader is used to locate
   *     configuration resources
   * @throws SolrException with {@link ErrorCode#SERVER_ERROR} if either configuration cannot be
   *     loaded; the underlying failure is attached as the cause
   */
  @Override
  public void inform(SolrCore core) {
    config = loadTikaConfig(core);
    parseContextConfig = loadParseContextConfig(core);
    factory = createFactory();
  }

  /** Resolves and parses the Tika configuration selected by the {@link #CONFIG_LOCATION} init arg. */
  private TikaConfig loadTikaConfig(SolrCore core) {
    try {
      String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
      if (tikaConfigLoc == null) { // default
        ClassLoader classLoader = core.getResourceLoader().getClassLoader();
        try (InputStream is = classLoader.getResourceAsStream(DEFAULT_TIKA_CONFIG_RESOURCE)) {
          // getResourceAsStream signals "not found" with null rather than an exception; fail
          // here with a message that names the missing resource instead of passing null on.
          if (is == null) {
            throw new IllegalStateException(
                "Default Tika config resource '"
                    + DEFAULT_TIKA_CONFIG_RESOURCE
                    + "' was not found on the classpath");
          }
          return new TikaConfig(is);
        }
      }
      File configFile = new File(tikaConfigLoc);
      if (configFile.isAbsolute()) {
        return new TikaConfig(configFile);
      }
      // Relative location: resolved by the core's resource loader (in conf/).
      try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) {
        return new TikaConfig(is);
      }
    } catch (Exception e) {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to load Tika Config", e);
    }
  }

  /** Builds the parse context configuration selected by the {@link #PARSE_CONTEXT_CONFIG} init arg. */
  private ParseContextConfig loadParseContextConfig(SolrCore core) {
    String parseContextConfigLoc = null;
    try {
      parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
      if (parseContextConfigLoc == null) { // default
        return new ParseContextConfig();
      }
      return new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
    } catch (Exception e) {
      // Reported separately from Tika config failures so the message points at the right file.
      String location = parseContextConfigLoc == null ? "" : " '" + parseContextConfigLoc + "'";
      throw new SolrException(
          ErrorCode.SERVER_ERROR, "Unable to load parse context config" + location, e);
    }
  }

  /**
   * Creates the factory handed to every loader built by this handler. Subclasses may override
   * this to supply a different factory.
   *
   * @return a new {@code SolrContentHandlerFactory}
   */
  protected SolrContentHandlerFactory createFactory() {
    return new SolrContentHandlerFactory();
  }

  /**
   * Creates the loader that handles the content streams of a request.
   *
   * @param req the current request
   * @param processor the update processor passed on to the loader
   * @return a new {@code ExtractingDocumentLoader} built from this handler's Tika config, parse
   *     context config and content handler factory
   */
  @Override
  protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
  }

  // ////////////////////// SolrInfoMBeans methods //////////////////////

  /** Returns a short human-readable description of this handler. */
  @Override
  public String getDescription() {
    return "Add/Update Rich document";
  }
}