| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.handler.extraction; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.util.ArrayDeque; |
| import java.util.Collections; |
| import java.util.Deque; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.solr.common.SolrInputDocument; |
| import org.apache.solr.common.params.SolrParams; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.schema.SchemaField; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaMetadataKeys; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.DefaultHandler; |
| |
| |
| /** |
| * The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s. |
| * <B>This class is not thread-safe.</B> |
| * <p> |
| * This class cannot be reused, you have to create a new instance per document! |
| * <p> |
| * User's may wish to override this class to provide their own functionality. |
| * |
| * @see org.apache.solr.handler.extraction.SolrContentHandlerFactory |
| * @see org.apache.solr.handler.extraction.ExtractingRequestHandler |
| * @see org.apache.solr.handler.extraction.ExtractingDocumentLoader |
| */ |
| public class SolrContentHandler extends DefaultHandler implements ExtractingParams { |
| private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public static final String contentFieldName = "content"; |
| |
| protected final SolrInputDocument document; |
| |
| protected final Metadata metadata; |
| protected final SolrParams params; |
| protected final StringBuilder catchAllBuilder = new StringBuilder(2048); |
| protected final IndexSchema schema; |
| protected final Map<String, StringBuilder> fieldBuilders; |
| private final Deque<StringBuilder> bldrStack = new ArrayDeque<>(); |
| |
| protected final boolean captureAttribs; |
| protected final boolean lowerNames; |
| |
| protected final String unknownFieldPrefix; |
| protected final String defaultField; |
| |
| private final boolean literalsOverride; |
| |
| private Set<String> literalFieldNames = null; |
| |
| |
| public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { |
| this.document = new SolrInputDocument(); |
| this.metadata = metadata; |
| this.params = params; |
| this.schema = schema; |
| |
| this.lowerNames = params.getBool(LOWERNAMES, false); |
| this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false); |
| this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true); |
| this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, ""); |
| this.defaultField = params.get(DEFAULT_FIELD, ""); |
| |
| String[] captureFields = params.getParams(CAPTURE_ELEMENTS); |
| if (captureFields != null && captureFields.length > 0) { |
| fieldBuilders = new HashMap<>(); |
| for (int i = 0; i < captureFields.length; i++) { |
| fieldBuilders.put(captureFields[i], new StringBuilder()); |
| } |
| } else { |
| fieldBuilders = Collections.emptyMap(); |
| } |
| bldrStack.add(catchAllBuilder); |
| } |
| |
| |
| /** |
| * This is called by a consumer when it is ready to deal with a new SolrInputDocument. Overriding |
| * classes can use this hook to add in or change whatever they deem fit for the document at that time. |
| * The base implementation adds the metadata as fields, allowing for potential remapping. |
| * |
| * @return The {@link org.apache.solr.common.SolrInputDocument}. |
| * |
| * @see #addMetadata() |
| * @see #addCapturedContent() |
| * @see #addContent() |
| * @see #addLiterals() |
| */ |
| public SolrInputDocument newDocument() { |
| //handle the literals from the params. NOTE: This MUST be called before the others in order for literals to override other values |
| addLiterals(); |
| |
| //handle the metadata extracted from the document |
| addMetadata(); |
| |
| //add in the content |
| addContent(); |
| |
| //add in the captured content |
| addCapturedContent(); |
| |
| if (log.isDebugEnabled()) { |
| log.debug("Doc: {}", document); |
| } |
| return document; |
| } |
| |
| /** |
| * Add the per field captured content to the Solr Document. Default implementation uses the |
| * {@link #fieldBuilders} info |
| */ |
| protected void addCapturedContent() { |
| for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) { |
| if (entry.getValue().length() > 0) { |
| String fieldName = entry.getKey(); |
| if (literalsOverride && literalFieldNames.contains(fieldName)) |
| continue; |
| addField(fieldName, entry.getValue().toString(), null); } |
| } |
| } |
| |
| /** |
| * Add in the catch all content to the field. Default impl. uses the {@link #contentFieldName} |
| * and the {@link #catchAllBuilder} |
| */ |
| protected void addContent() { |
| if (literalsOverride && literalFieldNames.contains(contentFieldName)) |
| return; |
| addField(contentFieldName, catchAllBuilder.toString(), null); |
| } |
| |
| /** |
| * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}. |
| */ |
| protected void addLiterals() { |
| Iterator<String> paramNames = params.getParameterNamesIterator(); |
| literalFieldNames = new HashSet<>(); |
| while (paramNames.hasNext()) { |
| String pname = paramNames.next(); |
| if (!pname.startsWith(LITERALS_PREFIX)) continue; |
| |
| String name = pname.substring(LITERALS_PREFIX.length()); |
| addField(name, null, params.getParams(pname)); |
| literalFieldNames.add(name); |
| } |
| } |
| |
| /** |
| * Add in any metadata using {@link #metadata} as the source. |
| */ |
| protected void addMetadata() { |
| for (String name : metadata.names()) { |
| if (literalsOverride && literalFieldNames.contains(name)) |
| continue; |
| String[] vals = metadata.getValues(name); |
| addField(name, null, vals); |
| } |
| } |
| |
| // Naming rules: |
| // 1) optionally map names to nicenames (lowercase+underscores) |
| // 2) execute "map" commands |
| // 3) if resulting field is unknown, map it to a common prefix |
| protected void addField(String fname, String fval, String[] vals) { |
| if (lowerNames) { |
| StringBuilder sb = new StringBuilder(); |
| for (int i=0; i<fname.length(); i++) { |
| char ch = fname.charAt(i); |
| if (!Character.isLetterOrDigit(ch)) ch='_'; |
| else ch=Character.toLowerCase(ch); |
| sb.append(ch); |
| } |
| fname = sb.toString(); |
| } |
| |
| String name = findMappedName(fname); |
| SchemaField sf = schema.getFieldOrNull(name); |
| if (sf==null && unknownFieldPrefix.length() > 0) { |
| name = unknownFieldPrefix + name; |
| sf = schema.getFieldOrNull(name); |
| } else if (sf == null && defaultField.length() > 0 && name.equals(TikaMetadataKeys.RESOURCE_NAME_KEY) == false /*let the fall through below handle this*/){ |
| name = defaultField; |
| sf = schema.getFieldOrNull(name); |
| } |
| |
| // Arguably we should handle this as a special case. Why? Because unlike basically |
| // all the other fields in metadata, this one was probably set not by Tika by in |
| // ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this |
| // field just because you specified a resource.name parameter to the handler, should |
| // you? |
| if (sf == null && unknownFieldPrefix.length()==0 && name == TikaMetadataKeys.RESOURCE_NAME_KEY) { |
| return; |
| } |
| |
| // normalize val params so vals.length>1 |
| if (vals != null && vals.length==1) { |
| fval = vals[0]; |
| vals = null; |
| } |
| |
| // single valued field with multiple values... catenate them. |
| if (sf != null && !sf.multiValued() && vals != null) { |
| StringBuilder builder = new StringBuilder(); |
| boolean first=true; |
| for (String val : vals) { |
| if (first) { |
| first=false; |
| } else { |
| builder.append(' '); |
| } |
| builder.append(val); |
| } |
| fval = builder.toString(); |
| vals=null; |
| } |
| |
| if (fval != null) { |
| document.addField(name, fval); |
| } |
| |
| if (vals != null) { |
| for (String val : vals) { |
| document.addField(name, val); |
| } |
| } |
| |
| // no value set - throw exception for debugging |
| // if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value "); |
| } |
| |
| @Override |
| public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { |
| StringBuilder theBldr = fieldBuilders.get(localName); |
| if (theBldr != null) { |
| //we need to switch the currentBuilder |
| bldrStack.add(theBldr); |
| } |
| if (captureAttribs == true) { |
| for (int i = 0; i < attributes.getLength(); i++) { |
| addField(localName, attributes.getValue(i), null); |
| } |
| } else { |
| for (int i = 0; i < attributes.getLength(); i++) { |
| bldrStack.getLast().append(' ').append(attributes.getValue(i)); |
| } |
| } |
| bldrStack.getLast().append(' '); |
| } |
| |
| @Override |
| public void endElement(String uri, String localName, String qName) throws SAXException { |
| StringBuilder theBldr = fieldBuilders.get(localName); |
| if (theBldr != null) { |
| //pop the stack |
| bldrStack.removeLast(); |
| assert (bldrStack.size() >= 1); |
| } |
| bldrStack.getLast().append(' '); |
| } |
| |
| |
| @Override |
| public void characters(char[] chars, int offset, int length) throws SAXException { |
| bldrStack.getLast().append(chars, offset, length); |
| } |
| |
| /** |
| * Treat the same as any other characters |
| */ |
| @Override |
| public void ignorableWhitespace(char[] chars, int offset, int length) throws SAXException { |
| characters(chars, offset, length); |
| } |
| |
| /** |
| * Get the name mapping |
| * |
| * @param name The name to check to see if there is a mapping |
| * @return The new name, if there is one, else <code>name</code> |
| */ |
| protected String findMappedName(String name) { |
| return params.get(MAP_PREFIX + name, name); |
| } |
| |
| } |