blob: a6de1790f58dc05cbf9b25a58b2e91d0d64f468b [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.agents.output.searchblox;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.crawler.system.Logging;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* "Package" class modeling a SearchBlox document as a POJO
*
* @author Rafa Haro <rharo@apache.org>
* @author Antonio David Perez Morales <adperezmorales@apache.org>
*/
public class SearchBloxDocument {
// "apikey" parameter name. Not referenced inside this class;
// NOTE(review): presumably used by sibling classes in this package - confirm.
static final String API_KEY = "apikey";
// Key looked up in the constructor's argument map to obtain the collection name.
static final String SEARCHBLOX_COLLECTION = "collection";
// SimpleDateFormat pattern used to render the document's last-modified date.
static final String DATE_FORMAT = "dd MMMM yyyy HH:mm:ss z";
/**
 * Serialization formats supported by {@code toString(IndexingFormat, DocumentAction)}.
 */
public enum IndexingFormat {
  /** Serialize the document as a JSON object. */
  JSON,
  /** Serialize the document as an XML document. */
  XML
}
/**
 * Actions a serialized document can be built for. Within this class only
 * {@code ADD_UPDATE} and {@code DELETE} receive dedicated handling by the serializers.
 */
public enum DocumentAction {
  ADD_UPDATE,
  DELETE,
  STATUS,
  CREATE,
  CLEAR
}
// Element names of the SearchBlox document. The rest of this class addresses entries
// by POSITION, so the order must not change:
// 0 searchblox (XML root), 1 document, 5 content, 7 lastmodified, 8 size,
// 10 contenttype, 12 meta, 13 uid.
static final List<String> xmlElements= Lists.newArrayList("searchblox","document","url","title","keywords","content","description","lastmodified","size",
"alpha","contenttype","category","meta","uid");
// Attribute/key carrying the collection name on the document element/object.
static final String COLNAME_ATTRIBUTE = "colname";
// Attribute/key carrying the API key on the root element/object.
static final String APIKEY_ATTRIBUTE = "apikey";
// Attribute carrying the metadata field name on each XML meta element.
static final String NAME_ATTRIBUTE = "name";
// Attribute/key carrying the document uid.
static final String UID_ATTRIBUTE = "uid";
// Attribute carrying the boost value on an XML element.
static final String BOOST_ATTRIBUTE = "boost";
// Values to serialize, keyed by element name. Plain elements hold String values,
// "<element>_boost" keys hold boost values, and the "meta" key holds a
// Multimap<String, String> of metadata name -> values.
private Multimap<String, Object> data_fields = HashMultimap.create();
/**
* API key accessible in the SearchBlox Admin Console.
* Serialization fails with a SearchBloxException when this is null.
*/
String apiKey;
/**
* Name of the Custom collection.
* Serialization fails with a SearchBloxException when this is null.
*/
String colName;
/**
* Unique identifier for a document (default when unassigned is url location).
* Set from the document URI by the four-argument constructor.
*/
String uid;
/**
* Creates a document that only carries the API key; the collection name, uid and
* data fields are left unset.
*
* @param apikey the SearchBlox API key
*/
public SearchBloxDocument(String apikey) {
this.apiKey = apikey;
}
/**
 * Builds a SearchBlox document from a repository document.
 *
 * <p>The following is collected into the internal field map:</p>
 * <ul>
 *   <li>last-modified date (when present), formatted with {@link #DATE_FORMAT};</li>
 *   <li>content, taken from the {@code content} field when it has a value, otherwise
 *       read from the binary stream, then stripped of markup and line breaks;</li>
 *   <li>mime type and binary length;</li>
 *   <li>the first value of every argument whose name ends with {@code _boost};</li>
 *   <li>repository fields: those whose lower-cased name is a known element are stored
 *       as that element, every other field is stored as metadata;</li>
 *   <li>allow/deny ACL tokens, stored as metadata named {@code <type>_allow} and
 *       {@code <type>_deny}.</li>
 * </ul>
 *
 * @param apikey the SearchBlox API key
 * @param documentURI the document URI, used as the document uid
 * @param rd the repository document to convert
 * @param args connector arguments; {@code collection} supplies the collection name.
 *        When it is missing the collection name stays {@code null} and serialization
 *        reports it with a {@code SearchBloxException}.
 */
public SearchBloxDocument(String apikey, String documentURI,
    RepositoryDocument rd, Map<String, List<String>> args) {
  this(apikey);
  this.uid = documentURI;

  // Do not fail here on a missing collection: the serializers already validate
  // colName and raise a descriptive SearchBloxException.
  List<String> collection = args.get(SEARCHBLOX_COLLECTION);
  this.colName = (collection != null && !collection.isEmpty()) ? collection.get(0) : null;

  // Last modified
  Date date = rd.getModifiedDate();
  if (date != null) {
    // Fixed locale so the month name does not vary with the JVM default locale.
    // NOTE(review): English month names assumed to be what the server expects - confirm.
    SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_FORMAT, Locale.ENGLISH);
    data_fields.put(xmlElements.get(7), dateFormat.format(date));
  }

  // Content
  String content = "";
  try {
    // getFieldAsStrings avoids casting an arbitrary field value to String.
    String[] contentValues = rd.getFieldAsStrings(xmlElements.get(5));
    if (contentValues != null && contentValues.length > 0 && contentValues[0] != null) {
      content = contentValues[0];
    } else {
      content = this.buildString(rd.getBinaryStream());
    }
  } catch (IOException e) {
    Logging.connectors
        .error("[Parsing Content]Content is not text plain, verify you are properly using Apache Tika Transformer",
            e);
  }
  data_fields.put(xmlElements.get(5), this.clean(content));

  // Content Type
  data_fields.put(xmlElements.get(10), rd.getMimeType());

  // Size
  data_fields.put(xmlElements.get(8), "" + rd.getBinaryLength());

  // Boosting
  for (Map.Entry<String, List<String>> arg : args.entrySet()) {
    String boostId = arg.getKey();
    List<String> argBoost = arg.getValue();
    if (boostId.endsWith("_boost") && argBoost != null && !argBoost.isEmpty()) {
      data_fields.put(boostId, argBoost.get(0));
    }
  }

  // Metadata
  Multimap<String, String> metadata = HashMultimap.create();
  Iterator<String> it = rd.getFields();
  while (it.hasNext()) {
    String name = it.next();
    // Locale.ROOT keeps the element lookup independent of the JVM default locale.
    String key = name.toLowerCase(Locale.ROOT);
    int indexOf = xmlElements.indexOf(key);
    if (indexOf == 5) {
      // content has already been handled above
      continue;
    }
    // Unknown names, plus searchblox (0), lastmodified (7), size (8) and meta (12), go
    // to metadata. "meta" must never be stored as a plain value: that key is reserved
    // for the metadata map itself.
    boolean asMetadata = indexOf == -1 || indexOf == 0 || indexOf == 7
        || indexOf == 8 || indexOf == 12;
    try {
      String[] values = rd.getFieldAsStrings(name);
      if (values == null) {
        continue;
      }
      for (String value : values) {
        if (asMetadata) {
          metadata.put(name, value);
        } else {
          data_fields.put(key, value);
        }
      }
    } catch (IOException e) {
      Logging.connectors.error(
          "[Getting Field Values]Impossible to read value for metadata "
              + name, e);
    }
  }

  // ACLS must be stored as metadata, as Searchblox use that construct to index custom named fields
  Iterator<String> aclTypes = rd.securityTypesIterator();
  while (aclTypes.hasNext()) {
    String aclType = aclTypes.next();
    // Defensive: tolerate a security type that has only one of the two token lists.
    String[] allowTokens = rd.getSecurityACL(aclType);
    if (allowTokens != null) {
      for (String token : allowTokens) {
        metadata.put(aclType + "_allow", token);
      }
    }
    String[] denyTokens = rd.getSecurityDenyACL(aclType);
    if (denyTokens != null) {
      for (String token : denyTokens) {
        metadata.put(aclType + "_deny", token);
      }
    }
  }
  data_fields.put(xmlElements.get(12), metadata);
}
/**
 * Cleans a String from html tags and line breaks.
 *
 * @param content the raw content; may be {@code null}
 * @return the text of {@code content} with line breaks replaced by spaces and markup
 *         removed, or an empty string when {@code content} is {@code null}
 */
private String clean(String content) {
  if (content == null) {
    // A content field can carry a null value; treat it as "no content".
    return "";
  }
  String singleLine = content.replaceAll("(\r\n|\n)", " ");
  return Jsoup.parseBodyFragment(singleLine).text();
}
/**
 * Reads the whole stream as UTF-8 text. The stream is not closed here; it remains
 * owned by the caller.
 *
 * @param binaryStream the stream to read; may be {@code null}
 * @return the decoded text, or an empty string when there is no stream
 * @throws IOException if reading the stream fails
 */
private String buildString(InputStream binaryStream) throws IOException {
  if (binaryStream == null) {
    // No binary content: avoid a NullPointerException that would bypass the
    // caller's IOException handling.
    return "";
  }
  StringWriter writer = new StringWriter();
  IOUtils.copy(binaryStream, writer, "UTF-8");
  return writer.toString();
}
/**
 * Serializes this document for the given action.
 *
 * @param format {@code XML} selects the XML representation; any other value
 *        selects the JSON representation
 * @param action the action the payload is built for
 * @return the serialized document
 * @throws SearchBloxException if the document cannot be serialized
 */
public String toString(IndexingFormat format, DocumentAction action)
    throws SearchBloxException {
  return (format == IndexingFormat.XML)
      ? toStringXML(action)
      : toStringJSON(action);
}
/**
 * Builds the JSON representation of this document.
 *
 * <p>The result always carries the API key and a {@code document} object with the
 * collection name and uid. For {@code ADD_UPDATE} the document object additionally
 * holds every non-empty element value (all {@code keywords} values joined with
 * commas, the first value for any other element) and a {@code meta} object mapping
 * each metadata name to an array of its values.</p>
 *
 * @param action the action the payload is built for
 * @return the JSON string
 * @throws SearchBloxException if the API key or collection name is {@code null}, or
 *         the JSON object cannot be built
 */
private String toStringJSON(DocumentAction action) throws SearchBloxException {
  if (apiKey == null) {
    throw new SearchBloxException(
        "The API Key for accessing SearchBlox Server CAN'T be NULL");
  }
  if (colName == null) {
    throw new SearchBloxException(
        "The Collection Name of the SearchBlox Server CAN'T be NULL");
  }

  JSONObject result = new JSONObject();
  try {
    result.put(APIKEY_ATTRIBUTE, apiKey);
    JSONObject document = new JSONObject();
    document.put(COLNAME_ATTRIBUTE, colName);
    document.put(UID_ATTRIBUTE, uid);

    if (action == DocumentAction.ADD_UPDATE) {
      String metaElement = xmlElements.get(12);

      for (String element : xmlElements) {
        if (element.equals(metaElement)) {
          continue; // metadata is serialized separately below
        }
        Collection<Object> values = data_fields.get(element);
        if (values == null || values.isEmpty()) {
          continue;
        }
        Object first = values.iterator().next();
        if (!(first instanceof String) || ((String) first).isEmpty()) {
          continue;
        }
        if (element.equals("keywords")) {
          document.put(element, StringUtils.join(values, ','));
        } else {
          document.put(element, first);
        }
      }

      // Metadata
      JSONObject metaObject = new JSONObject();
      Collection<Object> metadataSet = data_fields.get(metaElement);
      if (metadataSet != null) {
        for (Object candidate : metadataSet) {
          // Values under the meta key are untyped; only a Multimap is valid
          // metadata, so skip anything else instead of casting blindly.
          if (!(candidate instanceof Multimap)) {
            continue;
          }
          @SuppressWarnings("unchecked")
          Multimap<String, String> metadata = (Multimap<String, String>) candidate;
          for (String name : metadata.keySet()) {
            JSONArray nextMetadata = new JSONArray();
            for (String value : metadata.get(name)) {
              nextMetadata.put(value);
            }
            metaObject.put(name, nextMetadata);
          }
          break;
        }
      }
      document.put(metaElement, metaObject);
    }
    result.put(xmlElements.get(1), document);
  } catch (JSONException e) {
    throw new SearchBloxException("Error while building Document JSON object", e);
  }
  return result.toString();
}
/**
 * Builds the XML representation of this document.
 *
 * <p>The root element carries the API key and contains one {@code document} element
 * carrying the collection name. For {@code DELETE} the uid is set as an attribute of
 * the document element. For {@code ADD_UPDATE} the document element contains a
 * {@code uid} child (when the uid is non-empty), one child per non-empty element
 * value (all {@code keywords} values joined with commas, the first value otherwise,
 * with a {@code boost} attribute when a matching {@code <element>_boost} value
 * exists), and one {@code meta} child per metadata value.</p>
 *
 * @param action the action the payload is built for
 * @return the XML string
 * @throws SearchBloxException if the XML document cannot be created, or the API key
 *         or collection name is {@code null}
 */
private String toStringXML(DocumentAction action) throws SearchBloxException{
  Document doc;
  try {
    doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
        .newDocument();
  } catch (ParserConfigurationException e) {
    throw new SearchBloxException(e);
  }

  // Document Base Data
  Element root = doc.createElement(xmlElements.get(0));
  if (apiKey == null) {
    throw new SearchBloxException(
        "The API Key for accessing SearchBlox Server CAN'T be NULL");
  }
  root.setAttribute(APIKEY_ATTRIBUTE, apiKey);
  doc.appendChild(root);

  Element document = doc.createElement(xmlElements.get(1));
  if (colName == null) {
    throw new SearchBloxException(
        "The Collection Name of the SearchBlox Server CAN'T be NULL");
  }
  document.setAttribute(COLNAME_ATTRIBUTE, colName);
  if (action == DocumentAction.DELETE) {
    document.setAttribute(UID_ATTRIBUTE, uid);
  }
  root.appendChild(document);

  if (action == DocumentAction.ADD_UPDATE) {
    String metaElementName = xmlElements.get(12);

    // Uid
    if (uid != null && !uid.isEmpty()) {
      Element uidElement = doc.createElement(xmlElements.get(13));
      uidElement.setTextContent(uid);
      document.appendChild(uidElement);
    }

    for (String element : xmlElements) {
      if (element.equals(metaElementName)) {
        continue; // metadata is serialized separately below
      }
      Collection<Object> values = data_fields.get(element);
      if (values == null || values.isEmpty()) {
        continue;
      }
      Object first = values.iterator().next();
      if (!(first instanceof String) || ((String) first).isEmpty()) {
        continue;
      }

      Element eValue = doc.createElement(element);
      if (element.equals("keywords")) {
        eValue.setTextContent(StringUtils.join(values, ','));
      } else {
        eValue.setTextContent((String) first);
      }

      Collection<Object> boostElement = data_fields.get(element + "_boost");
      if (boostElement != null && !boostElement.isEmpty()) {
        eValue.setAttribute(BOOST_ATTRIBUTE, "" + boostElement.iterator().next());
      }
      document.appendChild(eValue);
    }

    // Metadata
    Collection<Object> metadataSet = data_fields.get(metaElementName);
    if (metadataSet != null) {
      for (Object candidate : metadataSet) {
        // Values under the meta key are untyped; only a Multimap is valid
        // metadata, so skip anything else instead of casting blindly.
        if (!(candidate instanceof Multimap)) {
          continue;
        }
        @SuppressWarnings("unchecked")
        Multimap<String, String> metadata = (Multimap<String, String>) candidate;
        for (String name : metadata.keySet()) {
          for (String value : metadata.get(name)) {
            Element metaElement = doc.createElement(metaElementName);
            metaElement.setAttribute(NAME_ATTRIBUTE, name);
            metaElement.setTextContent(value);
            document.appendChild(metaElement);
          }
        }
        break;
      }
    }
  }
  return getStringFromDocument(doc);
}
/**
 * <p>Transform a {@code Document} to its XML string representation</p>
 * @param doc the document to transform
 * @return the document in the XML-String format
 * @throws SearchBloxException if the document cannot be serialized; the transformer
 *         failure is kept as the cause rather than being printed and replaced by a
 *         {@code null} result
 */
private String getStringFromDocument(Document doc) throws SearchBloxException {
  StringWriter writer = new StringWriter();
  try {
    Transformer transformer = TransformerFactory.newInstance().newTransformer();
    transformer.transform(new DOMSource(doc), new StreamResult(writer));
  } catch (TransformerException ex) {
    throw new SearchBloxException("Error while building Document XML string", ex);
  }
  return writer.toString();
}
}