blob: 0345b239abf37ff103e508cf09ac8713d6affda7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.commons.indexedgraph;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.clerezza.commons.rdf.BlankNode;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.BlankNodeOrIRI;
import org.apache.clerezza.commons.rdf.RDFTerm;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.ImmutableGraph;
import org.apache.clerezza.commons.rdf.impl.utils.AbstractGraph;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link Graph} implementation that uses indexes for <ul>
 * <li> subject, predicate, object [SPO]
 * <li> predicate, object, subject [POS]
 * <li> object, subject, predicate [OSP]
 * </ul>
 * Indexes are maintained in {@link TreeSet}s with according {@link Comparator}
 * instances ({@link #spoComparator}, {@link #posComparator},
 * {@link #ospComparator}). {@link RDFTerm}s are ordered first by type
 * (IRI &lt; Literal &lt; BlankNode) and then within each type as described by
 * {@link #compare(RDFTerm, RDFTerm, Map)}.
 * <p>
 * The {@link #filter(BlankNodeOrIRI, IRI, RDFTerm)} implementation is based on
 * {@link TreeSet#subSet(Object, Object)}. All returned Iterators directly
 * operate on top of one of the internal indexes.
 * <p>
 * Callers that need an immutable snapshot should use
 * {@link #getImmutableGraph()}.
 *
 * @author rwesten
 */
public class IndexedGraph extends AbstractGraph implements Graph {

    private static final Logger log = LoggerFactory.getLogger(IndexedGraph.class);

    @Override
    public ImmutableGraph getImmutableGraph() {
        return new IndexedImmutableGraph(this);
    }
    /**
     * Maps a hashCode to the {@link BlankNode}s known to share it. Used to
     * assign an arbitrary but stable ordering to BlankNodes whose hashCode
     * (and even identityHashCode) collide, so the index comparators stay
     * consistent over the lifetime of this graph.
     * NOTE(review): entries are never pruned when triples are removed, so
     * this map can only grow — acceptable as conflicts are rare.
     */
    private final Map<Integer, List<RDFTerm>> hashCodeConflictMap = new HashMap<Integer, List<RDFTerm>>();
    /**
     * Compares Triples based on Subject, Predicate, Object
     */
    private final Comparator<Triple> spoComparator = new Comparator<Triple>() {

        @Override
        public int compare(Triple a, Triple b) {
            int c = IndexedGraph.compare(a.getSubject(), b.getSubject(), hashCodeConflictMap);
            if (c == 0) {
                c = IndexedGraph.compare(a.getPredicate(), b.getPredicate(), hashCodeConflictMap);
                if (c == 0) {
                    c = IndexedGraph.compare(a.getObject(), b.getObject(), hashCodeConflictMap);
                }
            }
            return c;
        }
    };
    /**
     * The SPO index
     */
    private final NavigableSet<Triple> spo = new TreeSet<Triple>(spoComparator);
    /**
     * Compares Triples based on Predicate, Object, Subject
     */
    private final Comparator<Triple> posComparator = new Comparator<Triple>() {

        @Override
        public int compare(Triple a, Triple b) {
            int c = IndexedGraph.compare(a.getPredicate(), b.getPredicate(), hashCodeConflictMap);
            if (c == 0) {
                c = IndexedGraph.compare(a.getObject(), b.getObject(), hashCodeConflictMap);
                if (c == 0) {
                    c = IndexedGraph.compare(a.getSubject(), b.getSubject(), hashCodeConflictMap);
                }
            }
            return c;
        }
    };
    /**
     * The POS index
     */
    private final NavigableSet<Triple> pos = new TreeSet<Triple>(posComparator);
    /**
     * Compares Triples based on Object, Subject, Predicate
     */
    private final Comparator<Triple> ospComparator = new Comparator<Triple>() {

        @Override
        public int compare(Triple a, Triple b) {
            int c = IndexedGraph.compare(a.getObject(), b.getObject(), hashCodeConflictMap);
            if (c == 0) {
                c = IndexedGraph.compare(a.getSubject(), b.getSubject(), hashCodeConflictMap);
                if (c == 0) {
                    c = IndexedGraph.compare(a.getPredicate(), b.getPredicate(), hashCodeConflictMap);
                }
            }
            return c;
        }
    };
    /**
     * The OSP index
     */
    private final NavigableSet<Triple> osp = new TreeSet<Triple>(ospComparator);

    /**
     * Creates an empty {@link IndexedGraph}
     */
    public IndexedGraph() {
        super();
    }

    /**
     * Creates a {@link IndexedGraph} using the passed iterator, the iterator
     * is consumed before the constructor returns.
     *
     * @param iterator the triples to add (fully consumed)
     */
    public IndexedGraph(Iterator<Triple> iterator) {
        super();
        while (iterator.hasNext()) {
            Triple triple = iterator.next();
            performAdd(triple);
        }
    }

    /**
     * Creates a {@link IndexedGraph} for the specified collection of triples.
     * Subsequent modifications of the parsed collection do not affect the
     * created instance.
     *
     * @param baseCollection the triples to index
     */
    public IndexedGraph(Collection<Triple> baseCollection) {
        super();
        spo.addAll(baseCollection);
        //use the internal index to fill the other indexes, because the parsed
        //collection might be slow
        pos.addAll(spo);
        osp.addAll(spo);
    }

    /**
     * Answers a filter request with a single range scan over the index that
     * has the bound components as its leading sort keys:<ul>
     * <li> [S,*,*] and [S,P,*] use the SPO index
     * <li> [*,P,*] and [*,P,O] use the POS index
     * <li> [*,*,O] and [S,*,O] use the OSP index
     * <li> [S,P,O] uses an inclusive single-element range on SPO
     * </ul>
     * Wildcard (<code>null</code>) components are substituted by the
     * {@link #MIN}/{@link #MAX} placeholders to build the range bounds.
     */
    @Override
    protected Iterator<Triple> performFilter(final BlankNodeOrIRI subject, final IRI predicate, final RDFTerm object) {
        if (subject == null && predicate == null && object == null) { //[n,n,n]
            return createIterator(spo, spo.iterator());
        }
        final Triple low = new TripleImpl(
                subject == null ? MIN : subject,
                predicate == null ? MIN : predicate,
                object == null ? MIN : object);
        final Triple high = new TripleImpl(
                subject == null ? MAX : subject,
                predicate == null ? MAX : predicate,
                object == null ? MAX : object);
        if (subject != null && predicate != null && object != null) { // [S,P,O]
            //NOTE: low.equals(high) in that case!
            return createIterator(spo, spo.subSet(low, true, low, true).iterator());
        } else if (subject != null && object == null) { //[S,n,n], [S,P,n]
            return createIterator(spo, spo.subSet(low, high).iterator());
        } else if (predicate != null) { //[n,P,n], [n,P,O]
            return createIterator(pos, pos.subSet(low, high).iterator());
        } else { //[n,n,O] , [S,n,O]
            return createIterator(osp, osp.subSet(low, high).iterator());
        }
    }

    @Override
    protected boolean performAdd(Triple triple) {
        //SPO is the authoritative index; only update the others if it changed
        if (spo.add(triple)) {
            osp.add(triple);
            return pos.add(triple);
        }
        return false;
    }

    @Override
    protected boolean performRemove(Object t) {
        if (t instanceof Triple) {
            Triple triple = (Triple) t;
            //SPO is the authoritative index; only update the others if it changed
            if (spo.remove(triple)) {
                osp.remove(triple);
                return pos.remove(triple);
            }
        }
        return false;
    }

    @Override
    public int performSize() {
        return spo.size();
    }

    /**
     * Returns an Iterator that ensures that calls to {@link Iterator#remove()}
     * remove items from all three indexes.
     *
     * @param index the index the base iterator operates on
     * @param base the iterator over (a sub-set of) that index
     * @return the wrapping iterator keeping all indexes in sync on removal
     */
    private Iterator<Triple> createIterator(final SortedSet<Triple> index, final Iterator<Triple> base) {
        return new Iterator<Triple>() {

            Triple current = null;

            @Override
            public boolean hasNext() {
                return base.hasNext();
            }

            @Override
            public Triple next() {
                current = base.next();
                return current;
            }

            @Override
            public void remove() {
                base.remove(); //removes from the index backing this iterator
                if (current != null) {
                    //remove the triple from the two other indexes as well
                    if (index != spo) {
                        spo.remove(current);
                    }
                    if (index != pos) {
                        pos.remove(current);
                    }
                    if (index != osp) {
                        osp.remove(current);
                    }
                }
            }
        };
    }
    /**
     * Lower-bound placeholder used for wildcard filter components.
     * {@link #compare(RDFTerm, RDFTerm, Map)} recognizes this instance by
     * reference identity and sorts it before every other term; the hashCode
     * override only documents that intent. Never stored in an index.
     */
    protected static final IRI MIN = new IRI("") {
        @Override
        public int hashCode() {
            return Integer.MIN_VALUE;
        }
    };
    /**
     * Upper-bound placeholder used for wildcard filter components.
     * {@link #compare(RDFTerm, RDFTerm, Map)} recognizes this instance by
     * reference identity and sorts it after every other term; the hashCode
     * override only documents that intent. Never stored in an index.
     */
    protected static final IRI MAX = new IRI("") {
        @Override
        public int hashCode() {
            return Integer.MAX_VALUE;
        }
    };

    /**
     * Resolves BlankNode hashCode conflicts by storing the first-seen order
     * for the affected hashCode in a {@link List} of RDFTerm instances.
     *
     * @param a the first {@link BlankNode}
     * @param b the second {@link BlankNode}
     * @param conflictsMap the Map used to store the order of BlankNodes with
     * conflicts
     * @return the (stable) decision taken based on the conflictsMap
     */
    private static int resolveBlankNodeHashConflict(RDFTerm a, RDFTerm b,
            Map<Integer, List<RDFTerm>> conflictsMap) {
        //This is not a bad thing. We need just to ensure constant ordering
        //and as there is nothing we can use to distinguish the nodes we need
        //to keep this information in a list.
        Integer hash = Integer.valueOf(a.hashCode());
        List<RDFTerm> resources = conflictsMap.get(hash);
        if (resources == null) { //new conflict ... just add and return
            resources = new ArrayList<RDFTerm>(2);
            conflictsMap.put(hash, resources);
            resources.add(a);
            resources.add(b);
            return -1;
        }
        //already conflicting resources for this hash present
        int aIndex = -1;
        int bIndex = -1;
        for (int i = 0; i < resources.size() && (aIndex < 0 || bIndex < 0); i++) {
            RDFTerm r = resources.get(i);
            if (aIndex < 0 && r.equals(a)) {
                aIndex = i;
            }
            if (bIndex < 0 && r.equals(b)) {
                bIndex = i;
            }
        }
        if (aIndex < 0) { //a not found ... register it
            aIndex = resources.size();
            resources.add(a);
        }
        if (bIndex < 0) { //b not found ... register it
            bIndex = resources.size();
            resources.add(b);
        }
        return aIndex < bIndex ? -1 : 1;
    }

    /**
     * Compares two (possibly <code>null</code>) literal languages;
     * <code>null</code> sorts first, otherwise the string form decides.
     */
    private static int compareLanguage(Language al, Language bl) {
        if (al == null) {
            return bl == null ? 0 : -1;
        } else if (bl == null) {
            return 1;
        } else {
            return al.toString().compareTo(bl.toString());
        }
    }

    /**
     * Compares two (possibly <code>null</code>) literal data types;
     * <code>null</code> sorts first, otherwise the unicode string decides.
     */
    private static int compareDataType(IRI adt, IRI bdt) {
        if (adt == null) {
            return bdt == null ? 0 : -1;
        } else if (bdt == null) {
            return 1;
        } else {
            return adt.getUnicodeString().compareTo(bdt.getUnicodeString());
        }
    }

    /**
     * Compares RDFTerms to correctly sort them within the indexes.<p>
     * Sort criteria are:<ol>
     * <li> type: {@link IRI} &lt; {@link Literal} &lt; {@link BlankNode}
     * <li> {@link IRI}s are sorted by their
     * {@link IRI#getUnicodeString() unicode string}
     * <li> {@link Literal}s
     * <ol>
     * <li> sort by the {@link Literal#getLexicalForm() lexical form}
     * <li> then by {@link Literal#getLanguage() language}
     * (<code>null</code> values first)
     * <li> then by {@link Literal#getDataType() data type}
     * (<code>null</code> values first)
     * </ol>
     * <li> {@link BlankNode}s
     * <ol>
     * <li> sort by their {@link Object#hashCode() hashCode}
     * <li> on hashCode conflicts (same hashCode but not equals) by their
     * {@link System#identityHashCode(Object) identity hashCode}; if even
     * those collide an arbitrary but stable order is recorded in the parsed
     * conflictsMap
     * </ol>
     * </ol>
     * The special instances {@link #MIN} and {@link #MAX} are recognized by
     * reference identity and sort before/after every other term. They are
     * used to build the range bounds for
     * {@link #performFilter(BlankNodeOrIRI, IRI, RDFTerm)}.<p>
     * <b>NOTEs</b><ul>
     * <li> parsed {@link RDFTerm}s are not required to correctly implement
     * {@link Object#hashCode() hashCode} and
     * {@link Object#equals(Object) equals}
     * <li> parsed {@link IRI}, {@link BlankNode} and {@link Literal}
     * instances MUST NOT extend/implement any of the other
     * classes/interfaces. This means that an {@link IRI} MUST NOT implement
     * {@link BlankNode} nor {@link Literal}
     * </ul>
     *
     * @param a the first term to compare
     * @param b the second term to compare
     * @param conflictsMap the map used to resolve BlankNodes with hashCode
     * conflicts
     * @return a negative integer, zero, or a positive integer as the first
     * term is less than, equal to, or greater than the second
     */
    protected static int compare(RDFTerm a, RDFTerm b, Map<Integer, List<RDFTerm>> conflictsMap) {
        //Handle the special MIN and MAX instances (compared by identity!)
        if (a == MIN || b == MAX) {
            return -1;
        } else if (a == MAX || b == MIN) {
            return 1;
        }
        //sort (0) IRIs < (1) Literals < (2) BlankNodes
        int at = a instanceof IRI ? 0 : a instanceof Literal ? 1 : 2;
        int bt = b instanceof IRI ? 0 : b instanceof Literal ? 1 : 2;
        if (at != bt) { //different types ... the type rank decides
            return at < bt ? -1 : 1;
        }
        if (at == 0) { //IRIs ... the unicode string decides
            return ((IRI) a).getUnicodeString().compareTo(((IRI) b).getUnicodeString());
        }
        if (at == 1) { //Literals ... lexical form, then language, then data type
            int sc = ((Literal) a).getLexicalForm().compareTo(((Literal) b).getLexicalForm());
            if (sc == 0) {
                sc = compareLanguage(((Literal) a).getLanguage(), ((Literal) b).getLanguage());
            }
            if (sc == 0) {
                sc = compareDataType(((Literal) a).getDataType(), ((Literal) b).getDataType());
            }
            return sc;
        }
        //BlankNodes ... sort by hashCode
        int ah = a.hashCode();
        int bh = b.hashCode();
        if (ah != bh) {
            return ah < bh ? -1 : 1;
        }
        if (a.equals(b)) { //same hash and equals
            return 0;
        }
        //if the implementations hash is the same, but the instances are not
        //equals, try to sort them by identity hash code
        int ash = System.identityHashCode(a);
        int bsh = System.identityHashCode(b);
        if (ash != bsh) {
            return ash < bsh ? -1 : 1;
        }
        //if those are still the same, we need to resolve the hashCode
        //conflict by memorizing the decision in the conflictsMap
        return resolveBlankNodeHashConflict(a, b, conflictsMap);
    }
}