blob: 69bad352dca48bfed5ba60b6565bd4c1531796b2 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.riot.lang;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.UUID;
import java.util.concurrent.Callable;
import org.apache.jena.atlas.lib.Bytes;
import org.apache.jena.atlas.lib.Cache;
import org.apache.jena.atlas.lib.CacheFactory;
import org.apache.jena.atlas.lib.InternalErrorException;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
/**
* Allocate bnode labels using a per-run seed and the label presented.
* <p>
* This is the most scalable, always legal allocator.
* <p>
* A new allocator must be created per parser run, or {@link #reset()} called. The
* seed and the presented label are fed to a digest to give a bit string (currently
* MD5, giving 128 bits) that is used to form a bNode AnonId of hex digits.
* <p>
* In addition, there is a cache of label{@literal ->}node allocations, exploiting the
* natural tendency to locality in a database dump (subject bNodes, bNodes in lists
* and other data value structures like unit values).
* <p>
* Not thread safe.
*/
public class BlankNodeAllocatorHash implements BlankNodeAllocator {
    /** Name of the digest algorithm used to hash seed + label bytes. */
    private static final String DIGEST_ALGORITHM = "MD5";
    /** Size handed to the cache factory for the label-to-node cache. */
    private static final int CACHE_SIZE = 1000;
    /** Width of the seed in bytes: the two longs of a {@link UUID}. */
    private static final int SEED_LENGTH = 128 / 8;

    private final MessageDigest mDigest;
    private final Cache<String, Node> cache;
    // Overwritten in place (all 16 bytes) on every reset().
    private final byte[] seedBytes = new byte[SEED_LENGTH];
    // long+2 bytes to distinguish from UTF-8 bytes.
    private final byte[] counterBytes = new byte[10];
    private long counter = 0;

    /**
     * Creates an allocator with a fresh seed.
     * <p>
     * The digest and the cache are set up first, and only then is
     * {@link #reset()} called, so that {@code reset()} always sees fully
     * initialised state.
     * </p>
     *
     * @throws InternalErrorException if the digest algorithm is not available
     */
    public BlankNodeAllocatorHash() {
        try {
            mDigest = MessageDigest.getInstance(DIGEST_ALGORITHM);
        } catch (NoSuchAlgorithmException e) {
            throw new InternalErrorException("failed to create message digest", e);
        }
        cache = CacheFactory.createCache(CACHE_SIZE);
        reset();
    }

    /**
     * Gets a fresh seed value
     * <p>
     * Note that this is called by the constructor (via {@link #reset()}),
     * i.e. before the body of any derived class constructor has run, so
     * on this initial call you will not yet have access to any
     * implementation specific information used to select the seed.
     * </p>
     * <p>
     * Implementations <strong>must</strong> return a non-null value
     * so if you can't decide a seed prior to seeing your derived
     * implementation's constructor inputs you should return a temporary
     * fake value initially. You can then call {@link #reset()} in your
     * own constructor after you've taken the necessary steps that allow
     * you to decide how to generate your own seed.
     * </p>
     * @return Seed value
     */
    protected UUID freshSeed() {
        return UUID.randomUUID();
    }

    /**
     * Starts a new run: obtains a new seed from {@link #freshSeed()},
     * stores its 128 bits as the seed bytes and empties the label cache.
     * The counter used by {@link #create()} is left as it is.
     */
    @Override
    public void reset() {
        UUID seed = freshSeed();
        // High 64 bits in bytes 0-7, low 64 bits in bytes 8-15.
        Bytes.setLong(seed.getMostSignificantBits(), seedBytes, 0);
        Bytes.setLong(seed.getLeastSignificantBits(), seedBytes, 8);
        cache.clear();
    }

    /**
     * Returns the blank node for the given label, consulting the cache
     * first and computing (and caching) the node on a miss.
     *
     * @param label the blank node label as presented by the caller
     * @return the blank node derived from the current seed and the label
     */
    @Override
    public Node alloc(final String label) {
        Callable<Node> getter = new Callable<Node>() {
            @Override
            public Node call() {
                return alloc(Bytes.string2bytes(label));
            }
        };
        return cache.getOrFill(label, getter);
    }

    /**
     * Creates a blank node that does not correspond to any label, derived
     * from the current seed and an internal counter that is incremented
     * on every call.
     *
     * @return a newly allocated blank node
     */
    @Override
    public Node create() {
        counter++;
        // Make illegal string bytes so can't clash with alloc(String).
        // It is different because it has zeros in it.
        counterBytes[0] = 0;
        counterBytes[1] = 0;
        Bytes.setLong(counter, counterBytes, 2);
        return alloc(counterBytes);
    }

    /**
     * Hashes the seed bytes followed by the given bytes and builds a blank
     * node whose label is the lower-case hex form of the digest.
     *
     * @param labelBytes bytes identifying the node within the current run
     * @return the blank node for seed + {@code labelBytes}
     */
    private Node alloc(byte[] labelBytes) {
        // UUID.nameUUIDFromBytes(seedBytes+labelBytes) uses MD5 but creates the digester
        // each time. It also stamps in the UUID version/variant bits.
        mDigest.update(seedBytes);
        mDigest.update(labelBytes);
        byte[] bytes = mDigest.digest(); // resets
        String hexString = Bytes.asHexLC(bytes);
        return NodeFactory.createBlankNode(hexString);
    }
}