// blob: 7752a52944a95a38d9deab86aff1da6a312f282f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.tdb.store.nodetable;
import java.nio.ByteBuffer;
import org.apache.jena.atlas.io.BlockUTF8;
import org.apache.jena.atlas.lib.StrUtils;
import org.apache.jena.atlas.logging.FmtLog;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.graph.Node_Triple;
import org.apache.jena.graph.Triple;
import org.apache.jena.riot.RiotException;
import org.apache.jena.riot.out.NodeFmtLib;
import org.apache.jena.riot.system.PrefixMap;
import org.apache.jena.riot.system.PrefixMapNull;
import org.apache.jena.riot.tokens.Token;
import org.apache.jena.riot.tokens.Tokenizer;
import org.apache.jena.riot.tokens.TokenizerFactory;
import org.apache.jena.riot.web.LangTag;
import org.apache.jena.shared.PrefixMapping;
import org.apache.jena.sparql.sse.SSE;
import org.apache.jena.sparql.util.NodeUtils;
import org.apache.jena.tdb.TDB;
import org.apache.jena.tdb.TDBException;
/**
 * Simple encoder/decoder for nodes that uses Turtle term string encoding.
 * <p>
 * {@link #encode} turns a node into a string (blank nodes as {@code _:label},
 * everything else via {@code NodeFmtLib.str}) and writes that string into a
 * {@link ByteBuffer} as UTF-8. {@link #decode} reverses the process by reading
 * the string back and dispatching on its leading characters.
 * <p>
 * The {@code PrefixMapping} argument of both methods is not used by this
 * implementation.
 */
public class NodecSSE implements Nodec
{
    // Characters in IRIs that are illegal and cause SSE problems, but we wish to keep.
    // They are hex-encoded on the way in (using MarkerChar as the escape marker, which
    // therefore has to be encoded itself) and hex-decoded on the way out.
    private final static char MarkerChar = '_';
    private final static char[] invalidIRIChars = { MarkerChar , ' ' };

    public NodecSSE() {}

    /**
     * Returns an over-estimate of the number of bytes needed to encode {@code node}.
     *
     * @param node the node to be encoded
     * @return an upper bound on the encoded size
     * @throws TDBException if the node is of an unrecognized kind
     */
    @Override
    public int maxSize(Node node)
    {
        return maxLength(node);
    }

    private static final PrefixMap pmap0 = PrefixMapNull.empty;
    // When false, blank nodes are always written as "_:" followed by the raw label.
    private static final boolean onlySafeBNodeLabels = false;

    /**
     * Encodes {@code node} as a string and writes it, as UTF-8, into {@code bb}.
     * The buffer is flipped before returning.
     *
     * @param node the node to encode; a warning is logged if it is not concrete
     * @param bb   the buffer to write into
     * @param pmap not used
     * @return the buffer's limit after the flip (the number of bytes written when
     *         the buffer started at position 0)
     * @throws TDBException if the node is a language-tagged literal whose
     *                      language tag fails {@code LangTag.check}
     */
    @Override
    public int encode(Node node, ByteBuffer bb, PrefixMapping pmap)
    {
        if ( ! node.isConcrete() )
            // Pass the node as a format argument, not as part of the format string:
            // a '%' in the node text must not be interpreted as a format specifier.
            FmtLog.warn(TDB.logInfo, "Attempt to encode non-concrete node: %s", node);

        if ( node.isURI() )
        {
            // Pesky spaces etc
            String uri = node.getURI();
            String encoded = StrUtils.encodeHex(uri, MarkerChar, invalidIRIChars);
            // Compare by value, not by reference.
            if ( ! encoded.equals(uri) )
                node = NodeFactory.createURI(encoded);
        }

        if ( node.isLiteral() && NodeUtils.isLangString(node) )
        {
            // Check syntactically valid.
            String lang = node.getLiteralLanguage();
            if ( ! LangTag.check(lang) )
                throw new TDBException("bad language tag: "+node);
        }

        String str;
        if ( node.isBlank() && ! onlySafeBNodeLabels )
            // Special case: write the label as-is.
            str = "_:"+node.getBlankNodeLabel();
        else
            // Catch-all: Node->String (this also covers triple terms).
            str = NodeFmtLib.str(node);

        // String -> bytes
        BlockUTF8.fromChars(str, bb);
        bb.flip();
        return bb.limit();
    }

    /**
     * Decodes a node from the string held in {@code bb}.
     * <p>
     * The string is dispatched on its prefix: {@code _:} is a blank node,
     * {@code <<} is parsed with {@code SSE.parseNode}, {@code <} is an IRI
     * handled directly, and anything else is read as a single token.
     *
     * @param bb   the buffer holding the encoded node
     * @param pmap not used
     * @return the decoded node
     * @throws TDBException if the string cannot be tokenized or does not
     *                      yield a node
     */
    @Override
    public Node decode(ByteBuffer bb, PrefixMapping pmap)
    {
        // Ideally, this would be straight from the byte buffer.
        // But currently we go bytes -> string -> node

        // Byte -> String
        String str = BlockUTF8.toString(bb);

        // String -> Node
        // Easy cases.
        if ( str.startsWith("_:") )
        {
            // Must be done this way.
            // In particular, bnode labels can contain ":" from Jena
            // TokenizerText does not recognize these.
            return NodeFactory.createBlankNode(str.substring(2));
        }

        if ( str.startsWith("<<") )
        {
            // Complex - not a single token so use full machinery.
            return SSE.parseNode(str);
        }

        if ( str.startsWith("<") )
        {
            // Do directly.
            // (is it quicker?)
            // Drop the first and last characters (the enclosing "<" and ">"),
            // then undo the string escapes and the hex encoding applied by encode.
            String iri = str.substring(1, str.length()-1);
            iri = StrUtils.unescapeString(iri);
            iri = StrUtils.decodeHex(iri, MarkerChar);
            return NodeFactory.createURI(iri);
        }

        Tokenizer tokenizer = TokenizerFactory.makeTokenizerString(str);
        if ( ! tokenizer.hasNext() )
            throw new TDBException("Failed to tokenize: "+str);
        Token t = tokenizer.next();

        try {
            Node n = t.asNode();
            if ( n == null )
                throw new TDBException("Not a node: "+str);
            return n;
        } catch (RiotException ex)
        {
            // Keep the original exception as the cause so its diagnostics are not lost.
            throw new TDBException("Bad string for node: "+str, ex);
        }
    }

    /**
     * Over-estimates the length of the encoding of {@code node}.
     *
     * @param node the node to size
     * @return an upper bound on the number of bytes of the encoded form
     * @throws TDBException if the node is not a blank node, URI, literal,
     *                      variable or triple term
     */
    private static int maxLength(Node node)
    {
        if ( node.isBlank() )
            // "_:"
            return 2+maxLength(node.getBlankNodeLabel());
        if ( node.isURI() )
            // "<>"
            return 2+maxLength(node.getURI());
        if ( node.isLiteral() )
        {
            // The quotes around the lexical form.
            int len = 2+maxLength(node.getLiteralLexicalForm());
            if ( NodeUtils.isLangString(node) )
                // Space for @ (language tag is ASCII)
                len = len + 3 + node.getLiteralLanguage().length();
            else if ( ! NodeUtils.isSimpleString(node) )
                // Space for ^^<> and the datatype URI
                len = len + 4 + maxLength(node.getLiteralDatatypeURI());
            return len;
        }
        if ( node.isVariable() )
            // "?"
            return 1+maxLength(node.getName());
        if ( node.isNodeTriple() ) {
            Triple t = Node_Triple.triple(node);
            // Leading and trailing <<>>, 4 spaces
            return (2+4+2)+maxLength(t.getSubject())+maxLength(t.getPredicate())+maxLength(t.getObject());
        }
        throw new TDBException("Unrecognized node type: "+node);
    }

    /**
     * Over-estimates the number of bytes needed for {@code string}: three bytes
     * for every Java {@code char}.
     *
     * @param string the string to size
     * @return {@code string.length()*3}
     */
    private static int maxLength(String string)
    {
        // Very worst case for UTF-8 - and then some.
        // Encoding every character as _XX or bad UTF-8 conversion (3 bytes)
        // Max 3 bytes UTF-8 per Java char (NB Java treats code points above 16 bits
        // as surrogate pairs, i.e. two chars).
        return string.length()*3;
    }

    // See also StringFile.
    // URI compression (abbreviating well-known namespace prefixes) can be effective
    // but literals are more of a problem: more variety. It is not implemented here.
}