jena-tdb/src/main/java/org/apache/jena/tdb/store/nodetable/NodecSSE.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.jena.tdb.store.nodetable;

 import java.nio.ByteBuffer ;

 import org.apache.jena.atlas.io.BlockUTF8 ;
 import org.apache.jena.atlas.lib.StrUtils ;
 import org.apache.jena.graph.Node ;
 import org.apache.jena.graph.NodeFactory ;
 import org.apache.jena.riot.RiotException ;
 import org.apache.jena.riot.out.NodeFmtLib ;
 import org.apache.jena.riot.system.PrefixMap ;
 import org.apache.jena.riot.system.PrefixMapNull ;
 import org.apache.jena.riot.tokens.Token ;
 import org.apache.jena.riot.tokens.Tokenizer ;
 import org.apache.jena.riot.tokens.TokenizerFactory ;
 import org.apache.jena.riot.web.LangTag ;
 import org.apache.jena.shared.PrefixMapping ;
 import org.apache.jena.sparql.util.NodeUtils ;
 import org.apache.jena.tdb.TDBException ;
 import org.apache.jena.tdb.lib.StringAbbrev ;

 /**
  * Simple encoder/decoder for nodes that uses Turtle term string encoding.
  * <p>
  * Encoding goes node -> string -> bytes (via {@link BlockUTF8}); decoding
  * reverses that path. Blank nodes are written as {@code "_:"} followed by the
  * label, IRIs have the marker character and space hex-encoded with
  * {@link StrUtils#encodeHex} before formatting, and every node that is not
  * special-cased is formatted with {@link NodeFmtLib}.
  */
 public class NodecSSE implements Nodec
 {
     // Characters in IRIs that are illegal and cause SSE problems, but we wish to keep.
     private final static char MarkerChar = '_' ;
     private final static char[] invalidIRIChars = { MarkerChar , ' ' } ;

     public NodecSSE() {}

     /**
      * Over-estimate of the number of bytes {@link #encode} may write for a node.
      *
      * @param node the node to be encoded
      * @return an upper bound on the encoded length
      * @throws TDBException if the node is not a blank node, IRI, literal or variable
      */
     @Override
     public int maxSize(Node node)
     {
         return maxLength(node) ;
     }

     // Formatting always uses this prefix map rather than the one passed by the caller.
     private static final PrefixMap pmap0 = PrefixMapNull.empty ;
     private static final boolean onlySafeBNodeLabels = false ;

     /**
      * Write the string form of a node into the byte buffer.
      *
      * @param node the node to encode
      * @param bb   destination buffer; it is flipped before returning
      * @param pmap not used by this implementation
      * @return the buffer limit after the flip
      * @throws TDBException if the node is a language-tagged literal whose tag
      *                      fails {@link LangTag#check}
      */
     @Override
     public int encode(Node node, ByteBuffer bb, PrefixMapping pmap)
     {
         if ( node.isURI() )
         {
             // Pesky spaces etc
             String uri = node.getURI() ;
             String x = StrUtils.encodeHex(uri, MarkerChar, invalidIRIChars) ;
             // Compare by value: do not rely on encodeHex handing back the same
             // String instance when nothing needed encoding.
             if ( ! x.equals(uri) )
                 node = NodeFactory.createURI(x) ;
         }

         if ( node.isLiteral() && NodeUtils.isLangString(node) )
         {
             // Check syntactically valid.
             String lang = node.getLiteralLanguage() ;
             if ( ! LangTag.check(lang) )
                 throw new TDBException("bad language tag: "+node) ;
         }

         // Node->String
         String str ;
         if ( node.isBlank() && ! onlySafeBNodeLabels )
             // Special case: write the label as-is after "_:".
             str = "_:"+node.getBlankNodeLabel() ;
         else
             str = NodeFmtLib.str(node, (String)null, pmap0) ;

         // String -> bytes ;
         BlockUTF8.fromChars(str, bb) ;
         bb.flip() ;
         return bb.limit() ;
     }

     /**
      * Rebuild a node from the bytes in the buffer.
      *
      * @param bb   buffer holding the encoded string
      * @param pmap not used by this implementation
      * @return the decoded node
      * @throws TDBException if the string is a malformed {@code <...>} form,
      *                      cannot be tokenised, or does not yield a node
      */
     @Override
     public Node decode(ByteBuffer bb, PrefixMapping pmap)
     {
         // Ideally, this would be straight from the byte buffer.
         // But currently we go bytes -> string -> node

         // Byte -> String
         String str = BlockUTF8.toString(bb) ;

         // String -> Node

         // Easy cases.
         if ( str.startsWith("_:") )
         {
             // Must be done this way.
             // In particular, bnode labels can contain ":" from Jena
             // TokenizerText does not recognize these.
             return NodeFactory.createBlankNode(str.substring(2)) ;
         }

         if ( str.startsWith("<") )
         {
             // Do directly.
             // (is it quicker?)
             // Guard the substring: a lone "<" would throw StringIndexOutOfBoundsException
             // and a missing ">" would silently drop the last character of the IRI.
             if ( str.length() < 2 || ! str.endsWith(">") )
                 throw new TDBException("Bad string for node: "+str) ;
             String iri = str.substring(1, str.length()-1) ;
             iri = StrUtils.unescapeString(iri) ;
             iri = StrUtils.decodeHex(iri, MarkerChar) ;
             return NodeFactory.createURI(iri) ;
         }

         Tokenizer tokenizer = TokenizerFactory.makeTokenizerString(str) ;
         if ( ! tokenizer.hasNext() )
             throw new TDBException("Failed to tokenise: "+str) ;
         Token t = tokenizer.next() ;

         try {
             Node n = t.asNode() ;
             if ( n == null ) throw new TDBException("Not a node: "+str) ;
             return n ;
         } catch (RiotException ex)
         {
             // Keep the original exception as the cause so the parser diagnostics are not lost.
             throw new TDBException("Bad string for node: "+str, ex) ;
         }
     }

     /**
      * Over-estimate the length of the encoding of a node.
      *
      * @throws TDBException if the node is not a blank node, IRI, literal or variable
      */
     private static int maxLength(Node node)
     {
         if ( node.isBlank() )
             // "_:"
             return 2+maxLength(node.getBlankNodeLabel()) ;
         if ( node.isURI() )
             // "<>"
             return 2+maxLength(node.getURI()) ;
         if ( node.isLiteral() )
         {
             int len = 2+maxLength(node.getLiteralLexicalForm()) ;
             if ( NodeUtils.isLangString(node) )
                 // Space for @ (language tag is ASCII)
                 len = len + 3 + node.getLiteralLanguage().length() ;
             else if ( ! NodeUtils.isSimpleString(node) )
                 // The quotes and also space for ^^<>
                 len = len + 4 + maxLength(node.getLiteralDatatypeURI()) ;
             return len ;
         }
         if ( node.isVariable() )
             // "?"
             return 1+maxLength(node.getName()) ;
         throw new TDBException("Unrecognized node type: "+node) ;
     }

     /** Over-estimate the encoded length of a string: three bytes per Java char. */
     private static int maxLength(String string)
     {
         // Very worst case for UTF-8 - and then some.
         // Encoding every character as _XX or bad UTF-8 conversion (3 bytes)
         // Max 3 bytes UTF-8 per Java char (NB Java treats code points above 16 bits as surrogate pairs only).
         return string.length()*3 ;
     }

     // URI compression can be effective but literals are more of a problem.  More variety.
     public final static boolean compression = false ;
     private static final StringAbbrev abbreviations = new StringAbbrev() ;
     static {
         abbreviations.add(  "rdf",      "<http://www.w3.org/1999/02/22-rdf-syntax-ns#") ;
         abbreviations.add(  "rdfs",     "<http://www.w3.org/2000/01/rdf-schema#") ;
         abbreviations.add(  "xsd",      "<http://www.w3.org/2001/XMLSchema#") ;

         // MusicBrainz
         abbreviations.add(  "mal",      "<http://musicbrainz.org/mm-2.1/album/") ;
         abbreviations.add(  "mt",       "<http://musicbrainz.org/mm-2.1/track/") ;
         abbreviations.add(  "mar",      "<http://musicbrainz.org/mm-2.1/artist/") ;
         abbreviations.add(  "mtr",      "<http://musicbrainz.org/mm-2.1/trmid/") ;
         abbreviations.add(  "mc",       "<http://musicbrainz.org/mm-2.1/cdindex/") ;

         abbreviations.add(  "m21",      "<http://musicbrainz.org/mm/mm-2.1#") ;
         abbreviations.add(  "dc",       "<http://purl.org/dc/elements/1.1/") ;
         // DBPedia
         abbreviations.add(  "r",        "<http://dbpedia/resource/") ;
         abbreviations.add(  "p",        "<http://dbpedia/property/") ;
     }

     /** Abbreviate a string with the table above; a no-op while {@link #compression} is false. */
     private String compress(String str)
     {
         if ( !compression || abbreviations == null ) return str ;
         return abbreviations.abbreviate(str) ;
     }

     /** Expand an abbreviated string; a no-op while {@link #compression} is false. */
     private String decompress(String x)
     {
         if ( !compression || abbreviations == null ) return x ;
         return abbreviations.expand(x) ;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.jena.tdb.store.nodetable;

	import java.nio.ByteBuffer ;

	import org.apache.jena.atlas.io.BlockUTF8 ;
	import org.apache.jena.atlas.lib.StrUtils ;
	import org.apache.jena.graph.Node ;
	import org.apache.jena.graph.NodeFactory ;
	import org.apache.jena.riot.RiotException ;
	import org.apache.jena.riot.out.NodeFmtLib ;
	import org.apache.jena.riot.system.PrefixMap ;
	import org.apache.jena.riot.system.PrefixMapNull ;
	import org.apache.jena.riot.tokens.Token ;
	import org.apache.jena.riot.tokens.Tokenizer ;
	import org.apache.jena.riot.tokens.TokenizerFactory ;
	import org.apache.jena.riot.web.LangTag ;
	import org.apache.jena.shared.PrefixMapping ;
	import org.apache.jena.sparql.util.NodeUtils ;
	import org.apache.jena.tdb.TDBException ;
	import org.apache.jena.tdb.lib.StringAbbrev ;

	/** Simple encoder/decoder for nodes that uses Turtle term string encoding. */

	public class NodecSSE implements Nodec
	{
	// Characters in IRIs that are illegal and cause SSE problems, but we wish to keep.
	private final static char MarkerChar = '_' ;
	private final static char[] invalidIRIChars = { MarkerChar , ' ' } ;

	public NodecSSE() {}

	@Override
	public int maxSize(Node node)
	{
	return maxLength(node) ;
	}

	private static final PrefixMap pmap0 = PrefixMapNull.empty ;
	private static final boolean onlySafeBNodeLabels = false ;
	@Override
	public int encode(Node node, ByteBuffer bb, PrefixMapping pmap)
	{
	String str = null ;

	if ( node.isURI() )
	{
	// Pesky spaces etc
	String x = StrUtils.encodeHex(node.getURI(), MarkerChar, invalidIRIChars) ;
	if ( x != node.getURI() )
	node = NodeFactory.createURI(x) ;
	}

	if ( node.isLiteral() && NodeUtils.isLangString(node) )
	{
	// Check syntactically valid.
	String lang = node.getLiteralLanguage() ;
	if ( ! LangTag.check(lang) )
	throw new TDBException("bad language tag: "+node) ;
	}

	if ( node.isBlank() && ! onlySafeBNodeLabels ) {
	// Special case.
	str = "_:"+node.getBlankNodeLabel() ;
	}

	// Node->String
	if ( str == null )
	str = NodeFmtLib.str(node, (String)null, pmap0) ;
	// String -> bytes ;
	BlockUTF8.fromChars(str, bb) ;
	bb.flip() ;
	return bb.limit() ;
	}

	@Override
	public Node decode(ByteBuffer bb, PrefixMapping pmap)
	{
	// Ideally, this would be straight from the byte buffer.
	// But currently we go bytes -> string -> node

	// Byte -> String
	String str = BlockUTF8.toString(bb) ;
	//OLD
	//String str = Bytes.fromByteBuffer(bb) ;
	// String -> Node

	// Easy cases.
	if ( str.startsWith("_:") )
	{
	// Must be done this way.
	// In particular, bnode labels can contain ":" from Jena
	// TokenizerText does not recognize these.
	str = str.substring(2) ;
	return NodeFactory.createBlankNode(str) ;
	}

	if ( str.startsWith("<") )
	{
	// Do directly.
	// (is it quicker?)
	str = str.substring(1,str.length()-1) ;
	str = StrUtils.unescapeString(str) ;
	str = StrUtils.decodeHex(str, MarkerChar) ;
	return NodeFactory.createURI(str) ;
	}

	Tokenizer tokenizer = TokenizerFactory.makeTokenizerString(str) ;
	if ( ! tokenizer.hasNext() )
	throw new TDBException("Failed to tokenise: "+str) ;
	Token t = tokenizer.next() ;

	try {
	Node n = t.asNode() ;
	if ( n == null ) throw new TDBException("Not a node: "+str) ;
	return n ;
	} catch (RiotException ex)
	{
	throw new TDBException("Bad string for node: "+str) ;
	}
	}

	// Over-estimate the length of the encoding.
	private static int maxLength(Node node)
	{
	if ( node.isBlank() )
	// "_:"
	return 2+maxLength(node.getBlankNodeLabel()) ;
	if ( node.isURI() )
	// "<>"
	return 2+maxLength(node.getURI()) ;
	if ( node.isLiteral() )
	{
	int len = 2+maxLength(node.getLiteralLexicalForm()) ;
	if ( NodeUtils.isLangString(node) )
	// Space for @ (language tag is ASCII)
	len = len + 3 + node.getLiteralLanguage().length() ;
	else if ( ! NodeUtils.isSimpleString(node) )
	// The quotes and also space for ^^<>
	len = len + 4 + maxLength(node.getLiteralDatatypeURI()) ;
	return len ;
	}
	if ( node.isVariable() )
	// "?"
	return 1+maxLength(node.getName()) ;
	throw new TDBException("Unrecognized node type: "+node) ;
	}

	private static int maxLength(String string)
	{
	// Very worse case for UTF-8 - and then some.
	// Encoding every character as _XX or bad UTF-8 conversion (3 bytes)
	// Max 3 bytes UTF-8 for up to 10FFFF (NB Java treats above 16bites as surrogate pairs only).
	return string.length()*3 ;
	}

	// URI compression can be effective but literals are more of a problem. More variety.
	public final static boolean compression = false ;
	private static StringAbbrev abbreviations = new StringAbbrev() ;
	static {
	abbreviations.add( "rdf", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#") ;
	abbreviations.add( "rdfs", "<http://www.w3.org/2000/01/rdf-schema#") ;
	abbreviations.add( "xsd", "<http://www.w3.org/2001/XMLSchema#") ;

	// MusicBrainz
	abbreviations.add( "mal", "<http://musicbrainz.org/mm-2.1/album/") ;
	abbreviations.add( "mt", "<http://musicbrainz.org/mm-2.1/track/") ;
	abbreviations.add( "mar", "<http://musicbrainz.org/mm-2.1/artist/") ;
	abbreviations.add( "mtr", "<http://musicbrainz.org/mm-2.1/trmid/") ;
	abbreviations.add( "mc", "<http://musicbrainz.org/mm-2.1/cdindex/") ;

	abbreviations.add( "m21", "<http://musicbrainz.org/mm/mm-2.1#") ;
	abbreviations.add( "dc", "<http://purl.org/dc/elements/1.1/") ;
	// DBPedia
	abbreviations.add( "r", "<http://dbpedia/resource/") ;
	abbreviations.add( "p", "<http://dbpedia/property/") ;
	}
	private String compress(String str)
	{
	if ( !compression \|\| abbreviations == null ) return str ;
	return abbreviations.abbreviate(str) ;
	}

	private String decompress(String x)
	{
	if ( !compression \|\| abbreviations == null ) return x ;
	return abbreviations.expand(x) ;
	}

	}