core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.rdf.JSONLDExtractor;
 import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.apache.commons.io.IOUtils;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;

 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;

 /**
  * Extractor for JSON-LD content embedded in HTML documents. JSON-LD can be embedded in HTML by placing it in a
  * {@code script} element whose {@code type} attribute is set to {@code application/ld+json}, according to the
  * <a href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents">JSON-LD specification</a>. The text
  * content of every such element is handed to a {@link JSONLDExtractor}.
  */
 public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {

     private static final SINDICE vSINDICE = SINDICE.getInstance();

     /** Value of the {@code type} attribute that marks a {@code script} element as a JSON-LD data block. */
     private static final String JSONLD_MIME_TYPE = "application/ld+json";

     /** Profile declared on the root element of the last processed document, <code>null</code> if none. */
     private IRI profile;

     /** Prefixes declared through <i>LINK</i> elements of the document being processed, keyed by <i>rel</i>. */
     private final Map<String, IRI> prefixes = new HashMap<>();

     /** Language declared on the root element of the last processed document; only stored by this class. */
     private String documentLang;

     /** Delegate that parses the content of each JSON-LD script; re-created on every run. */
     private JSONLDExtractor extractor;

     /**
      * {@inheritDoc}
      */
     @Override
     public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
             ExtractionResult out) throws IOException, ExtractionException {
         // Prefixes belong to a single document: forget the ones collected by a previous run on this instance.
         prefixes.clear();

         profile = extractProfile(in);
         documentLang = getDocumentLanguage(in);
         extractLinkDefinedPrefixes(in);

         String baseProfile = vSINDICE.NS;
         if (profile != null) {
             baseProfile = profile.toString();
         }

         // The JSON-LD content is written to 'out' while the scripts are visited. The returned descriptors are
         // not turned into additional triples, so the result is intentionally ignored here.
         extractJSONLDScript(in, baseProfile, extractionParameters, extractionContext, out);
     }

     /**
      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
      *
      * @param in
      *            an instance of {@link Document}.
      *
      * @return the language declared, could be <code>null</code>.
      */
     private String getDocumentLanguage(Document in) {
         String lang = DomUtils.find(in, "string(/HTML/@lang)");
         if ("".equals(lang)) {
             return null;
         }
         return lang;
     }

     /**
      * Returns the profile declared on the root element of the {@link Document}, <code>null</code> otherwise.
      *
      * @param in
      *            an instance of {@link Document}.
      *
      * @return the declared profile as {@link IRI}, could be <code>null</code>.
      */
     private IRI extractProfile(Document in) {
         String declaredProfile = DomUtils.find(in, "string(/HTML/@profile)");
         if ("".equals(declaredProfile)) {
             return null;
         }
         return SimpleValueFactory.getInstance().createIRI(declaredProfile);
     }

     /**
      * It extracts prefixes defined in the <i>LINK</i> meta tags. Every <i>LINK</i> element of the head carrying
      * both a <i>rel</i> attribute and an absolute <i>href</i> registers the prefix <i>rel</i> for that IRI.
      *
      * @param in
      *            the document to scan.
      */
     private void extractLinkDefinedPrefixes(Document in) {
         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
         for (Node linkNode : linkNodes) {
             NamedNodeMap attributes = linkNode.getAttributes();
             Node relNode = attributes.getNamedItem("rel");
             String rel = relNode == null ? null : relNode.getTextContent();
             Node hrefNode = attributes.getNamedItem("href");
             String href = hrefNode == null ? null : hrefNode.getTextContent();
             if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
                 prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
             }
         }
     }

     /**
      * Visits every <i>SCRIPT</i> element of the document. The text content of each element declared as JSON-LD is
      * passed to the {@link JSONLDExtractor}, which writes to <code>out</code>. In addition, every script carrying
      * both a <i>name</i> and a <i>content</i> attribute is collected in the returned set.
      *
      * @param in
      *            the document to scan.
      * @param baseProfile
      *            namespace prepended to script names that do not use a declared prefix.
      * @param extractionParameters
      *            parameters forwarded to the JSON-LD extractor.
      * @param extractionContext
      *            context forwarded to the JSON-LD extractor.
      * @param out
      *            result the JSON-LD extractor writes to.
      *
      * @return the descriptors of the scripts declaring both <i>name</i> and <i>content</i>, never <code>null</code>.
      *
      * @throws IOException
      *             if raised by the JSON-LD extractor.
      * @throws ExtractionException
      *             if raised by the JSON-LD extractor.
      */
     private Set<JSONLDScript> extractJSONLDScript(Document in, String baseProfile,
             ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out)
             throws IOException, ExtractionException {
         List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT");
         Set<JSONLDScript> result = new HashSet<>();
         extractor = new JSONLDExtractorFactory().createExtractor();
         for (Node scriptNode : scriptNodes) {
             NamedNodeMap attributes = scriptNode.getAttributes();
             // Only the 'type' attribute decides whether the script holds JSON-LD, and the content is
             // extracted at most once per script element.
             if (isJSONLDScript(attributes)) {
                 extractor.run(extractionParameters, extractionContext,
                         IOUtils.toInputStream(scriptNode.getTextContent(), StandardCharsets.UTF_8), out);
             }
             Node nameAttribute = attributes.getNamedItem("name");
             Node contentAttribute = attributes.getNamedItem("content");
             if (nameAttribute == null || contentAttribute == null) {
                 continue;
             }
             String name = nameAttribute.getTextContent();
             String content = contentAttribute.getTextContent();
             String xpath = DomUtils.getXPathForNode(scriptNode);
             IRI nameAsIRI = getPrefixIfExists(name);
             if (nameAsIRI == null) {
                 nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
             }
             result.add(new JSONLDScript(xpath, nameAsIRI, content));
         }
         return result;
     }

     /**
      * Tells whether the given attributes declare a JSON-LD data block, that is whether they contain a <i>type</i>
      * attribute (name compared ignoring case) whose trimmed value equals <code>application/ld+json</code> ignoring
      * case.
      *
      * @param attributes
      *            the attributes of a <i>SCRIPT</i> element.
      *
      * @return <code>true</code> if the script is declared as JSON-LD.
      */
     private static boolean isJSONLDScript(NamedNodeMap attributes) {
         for (int i = 0; i < attributes.getLength(); i++) {
             Node attribute = attributes.item(i);
             if (!"type".equalsIgnoreCase(attribute.getNodeName())) {
                 continue;
             }
             String value = attribute.getTextContent();
             if (value != null && JSONLD_MIME_TYPE.equalsIgnoreCase(value.trim())) {
                 return true;
             }
         }
         return false;
     }

     /**
      * Resolves a name of the form <code>prefix.local</code> against the prefixes declared by the document.
      *
      * @param name
      *            the name to resolve.
      *
      * @return the IRI obtained by appending the local part to the prefix IRI, or <code>null</code> if the name
      *         does not consist of exactly two dot-separated parts or its prefix is unknown.
      */
     private IRI getPrefixIfExists(String name) {
         String[] parts = name.split("\\.");
         // The length must be checked first: split may return fewer than two parts (even none).
         if (parts.length != 2) {
             return null;
         }
         IRI namespace = prefixes.get(parts[0]);
         if (namespace == null) {
             return null;
         }
         return SimpleValueFactory.getInstance().createIRI(namespace + parts[1]);
     }

     @Override
     public ExtractorDescription getDescription() {
         return EmbeddedJSONLDExtractorFactory.getDescriptionInstance();
     }

     /**
      * Descriptor of a script element, identified by the XPath of its node. Two descriptors are equal when their
      * XPaths are equal.
      */
     private static final class JSONLDScript {

         private final String xpath;

         /**
          * @param xpath
          *            XPath of the script node, used as identity.
          * @param name
          *            resolved name of the script; accepted but not retained.
          * @param content
          *            value of the content attribute; accepted but not retained.
          */
         public JSONLDScript(String xpath, IRI name, String content) {
             this.xpath = xpath;
         }

         @Override
         public boolean equals(Object o) {
             if (this == o) {
                 return true;
             }
             if (!(o instanceof JSONLDScript)) {
                 return false;
             }
             JSONLDScript other = (JSONLDScript) o;
             return xpath != null ? xpath.equals(other.xpath) : other.xpath == null;
         }

         @Override
         public int hashCode() {
             return xpath != null ? xpath.hashCode() : 0;
         }
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.apache.any23.extractor.ExtractionContext;
	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.extractor.ExtractionParameters;
	import org.apache.any23.extractor.ExtractionResult;
	import org.apache.any23.extractor.Extractor;
	import org.apache.any23.extractor.ExtractorDescription;
	import org.apache.any23.extractor.rdf.JSONLDExtractor;
	import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
	import org.apache.any23.rdf.RDFUtils;
	import org.apache.any23.vocab.SINDICE;
	import org.apache.commons.io.IOUtils;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
	import org.w3c.dom.Document;
	import org.w3c.dom.NamedNodeMap;
	import org.w3c.dom.Node;

	import java.io.IOException;
	import java.nio.charset.StandardCharsets;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;

	/**
	* This extractor represents the HTML script tags used to embed blocks of data in documents. This way, JSON-LD content
	* can be easily embedded in HTML by placing it in a script element with the type attribute set to application/ld+json
	* according the <a href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents" >JSON-LD specification</a>.
	*
	*/
	public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {

	private static final SINDICE vSINDICE = SINDICE.getInstance();

	private IRI profile;

	private Map<String, IRI> prefixes = new HashMap<>();

	private String documentLang;

	private JSONLDExtractor extractor;

	/**
	* {@inheritDoc}
	*/
	@Override
	public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
	ExtractionResult out) throws IOException, ExtractionException {
	profile = extractProfile(in);
	documentLang = getDocumentLanguage(in);
	extractLinkDefinedPrefixes(in);

	String baseProfile = vSINDICE.NS;
	if (profile != null) {
	baseProfile = profile.toString();
	}

	extractionContext.getDocumentIRI();
	Set<JSONLDScript> jsonldScripts = extractJSONLDScript(in, baseProfile, extractionParameters, extractionContext,
	out);
	for (JSONLDScript jsonldScript : jsonldScripts) {
	// String lang = documentLang;
	// if (jsonldScript.getLang() != null) {
	// lang = jsonldScript.getLang();
	// }
	// out.writeTriple(documentIRI, jsonldScript.getName(),
	// SimpleValueFactory.getInstance().createLiteral(jsonldScript.getContent(), lang));
	}
	}

	/**
	* Returns the {@link Document} language if declared, <code>null</code> otherwise.
	*
	* @param in
	* a instance of {@link Document}.
	*
	* @return the language declared, could be <code>null</code>.
	*/
	private String getDocumentLanguage(Document in) {
	String lang = DomUtils.find(in, "string(/HTML/@lang)");
	if ("".equals(lang)) {
	return null;
	}
	return lang;
	}

	private IRI extractProfile(Document in) {
	String profile = DomUtils.find(in, "string(/HTML/@profile)");
	if ("".equals(profile)) {
	return null;
	}
	return SimpleValueFactory.getInstance().createIRI(profile);
	}

	/**
	* It extracts prefixes defined in the <i>LINK</i> meta tags.
	*
	* @param in
	*/
	private void extractLinkDefinedPrefixes(Document in) {
	List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
	for (Node linkNode : linkNodes) {
	NamedNodeMap attributes = linkNode.getAttributes();
	Node relNode = attributes.getNamedItem("rel");
	String rel = relNode == null ? null : relNode.getTextContent();
	Node hrefNode = attributes.getNamedItem("href");
	String href = hrefNode == null ? null : hrefNode.getTextContent();
	if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
	prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
	}
	}
	}

	private Set<JSONLDScript> extractJSONLDScript(Document in, String baseProfile,
	ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out)
	throws IOException, ExtractionException {
	List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT");
	Set<JSONLDScript> result = new HashSet<>();
	extractor = new JSONLDExtractorFactory().createExtractor();
	for (Node jsonldNode : scriptNodes) {
	NamedNodeMap attributes = jsonldNode.getAttributes();
	for (int i = 0; i < attributes.getLength(); i++) {
	if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) {
	extractor.run(extractionParameters, extractionContext,
	IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out);
	}
	}
	Node nameAttribute = attributes.getNamedItem("name");
	Node contentAttribute = attributes.getNamedItem("content");
	if (nameAttribute == null \|\| contentAttribute == null) {
	continue;
	}
	String name = nameAttribute.getTextContent();
	String content = contentAttribute.getTextContent();
	String xpath = DomUtils.getXPathForNode(jsonldNode);
	IRI nameAsIRI = getPrefixIfExists(name);
	if (nameAsIRI == null) {
	nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
	}
	JSONLDScript jsonldScript = new JSONLDScript(xpath, nameAsIRI, content);
	result.add(jsonldScript);
	}
	return result;
	}

	private IRI getPrefixIfExists(String name) {
	String[] split = name.split("\\.");
	if (split.length == 2 && prefixes.containsKey(split[0])) {
	return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
	}
	return null;
	}

	@Override
	public ExtractorDescription getDescription() {
	return EmbeddedJSONLDExtractorFactory.getDescriptionInstance();
	}

	private static class JSONLDScript {

	private String xpath;

	public JSONLDScript(String xpath, IRI name, String content) {
	this.xpath = xpath;
	}

	@Override
	public boolean equals(Object o) {
	if (this == o) {
	return true;
	}
	if (o == null) {
	return false;
	}
	if (!(o instanceof JSONLDScript)) {
	return false;
	}

	JSONLDScript meta = (JSONLDScript) o;

	if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) {
	return false;
	}

	return true;
	}

	@Override
	public int hashCode() {
	return xpath != null ? xpath.hashCode() : 0;
	}
	}

	}