core/src/main/java/org/apache/any23/extractor/csv/CSVExtractor.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.csv;

 import static java.lang.Character.toUpperCase;

 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.CSV;
 import org.apache.commons.csv.CSVParser;
 import org.apache.commons.csv.CSVRecord;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.eclipse.rdf4j.model.vocabulary.RDFS;
 import org.eclipse.rdf4j.model.vocabulary.XSD;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.StringTokenizer;
 import java.util.Iterator;
 import java.util.Locale;

 /**
  * This extractor produces <i>RDF</i> from a <i>CSV file</i>. It automatically detects the field <i>delimiter</i>; if
  * it is not able to, it uses the one provided in the <i>Any23</i> configuration.
  * <p>
  * The first record of the input is treated as the header and yields one property {@link IRI} per column. Every
  * following record becomes a row resource that is typed, linked to the document, numbered, and described with one
  * statement per non-empty cell.
  * </p>
  * <p>
  * NOTE(review): {@link #run} stores the parser and the header IRIs of the current extraction in instance fields, so
  * a single instance should presumably not be used by concurrent extractions.
  * </p>
  *
  * @see CSVReaderBuilder
  *
  * @author Davide Palmisano ( dpalmisano@gmail.com )
  */
 public class CSVExtractor implements Extractor.ContentExtractor {

     /** Parser built for the stream handed to the latest {@link #run} invocation. */
     private CSVParser csvParser;

     /** Property IRIs of the columns, in column order; assigned by {@link #run} before any row is processed. */
     private IRI[] headerIRIs;

     private final CSV csv = CSV.getInstance();

     /**
      * {@inheritDoc}
      * <p>
      * The flag is ignored by this extractor.
      * </p>
      */
     @Override
     public void setStopAtFirstError(boolean f) {
         // not implemented
     }

     /**
      * {@inheritDoc}
      */
     @Override
     public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
             ExtractionResult out) throws IOException, ExtractionException {
         final IRI documentIRI = extractionContext.getDocumentIRI();

         // build the parser
         // NOTE(review): the parser is not closed here; closing it would presumably also close the caller's stream.
         csvParser = CSVReaderBuilder.build(in);
         Iterator<CSVRecord> rows = csvParser.iterator();

         // get the header and generate the IRIs for column names (an empty input yields no header and no columns)
         CSVRecord header = rows.hasNext() ? rows.next() : null;
         headerIRIs = processHeader(header, documentIRI);

         // write triples to describe properties
         writeHeaderPropertiesMetadata(header, out);

         int index = 0;
         while (rows.hasNext()) {
             CSVRecord nextLine = rows.next();
             IRI rowSubject = RDFUtils.iri(documentIRI.stringValue(), "row/" + index);
             // add a row type
             out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
             // for each row produce its statements
             produceRowStatements(rowSubject, nextLine, out);
             // link the row to the document
             out.writeTriple(documentIRI, csv.row, rowSubject);
             // the progressive row number
             // NOTE(review): no explicit datatype is passed here, unlike the column position literal.
             out.writeTriple(rowSubject, csv.rowPosition,
                     SimpleValueFactory.getInstance().createLiteral(String.valueOf(index)));
             index++;
         }
         // add some CSV metadata such as the number of rows and columns
         addTableMetadataStatements(documentIRI, out, index, headerIRIs.length);
     }

     /**
      * Checks whether a string is an integer of any magnitude: an optional sign followed by decimal digits.
      * <p>
      * Parsing is not limited to the 32-bit range, so that large integral values are not mistaken for floats by
      * {@link #getObjectFromCell(String)}.
      * </p>
      *
      * @param number
      *            the text to check; {@code null} is not an integer.
      *
      * @return {@code true} if the text can be parsed as an integer, {@code false} otherwise.
      */
     private boolean isInteger(String number) {
         if (number == null) {
             return false;
         }
         try {
             new java.math.BigInteger(number);
             return true;
         } catch (NumberFormatException e) {
             return false;
         }
     }

     /**
      * Checks whether a string is a floating point number.
      * <p>
      * The check relies on {@link Float#valueOf(String)} but rejects the Java-only notations that method also
      * accepts (a trailing {@code f}/{@code F}/{@code d}/{@code D} type suffix and hexadecimal literals), so that
      * cells such as {@code 3F} or {@code 10D} are not reported as floats.
      * </p>
      * <p>
      * NOTE(review): {@code NaN} and {@code Infinity} are still accepted, as before; the latter is presumably not
      * the spelling XML Schema uses for infinity - confirm whether it should be rejected too.
      * </p>
      *
      * @param number
      *            the text to check; {@code null} is not a float.
      *
      * @return {@code true} if the text is a float in plain decimal or exponent notation, {@code false} otherwise.
      */
     private boolean isFloat(String number) {
         if (number == null) {
             return false;
         }
         // Float.valueOf ignores surrounding whitespace, so inspect the same text it would parse
         String candidate = number.trim();
         if (candidate.isEmpty()) {
             return false;
         }
         char last = candidate.charAt(candidate.length() - 1);
         boolean hasTypeSuffix = last == 'f' || last == 'F' || last == 'd' || last == 'D';
         boolean isHexadecimal = candidate.indexOf('x') >= 0 || candidate.indexOf('X') >= 0;
         if (hasTypeSuffix || isHexadecimal) {
             return false;
         }
         try {
             Float.valueOf(candidate);
             return true;
         } catch (NumberFormatException e) {
             return false;
         }
     }

     /**
      * Writes the <i>RDF</i> statements describing the column properties: a label for every header that is not an
      * absolute <i>IRI</i>, and the zero-based position of every column.
      *
      * @param header
      *            the header record; only read when there is at least one column.
      * @param out
      *            the result the statements are written to.
      */
     private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
         final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();
         for (int index = 0; index < headerIRIs.length; index++) {
             IRI singleHeader = headerIRIs[index];
             // NOTE(review): the raw header text is tested here, while processHeader tests the trimmed text.
             String headerString = header.get(index);
             if (!RDFUtils.isAbsoluteIRI(headerString)) {
                 out.writeTriple(singleHeader, RDFS.LABEL, valueFactory.createLiteral(headerString));
             }
             out.writeTriple(singleHeader, csv.columnPosition,
                     valueFactory.createLiteral(String.valueOf(index), XSD.INTEGER));
         }
     }

     /**
      * Processes the first row of the file, returning the {@link IRI}s representing the properties for each column.
      * If a trimmed header value is an absolute <i>IRI</i> it is used as is. Otherwise an <i>IRI</i> is derived from
      * the document <i>IRI</i> and the header text, see {@link #normalize(String, IRI)}.
      *
      * @param header
      *            the header record, or {@code null} when the input has no records.
      * @param documentIRI
      *            the <i>IRI</i> of the document being extracted.
      *
      * @return an array of {@link IRI}s identifying the column names; empty when there is no header.
      */
     private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
         if (header == null) {
             return new IRI[0];
         }

         IRI[] result = new IRI[header.size()];
         int index = 0;
         for (String h : header) {
             String candidate = h.trim();
             if (RDFUtils.isAbsoluteIRI(candidate)) {
                 result[index] = SimpleValueFactory.getInstance().createIRI(candidate);
             } else {
                 result[index] = normalize(candidate, documentIRI);
             }
             index++;
         }
         return result;
     }

     /**
      * Builds a property <i>IRI</i> for a header that is not itself an absolute <i>IRI</i>.
      * <p>
      * The header is trimmed and lower-cased, the characters {@code ?} and {@code &} are removed, and the
      * space-separated words are appended to the document <i>IRI</i> with their first character in upper case.
      * </p>
      *
      * @param toBeNormalized
      *            the header text.
      * @param documentIRI
      *            the <i>IRI</i> the normalized text is appended to.
      *
      * @return the resulting property {@link IRI}.
      */
     private IRI normalize(String toBeNormalized, IRI documentIRI) {
         String newToBeNormalized = toBeNormalized.trim().toLowerCase(Locale.ROOT).replace("?", "").replace("&", "");

         StringBuilder result = new StringBuilder(documentIRI.toString());

         // the tokenizer never returns empty tokens, so charAt(0) is safe
         StringTokenizer tokenizer = new StringTokenizer(newToBeNormalized, " ");
         while (tokenizer.hasMoreTokens()) {
             String current = tokenizer.nextToken();

             result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
         }

         return SimpleValueFactory.getInstance().createIRI(result.toString());
     }

     /**
      * Writes on the provided {@link ExtractionResult} the <i>RDF statements</i> representing the row <i>cells</i>.
      * If a row <i>cell</i> is an absolute <i>IRI</i> then an object property is written, a literal otherwise. Empty
      * cells produce no statement.
      *
      * @param rowSubject
      *            the subject of the statements.
      * @param values
      *            the cells of the row.
      * @param out
      *            the result the statements are written to.
      */
     private void produceRowStatements(IRI rowSubject, CSVRecord values, ExtractionResult out) {
         int index = 0;
         for (String cell : values) {
             if (index >= headerIRIs.length) {
                 // there are some row cells that don't have an associated column name
                 break;
             }
             if (!"".equals(cell)) {
                 out.writeTriple(rowSubject, headerIRIs[index], getObjectFromCell(cell));
             }
             index++;
         }
     }

     /**
      * Converts a cell into the object of a statement: an {@link IRI} when the trimmed cell is an absolute
      * <i>IRI</i>, otherwise a literal typed as integer, float or string, checked in that order.
      *
      * @param cell
      *            the cell text.
      *
      * @return the {@link Value} representing the cell.
      */
     private Value getObjectFromCell(String cell) {
         final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();
         String newCell = cell.trim();
         if (RDFUtils.isAbsoluteIRI(newCell)) {
             return valueFactory.createIRI(newCell);
         }
         IRI datatype = XSD.STRING;
         if (isInteger(newCell)) {
             datatype = XSD.INTEGER;
         } else if (isFloat(newCell)) {
             datatype = XSD.FLOAT;
         }
         return valueFactory.createLiteral(newCell, datatype);
     }

     /**
      * Writes on the provided {@link ExtractionResult} some <i>RDF Statements</i> on generic properties of the
      * <i>CSV</i> file, such as number of rows and columns.
      *
      * @param documentIRI
      *            the subject of the statements.
      * @param out
      *            the result the statements are written to.
      * @param numberOfRows
      *            the number of rows, header excluded.
      * @param numberOfColumns
      *            the number of columns found in the header.
      */
     private void addTableMetadataStatements(IRI documentIRI, ExtractionResult out, int numberOfRows,
             int numberOfColumns) {
         final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();
         out.writeTriple(documentIRI, csv.numberOfRows,
                 valueFactory.createLiteral(String.valueOf(numberOfRows), XSD.INTEGER));
         out.writeTriple(documentIRI, csv.numberOfColumns,
                 valueFactory.createLiteral(String.valueOf(numberOfColumns), XSD.INTEGER));
     }

     /**
      * {@inheritDoc}
      */
     @Override
     public ExtractorDescription getDescription() {
         return CSVExtractorFactory.getDescriptionInstance();
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.csv;

	import static java.lang.Character.toUpperCase;

	import org.apache.any23.extractor.ExtractionContext;
	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.extractor.ExtractionParameters;
	import org.apache.any23.extractor.ExtractionResult;
	import org.apache.any23.extractor.Extractor;
	import org.apache.any23.extractor.ExtractorDescription;
	import org.apache.any23.rdf.RDFUtils;
	import org.apache.any23.vocab.CSV;
	import org.apache.commons.csv.CSVParser;
	import org.apache.commons.csv.CSVRecord;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.Value;
	import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
	import org.eclipse.rdf4j.model.vocabulary.RDF;
	import org.eclipse.rdf4j.model.vocabulary.RDFS;
	import org.eclipse.rdf4j.model.vocabulary.XSD;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.StringTokenizer;
	import java.util.Iterator;
	import java.util.Locale;

	/**
	 * This extractor produces <i>RDF</i> from a <i>CSV file</i>. It automatically detects the field <i>delimiter</i>; if
	 * it is not able to, it uses the one provided in the <i>Any23</i> configuration.
	 * <p>
	 * The first record of the input is treated as the header and yields one property {@link IRI} per column. Every
	 * following record becomes a row resource that is typed, linked to the document, numbered, and described with one
	 * statement per non-empty cell.
	 * </p>
	 * <p>
	 * NOTE(review): {@link #run} stores the parser and the header IRIs of the current extraction in instance fields, so
	 * a single instance should presumably not be used by concurrent extractions.
	 * </p>
	 *
	 * @see CSVReaderBuilder
	 *
	 * @author Davide Palmisano ( dpalmisano@gmail.com )
	 */
	public class CSVExtractor implements Extractor.ContentExtractor {

		/** Parser built for the stream handed to the latest {@link #run} invocation. */
		private CSVParser csvParser;

		/** Property IRIs of the columns, in column order; assigned by {@link #run} before any row is processed. */
		private IRI[] headerIRIs;

		private final CSV csv = CSV.getInstance();

		/**
		 * {@inheritDoc}
		 * <p>
		 * The flag is ignored by this extractor.
		 * </p>
		 */
		@Override
		public void setStopAtFirstError(boolean f) {
			// not implemented
		}

		/**
		 * {@inheritDoc}
		 */
		@Override
		public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
				ExtractionResult out) throws IOException, ExtractionException {
			final IRI documentIRI = extractionContext.getDocumentIRI();

			// build the parser
			// NOTE(review): the parser is not closed here; closing it would presumably also close the caller's stream.
			csvParser = CSVReaderBuilder.build(in);
			Iterator<CSVRecord> rows = csvParser.iterator();

			// get the header and generate the IRIs for column names (an empty input yields no header and no columns)
			CSVRecord header = rows.hasNext() ? rows.next() : null;
			headerIRIs = processHeader(header, documentIRI);

			// write triples to describe properties
			writeHeaderPropertiesMetadata(header, out);

			int index = 0;
			while (rows.hasNext()) {
				CSVRecord nextLine = rows.next();
				IRI rowSubject = RDFUtils.iri(documentIRI.stringValue(), "row/" + index);
				// add a row type
				out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
				// for each row produce its statements
				produceRowStatements(rowSubject, nextLine, out);
				// link the row to the document
				out.writeTriple(documentIRI, csv.row, rowSubject);
				// the progressive row number
				// NOTE(review): no explicit datatype is passed here, unlike the column position literal.
				out.writeTriple(rowSubject, csv.rowPosition,
						SimpleValueFactory.getInstance().createLiteral(String.valueOf(index)));
				index++;
			}
			// add some CSV metadata such as the number of rows and columns
			addTableMetadataStatements(documentIRI, out, index, headerIRIs.length);
		}

		/**
		 * Checks whether a string is an integer of any magnitude: an optional sign followed by decimal digits.
		 * <p>
		 * Parsing is not limited to the 32-bit range, so that large integral values are not mistaken for floats by
		 * {@link #getObjectFromCell(String)}.
		 * </p>
		 *
		 * @param number
		 *            the text to check; {@code null} is not an integer.
		 *
		 * @return {@code true} if the text can be parsed as an integer, {@code false} otherwise.
		 */
		private boolean isInteger(String number) {
			if (number == null) {
				return false;
			}
			try {
				new java.math.BigInteger(number);
				return true;
			} catch (NumberFormatException e) {
				return false;
			}
		}

		/**
		 * Checks whether a string is a floating point number.
		 * <p>
		 * The check relies on {@link Float#valueOf(String)} but rejects the Java-only notations that method also
		 * accepts (a trailing {@code f}/{@code F}/{@code d}/{@code D} type suffix and hexadecimal literals), so that
		 * cells such as {@code 3F} or {@code 10D} are not reported as floats.
		 * </p>
		 * <p>
		 * NOTE(review): {@code NaN} and {@code Infinity} are still accepted, as before; the latter is presumably not
		 * the spelling XML Schema uses for infinity - confirm whether it should be rejected too.
		 * </p>
		 *
		 * @param number
		 *            the text to check; {@code null} is not a float.
		 *
		 * @return {@code true} if the text is a float in plain decimal or exponent notation, {@code false} otherwise.
		 */
		private boolean isFloat(String number) {
			if (number == null) {
				return false;
			}
			// Float.valueOf ignores surrounding whitespace, so inspect the same text it would parse
			String candidate = number.trim();
			if (candidate.isEmpty()) {
				return false;
			}
			char last = candidate.charAt(candidate.length() - 1);
			boolean hasTypeSuffix = last == 'f' || last == 'F' || last == 'd' || last == 'D';
			boolean isHexadecimal = candidate.indexOf('x') >= 0 || candidate.indexOf('X') >= 0;
			if (hasTypeSuffix || isHexadecimal) {
				return false;
			}
			try {
				Float.valueOf(candidate);
				return true;
			} catch (NumberFormatException e) {
				return false;
			}
		}

		/**
		 * Writes the <i>RDF</i> statements describing the column properties: a label for every header that is not an
		 * absolute <i>IRI</i>, and the zero-based position of every column.
		 *
		 * @param header
		 *            the header record; only read when there is at least one column.
		 * @param out
		 *            the result the statements are written to.
		 */
		private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
			final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();
			for (int index = 0; index < headerIRIs.length; index++) {
				IRI singleHeader = headerIRIs[index];
				// NOTE(review): the raw header text is tested here, while processHeader tests the trimmed text.
				String headerString = header.get(index);
				if (!RDFUtils.isAbsoluteIRI(headerString)) {
					out.writeTriple(singleHeader, RDFS.LABEL, valueFactory.createLiteral(headerString));
				}
				out.writeTriple(singleHeader, csv.columnPosition,
						valueFactory.createLiteral(String.valueOf(index), XSD.INTEGER));
			}
		}

		/**
		 * Processes the first row of the file, returning the {@link IRI}s representing the properties for each column.
		 * If a trimmed header value is an absolute <i>IRI</i> it is used as is. Otherwise an <i>IRI</i> is derived from
		 * the document <i>IRI</i> and the header text, see {@link #normalize(String, IRI)}.
		 *
		 * @param header
		 *            the header record, or {@code null} when the input has no records.
		 * @param documentIRI
		 *            the <i>IRI</i> of the document being extracted.
		 *
		 * @return an array of {@link IRI}s identifying the column names; empty when there is no header.
		 */
		private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
			if (header == null) {
				return new IRI[0];
			}

			IRI[] result = new IRI[header.size()];
			int index = 0;
			for (String h : header) {
				String candidate = h.trim();
				if (RDFUtils.isAbsoluteIRI(candidate)) {
					result[index] = SimpleValueFactory.getInstance().createIRI(candidate);
				} else {
					result[index] = normalize(candidate, documentIRI);
				}
				index++;
			}
			return result;
		}

		/**
		 * Builds a property <i>IRI</i> for a header that is not itself an absolute <i>IRI</i>.
		 * <p>
		 * The header is trimmed and lower-cased, the characters {@code ?} and {@code &} are removed, and the
		 * space-separated words are appended to the document <i>IRI</i> with their first character in upper case.
		 * </p>
		 *
		 * @param toBeNormalized
		 *            the header text.
		 * @param documentIRI
		 *            the <i>IRI</i> the normalized text is appended to.
		 *
		 * @return the resulting property {@link IRI}.
		 */
		private IRI normalize(String toBeNormalized, IRI documentIRI) {
			String newToBeNormalized = toBeNormalized.trim().toLowerCase(Locale.ROOT).replace("?", "").replace("&", "");

			StringBuilder result = new StringBuilder(documentIRI.toString());

			// the tokenizer never returns empty tokens, so charAt(0) is safe
			StringTokenizer tokenizer = new StringTokenizer(newToBeNormalized, " ");
			while (tokenizer.hasMoreTokens()) {
				String current = tokenizer.nextToken();

				result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
			}

			return SimpleValueFactory.getInstance().createIRI(result.toString());
		}

		/**
		 * Writes on the provided {@link ExtractionResult} the <i>RDF statements</i> representing the row <i>cells</i>.
		 * If a row <i>cell</i> is an absolute <i>IRI</i> then an object property is written, a literal otherwise. Empty
		 * cells produce no statement.
		 *
		 * @param rowSubject
		 *            the subject of the statements.
		 * @param values
		 *            the cells of the row.
		 * @param out
		 *            the result the statements are written to.
		 */
		private void produceRowStatements(IRI rowSubject, CSVRecord values, ExtractionResult out) {
			int index = 0;
			for (String cell : values) {
				if (index >= headerIRIs.length) {
					// there are some row cells that don't have an associated column name
					break;
				}
				if (!"".equals(cell)) {
					out.writeTriple(rowSubject, headerIRIs[index], getObjectFromCell(cell));
				}
				index++;
			}
		}

		/**
		 * Converts a cell into the object of a statement: an {@link IRI} when the trimmed cell is an absolute
		 * <i>IRI</i>, otherwise a literal typed as integer, float or string, checked in that order.
		 *
		 * @param cell
		 *            the cell text.
		 *
		 * @return the {@link Value} representing the cell.
		 */
		private Value getObjectFromCell(String cell) {
			final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();
			String newCell = cell.trim();
			if (RDFUtils.isAbsoluteIRI(newCell)) {
				return valueFactory.createIRI(newCell);
			}
			IRI datatype = XSD.STRING;
			if (isInteger(newCell)) {
				datatype = XSD.INTEGER;
			} else if (isFloat(newCell)) {
				datatype = XSD.FLOAT;
			}
			return valueFactory.createLiteral(newCell, datatype);
		}

		/**
		 * Writes on the provided {@link ExtractionResult} some <i>RDF Statements</i> on generic properties of the
		 * <i>CSV</i> file, such as number of rows and columns.
		 *
		 * @param documentIRI
		 *            the subject of the statements.
		 * @param out
		 *            the result the statements are written to.
		 * @param numberOfRows
		 *            the number of rows, header excluded.
		 * @param numberOfColumns
		 *            the number of columns found in the header.
		 */
		private void addTableMetadataStatements(IRI documentIRI, ExtractionResult out, int numberOfRows,
				int numberOfColumns) {
			final SimpleValueFactory valueFactory = SimpleValueFactory.getInstance();
			out.writeTriple(documentIRI, csv.numberOfRows,
					valueFactory.createLiteral(String.valueOf(numberOfRows), XSD.INTEGER));
			out.writeTriple(documentIRI, csv.numberOfColumns,
					valueFactory.createLiteral(String.valueOf(numberOfColumns), XSD.INTEGER));
		}

		/**
		 * {@inheritDoc}
		 */
		@Override
		public ExtractorDescription getDescription() {
			return CSVExtractorFactory.getDescriptionInstance();
		}
	}