blob: 97a49bcf54241abf646cdf0e8185f048abc08ca9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.csv;
import static java.lang.Character.toUpperCase;

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.CSV;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.model.vocabulary.XSD;

import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
/**
 * This extractor produces <i>RDF</i> from a <i>CSV file</i>. It automatically detects the field <i>delimiter</i>; if
 * it is not able to, it uses the one provided in the <i>Any23</i> configuration.
 *
 * <p>
 * The first record of the input is treated as the header. For every header cell a property {@link IRI} is derived;
 * every following record becomes a row resource whose cells are written as statements using those properties.
 * Finally the number of rows and columns is attached to the document IRI.
 * </p>
 *
 * <p>
 * Note: {@link #run} stores the parser and the header IRIs of the document being processed in instance fields, so
 * overlapping {@code run} invocations on the same instance would overwrite each other's state.
 * </p>
 *
 * @see CSVReaderBuilder
 *
 * @author Davide Palmisano ( dpalmisano@gmail.com )
 */
public class CSVExtractor implements Extractor.ContentExtractor {

    /**
     * Lexical form of <code>xsd:integer</code>: an optional sign followed by one or more ASCII digits, with no limit
     * on the number of digits.
     */
    private static final Pattern XSD_INTEGER_LEXICAL = Pattern.compile("[+-]?[0-9]+");

    /**
     * Decimal or scientific notation, or the literal <code>NaN</code>. Java-only spellings accepted by
     * {@link Float#valueOf(String)} (type suffixes such as <code>1f</code> / <code>12D</code>, hexadecimal floats,
     * <code>Infinity</code>) are intentionally excluded because they are not valid <code>xsd:float</code> lexical
     * forms.
     */
    private static final Pattern XSD_FLOAT_LEXICAL = Pattern
            .compile("[+-]?(?:[0-9]+(?:\\.[0-9]*)?|\\.[0-9]+)(?:[eE][+-]?[0-9]+)?|NaN");

    /** Parser of the document currently being processed; assigned by {@link #run}. */
    private CSVParser csvParser;

    /** Property IRIs, one per header column, of the document currently being processed. */
    private IRI[] headerIRIs;

    /** Vocabulary providing the CSV-specific classes and properties. */
    private final CSV csv = CSV.getInstance();

    /**
     * {@inheritDoc}
     *
     * <p>
     * This implementation ignores the flag.
     * </p>
     */
    @Override
    public void setStopAtFirstError(boolean f) {
        // not implemented
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * Reads the header record, writes the column metadata, then emits for every remaining record: its type, its
     * cell statements, the link from the document and its zero-based position. The table size is written last. An
     * input without any record yields only the table size statements (zero rows, zero columns).
     * </p>
     */
    @Override
    public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
            ExtractionResult out) throws IOException, ExtractionException {
        final IRI documentIRI = extractionContext.getDocumentIRI();

        // build the parser
        // NOTE(review): the parser is never closed here. CSVParser is Closeable, but closing it would presumably
        // also close the caller-supplied stream - confirm who owns 'in' before adding try-with-resources.
        csvParser = CSVReaderBuilder.build(in);
        final Iterator<CSVRecord> rows = csvParser.iterator();

        // get the header and generate the IRIs for column names
        final CSVRecord header = rows.hasNext() ? rows.next() : null;
        headerIRIs = processHeader(header, documentIRI);

        // write triples to describe properties
        writeHeaderPropertiesMetadata(header, out);

        int index = 0;
        while (rows.hasNext()) {
            final CSVRecord nextLine = rows.next();
            final IRI rowSubject = RDFUtils.iri(documentIRI.stringValue(), "row/" + index);
            // add a row type
            out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
            // for each row produce its statements
            produceRowStatements(rowSubject, nextLine, out);
            // link the row to the document
            out.writeTriple(documentIRI, csv.row, rowSubject);
            // the progressive row number
            // NOTE(review): written as a plain literal (no datatype), unlike csv.columnPosition which is typed
            // xsd:integer. Left unchanged because consumers may rely on the current output.
            out.writeTriple(rowSubject, csv.rowPosition,
                    SimpleValueFactory.getInstance().createLiteral(String.valueOf(index)));
            index++;
        }

        // add some CSV metadata such as the number of rows and columns
        addTableMetadataStatements(documentIRI, out, index, headerIRIs.length);
    }

    /**
     * Checks whether a string is a valid <code>xsd:integer</code> lexical form. Unlike {@link Integer#valueOf(String)}
     * this is not limited to the 32-bit range, so large values are not misclassified as floats.
     *
     * @param number
     *            the already trimmed cell content
     *
     * @return <code>true</code> if the string is an optionally signed sequence of ASCII digits
     */
    private boolean isInteger(String number) {
        return XSD_INTEGER_LEXICAL.matcher(number).matches();
    }

    /**
     * Checks whether a string is a decimal / scientific-notation number or <code>NaN</code>, i.e. a value that is
     * both parseable as a float and a valid <code>xsd:float</code> lexical form.
     *
     * @param number
     *            the already trimmed cell content
     *
     * @return <code>true</code> if the string can be emitted as an <code>xsd:float</code> literal
     */
    private boolean isFloat(String number) {
        return XSD_FLOAT_LEXICAL.matcher(number).matches();
    }

    /**
     * It writes <i>RDF</i> statements representing properties of the header: an <code>rdfs:label</code> for every
     * column whose header text is not an absolute IRI, and the zero-based column position for every column.
     *
     * @param header
     *            the header record; only accessed when at least one header IRI exists
     * @param out
     *            the sink receiving the statements
     */
    private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
        // headerIRIs has exactly one entry per header cell, so the index is always valid for 'header'
        for (int index = 0; index < headerIRIs.length; index++) {
            final IRI singleHeader = headerIRIs[index];
            final String headerString = header.get(index);
            if (!RDFUtils.isAbsoluteIRI(headerString)) {
                out.writeTriple(singleHeader, RDFS.LABEL,
                        SimpleValueFactory.getInstance().createLiteral(headerString));
            }
            out.writeTriple(singleHeader, csv.columnPosition,
                    SimpleValueFactory.getInstance().createLiteral(String.valueOf(index), XSD.INTEGER));
        }
    }

    /**
     * It processes the first row of the file, returning an array of {@link IRI}s representing the properties for
     * each column. If a value of the header is an absolute <i>IRI</i> then it is left as is. Otherwise an IRI is
     * derived from the document IRI and the header text (see {@link #normalize}).
     *
     * @param header
     *            the header record, or <code>null</code> if the input has no records
     * @param documentIRI
     *            the IRI of the document, used as prefix for generated column IRIs
     *
     * @return an array of {@link IRI}s identifying the column names; empty if <code>header</code> is
     *         <code>null</code>
     */
    private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
        if (header == null) {
            return new IRI[0];
        }
        final IRI[] result = new IRI[header.size()];
        int index = 0;
        for (String h : header) {
            final String candidate = h.trim();
            if (RDFUtils.isAbsoluteIRI(candidate)) {
                result[index] = SimpleValueFactory.getInstance().createIRI(candidate);
            } else {
                result[index] = normalize(candidate, documentIRI);
            }
            index++;
        }
        return result;
    }

    /**
     * Builds a column IRI from a header text: the text is trimmed, lower-cased, stripped of <code>?</code> and
     * <code>&amp;</code>, split on spaces, and each token is appended to the document IRI with its first character
     * upper-cased.
     *
     * @param toBeNormalized
     *            the header text
     * @param documentIRI
     *            the IRI of the document, used as prefix
     *
     * @return the generated column IRI
     */
    private IRI normalize(String toBeNormalized, IRI documentIRI) {
        final String newToBeNormalized = toBeNormalized.trim().toLowerCase(Locale.ROOT).replace("?", "").replace("&",
                "");
        final StringBuilder result = new StringBuilder(documentIRI.toString());
        final StringTokenizer tokenizer = new StringTokenizer(newToBeNormalized, " ");
        while (tokenizer.hasMoreTokens()) {
            // tokens returned by StringTokenizer are never empty, so charAt(0) is safe
            final String current = tokenizer.nextToken();
            result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
        }
        return SimpleValueFactory.getInstance().createIRI(result.toString());
    }

    /**
     * It writes on the provided {@link ExtractionResult} the <i>RDF statements</i> representing the row
     * <i>cells</i>. If a row <i>cell</i> is an absolute <i>IRI</i> then an object property is written, a literal
     * otherwise. Empty cells are skipped, and cells beyond the last header column are ignored.
     *
     * @param rowSubject
     *            the resource representing the row
     * @param values
     *            the record holding the cells of the row
     * @param out
     *            the sink receiving the statements
     */
    private void produceRowStatements(IRI rowSubject, CSVRecord values, ExtractionResult out) {
        int index = 0;
        for (String cell : values) {
            if (index >= headerIRIs.length) {
                // there are some row cells that don't have an associated column name
                break;
            }
            if (!"".equals(cell)) {
                out.writeTriple(rowSubject, headerIRIs[index], getObjectFromCell(cell));
            }
            index++;
        }
    }

    /**
     * Converts a cell into an RDF value. The cell is trimmed; an absolute IRI becomes an IRI, anything else a
     * literal typed <code>xsd:integer</code>, <code>xsd:float</code> or <code>xsd:string</code> depending on its
     * lexical form.
     *
     * @param cell
     *            the raw cell content
     *
     * @return the value to use as statement object
     */
    private Value getObjectFromCell(String cell) {
        final String newCell = cell.trim();
        if (RDFUtils.isAbsoluteIRI(newCell)) {
            return SimpleValueFactory.getInstance().createIRI(newCell);
        }
        final IRI datatype;
        if (isInteger(newCell)) {
            datatype = XSD.INTEGER;
        } else if (isFloat(newCell)) {
            datatype = XSD.FLOAT;
        } else {
            datatype = XSD.STRING;
        }
        return SimpleValueFactory.getInstance().createLiteral(newCell, datatype);
    }

    /**
     * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i> on generic properties of the
     * <i>CSV</i> file, such as number of rows and columns.
     *
     * @param documentIRI
     *            the subject of the statements
     * @param out
     *            the sink receiving the statements
     * @param numberOfRows
     *            the number of data rows (header excluded)
     * @param numberOfColumns
     *            the number of header columns
     */
    private void addTableMetadataStatements(IRI documentIRI, ExtractionResult out, int numberOfRows,
            int numberOfColumns) {
        out.writeTriple(documentIRI, csv.numberOfRows,
                SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfRows), XSD.INTEGER));
        out.writeTriple(documentIRI, csv.numberOfColumns,
                SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfColumns), XSD.INTEGER));
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public ExtractorDescription getDescription() {
        return CSVExtractorFactory.getDescriptionInstance();
    }
}