// blob: b87964110f92e459d4f85db46b49b57483aa93b6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.plugin.officescraper;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.Excel;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import java.io.IOException;
import java.io.InputStream;
/**
 * Implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor} able to process
 * a <i>MS Excel 97-2007+</i> file format <i>.xls/.xlsx</i> and
 * convert the detected content to triples.
 * This extractor is based on
 * <a href="http://poi.apache.org/spreadsheet/index.html">Apache POI-HSSF and POI-XSSF Java API</a>.
 *
 * <p>For every sheet, row and supported cell (string, boolean, numeric) of the workbook a
 * resource is emitted and linked to its parent through the {@link Excel} vocabulary.</p>
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class ExcelExtractor implements Extractor.ContentExtractor {

    private static final Excel excel = Excel.getInstance();

    private boolean stopAtFirstError = false;

    public ExcelExtractor() {}

    /**
     * @return the value last passed to {@link #setStopAtFirstError(boolean)},
     *         <code>false</code> by default.
     */
    public boolean isStopAtFirstError() {
        return stopAtFirstError;
    }

    @Override
    public void setStopAtFirstError(boolean f) {
        stopAtFirstError = f;
    }

    @Override
    public ExtractorDescription getDescription() {
        return ExcelExtractorFactory.getDescriptionInstance();
    }

    /**
     * Parses the given stream as an Excel workbook and writes its content as triples.
     *
     * @param extractionParameters not used by this extractor.
     * @param context provides the IRI of the document being processed.
     * @param in stream carrying the workbook content.
     * @param er receiver of the generated triples.
     * @throws ExtractionException wrapping any failure raised while opening, reading
     *         or closing the workbook.
     */
    @Override
    public void run(
            ExtractionParameters extractionParameters,
            ExtractionContext context,
            InputStream in,
            ExtractionResult er
    ) throws IOException, ExtractionException {
        try {
            final IRI documentIRI = context.getDocumentIRI();
            // Workbook is Closeable: release the resources POI holds for it once
            // processing ends, whether it succeeded or failed.
            try (Workbook workbook = createWorkbook(documentIRI, in)) {
                processWorkbook(documentIRI, workbook, er);
            }
        } catch (Exception e) {
            throw new ExtractionException("An error occurred while extracting MS Excel content.", e);
        }
    }

    // TODO: this should be done by Tika, the extractors should be split.
    /**
     * Opens the workbook with the POI implementation selected from the document IRI suffix:
     * <code>.xlsx</code> maps to {@link XSSFWorkbook}, <code>xls</code> to {@link HSSFWorkbook}.
     * The suffix comparison ignores case.
     *
     * @param document IRI of the document, used only to detect the file format.
     * @param is stream carrying the workbook content.
     * @return the opened workbook.
     * @throws IOException if POI fails reading the stream.
     * @throws IllegalArgumentException if the IRI has none of the supported suffixes.
     */
    private Workbook createWorkbook(IRI document, InputStream is) throws IOException {
        final String documentIRI = document.toString();
        if (endsWithIgnoreCase(documentIRI, ".xlsx")) {
            return new XSSFWorkbook(is);
        }
        // The suffix is intentionally kept without the leading dot so that every IRI
        // accepted by previous versions is still accepted.
        if (endsWithIgnoreCase(documentIRI, "xls")) {
            return new HSSFWorkbook(is);
        }
        throw new IllegalArgumentException("Unsupported extension for resource [" + documentIRI + "]");
    }

    /**
     * Case-insensitive variant of {@link String#endsWith(String)}.
     */
    private static boolean endsWithIgnoreCase(String value, String suffix) {
        final int offset = value.length() - suffix.length();
        return offset >= 0 && value.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /**
     * Walks all sheets, rows and cells of the workbook emitting the containment triples
     * and the metadata of each visited element.
     */
    private void processWorkbook(IRI documentIRI, Workbook wb, ExtractionResult er) {
        final int sheetCount = wb.getNumberOfSheets();
        for (int sheetIndex = 0; sheetIndex < sheetCount; sheetIndex++) {
            final Sheet sheet = wb.getSheetAt(sheetIndex);
            final IRI sheetIRI = getSheetIRI(documentIRI, sheet);
            er.writeTriple(documentIRI, excel.containsSheet, sheetIRI);
            er.writeTriple(sheetIRI, RDF.TYPE, excel.sheet);
            writeSheetMetadata(sheetIRI, sheet, er);
            for (Row row : sheet) {
                final IRI rowIRI = getRowIRI(sheetIRI, row);
                er.writeTriple(sheetIRI, excel.containsRow, rowIRI);
                er.writeTriple(rowIRI, RDF.TYPE, excel.row);
                writeRowMetadata(rowIRI, row, er);
                for (Cell cell : row) {
                    writeCell(rowIRI, cell, er);
                }
            }
        }
    }

    /**
     * Emits the name, first row number and last row number of the given sheet.
     */
    private void writeSheetMetadata(IRI sheetIRI, Sheet sheet, ExtractionResult er) {
        final String sheetName = sheet.getSheetName();
        final int firstRowNum = sheet.getFirstRowNum();
        final int lastRowNum = sheet.getLastRowNum();
        er.writeTriple(sheetIRI, excel.sheetName, RDFUtils.literal(sheetName));
        er.writeTriple(sheetIRI, excel.firstRow, RDFUtils.literal(firstRowNum));
        er.writeTriple(sheetIRI, excel.lastRow, RDFUtils.literal(lastRowNum));
    }

    /**
     * Emits the first and last cell numbers reported by POI for the given row.
     */
    private void writeRowMetadata(IRI rowIRI, Row row, ExtractionResult er) {
        final int firstCellNum = row.getFirstCellNum();
        final int lastCellNum = row.getLastCellNum();
        er.writeTriple(rowIRI, excel.firstCell, RDFUtils.literal(firstCellNum));
        er.writeTriple(rowIRI, excel.lastCell, RDFUtils.literal(lastCellNum));
    }

    /**
     * Emits the triples describing a single cell. Cells whose type has no mapping in
     * {@link #cellTypeToType(CellType)} are silently skipped.
     */
    private void writeCell(IRI rowIRI, Cell cell, ExtractionResult er) {
        final CellType type = cell.getCellType();
        final IRI cellType = cellTypeToType(type);
        if (cellType == null) {
            return; // Skip unsupported cells.
        }
        final IRI cellIRI = getCellIRI(rowIRI, cell);
        er.writeTriple(rowIRI, excel.containsCell, cellIRI);
        er.writeTriple(cellIRI, RDF.TYPE, excel.cell);
        er.writeTriple(
                cellIRI,
                excel.cellValue,
                RDFUtils.literal(cellValueAsString(cell, type), cellType)
        );
    }

    /**
     * Reads the cell content with the accessor matching its type and renders it as text.
     * Per the POI Javadoc, {@link Cell#getStringCellValue()} throws on numeric cells, and the
     * HSSF/XSSF implementations reject boolean cells too, so it must not be used for those.
     *
     * @param cell the cell to read.
     * @param type the type of the cell, as returned by {@link Cell#getCellType()}.
     * @return the textual form of the cell value.
     */
    private static String cellValueAsString(Cell cell, CellType type) {
        switch (type) {
            case BOOLEAN:
                return String.valueOf(cell.getBooleanCellValue());
            case NUMERIC:
                return String.valueOf(cell.getNumericCellValue());
            default:
                return cell.getStringCellValue();
        }
    }

    /**
     * Builds the sheet IRI as <code>&lt;document&gt;/sheet/&lt;sheet name&gt;</code>.
     * NOTE(review): the sheet name is appended verbatim, no escaping is applied here.
     */
    private IRI getSheetIRI(IRI documentIRI, Sheet sheet) {
        return RDFUtils.iri(documentIRI.toString() + "/sheet/" + sheet.getSheetName());
    }

    /**
     * Builds the row IRI as <code>&lt;sheet&gt;/&lt;row number&gt;</code>.
     */
    private IRI getRowIRI(IRI sheetIRI, Row row) {
        return RDFUtils.iri(sheetIRI.toString() + "/" + row.getRowNum());
    }

    /**
     * Builds the cell IRI as <code>&lt;row&gt;/&lt;column index&gt;/</code>.
     */
    private IRI getCellIRI(IRI rowIRI, Cell cell) {
        // Plain concatenation instead of String.format("%d"): the latter localizes
        // digits according to the default locale, which must never leak into an IRI.
        return RDFUtils.iri(rowIRI + "/" + cell.getColumnIndex() + "/");
    }

    /**
     * Maps a POI cell type to the IRI used as datatype of the cell value literal.
     *
     * @param cellType the POI cell type, may be <code>null</code>.
     * @return the IRI made of the {@link Excel} namespace followed by <code>string</code>,
     *         <code>boolean</code> or <code>numeric</code>; <code>null</code> for a
     *         <code>null</code> input or any other cell type.
     */
    private IRI cellTypeToType(CellType cellType) {
        if (cellType == null) {
            return null;
        }
        final String postfix;
        switch (cellType) {
            case STRING:
                postfix = "string";
                break;
            case BOOLEAN:
                postfix = "boolean";
                break;
            case NUMERIC:
                postfix = "numeric";
                break;
            default:
                return null;
        }
        return RDFUtils.iri(excel.getNamespace().toString() + postfix);
    }

}