blob: c71d8caebaea779787c898fd4423bf00f627e174 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.metamodel.xml;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.metamodel.MetaModelException;
import org.apache.metamodel.MetaModelHelper;
import org.apache.metamodel.QueryPostprocessDataContext;
import org.apache.metamodel.data.CachingDataSetHeader;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.data.DataSetHeader;
import org.apache.metamodel.data.DefaultRow;
import org.apache.metamodel.data.InMemoryDataSet;
import org.apache.metamodel.data.Row;
import org.apache.metamodel.query.FromItem;
import org.apache.metamodel.query.JoinType;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.query.SelectItem;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.ColumnType;
import org.apache.metamodel.schema.MutableColumn;
import org.apache.metamodel.schema.MutableRelationship;
import org.apache.metamodel.schema.MutableSchema;
import org.apache.metamodel.schema.MutableTable;
import org.apache.metamodel.schema.Relationship;
import org.apache.metamodel.schema.Schema;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.schema.TableType;
import org.apache.metamodel.util.FileResource;
import org.apache.metamodel.util.ImmutableRef;
import org.apache.metamodel.util.NumberComparator;
import org.apache.metamodel.util.Ref;
import org.apache.metamodel.util.Resource;
import org.apache.metamodel.util.UrlResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
/**
* A DataContext strategy that reads XML content and maps it to a table-based
* model similar to the rest of MetaModel. Tables are created by examining the
* data in the XML file, NOT by reading XML Schemas (xsd/dtd's). This enables
* compatibility with ALL xml formats but also raises a risk that two XML files
* with the same format won't necessarily yield the same table model if some
* optional attributes or tags are omitted in one of the files.
*
* The parsing method applied in this datacontext is DOM based, which means that
* upon parsing (only a single point in time), the whole file will be read
* and its tree structure kept in memory. Therefore this DataContext is NOT
* appropriate for large XML files (10's, 100's or 1000's of megabytes).
*
* @see XmlSaxDataContext
*/
public class XmlDomDataContext extends QueryPostprocessDataContext {
/** Logger used for diagnostics while the table model is built and flattened. */
private static final Logger logger = LoggerFactory.getLogger(XmlDomDataContext.class);

/** Native type of the surrogate id column that is generated for every table. */
public static final String NATIVE_TYPE_PRIMARY_KEY = "Auto-generated primary key";

/** Native type of the generated column that points at the parent table's id. */
public static final String NATIVE_TYPE_FOREIGN_KEY = "Auto-generated foreign key";

/** Native type of columns created from XML attributes. */
public static final String NATIVE_TYPE_ATTRIBUTE = "XML Attribute";

/** Native type of the column that holds an element's text content. */
public static final String NATIVE_TYPE_TEXT = "XML Text";

/**
 * Suffix appended to the element name when a text content column is first
 * created; it is stripped again once the whole document has been read.
 */
private static final String TEXT_CONTENT_TEMP_SUFFIX = "_metamodel_text_content";

/** Supplies the XML input for lazy parsing; null when built from a parsed Document. */
private final Ref<InputSource> _inputSourceRef;

/** Row data (one Object[] per row) keyed by table name. */
private final Map<String, List<Object[]>> _tableData = new HashMap<String, List<Object[]>>();

/** Name of the single schema exposed by this data context. */
private final String _schemaName;

/** The schema built from the document; null until loaded for lazily parsed inputs. */
private MutableSchema _schema;

/** Whether autoFlattenTables() is run after the tables have been built. */
private boolean _autoFlattenTables;
/**
 * Creates an XML DataContext strategy based on an already parsed Document.
 * The table model is built immediately from the document. Since no input
 * source is kept, there is nothing for {@link #reloadSchema()} to re-read.
 *
 * @param schemaName
 *            the name to give the schema
 * @param document
 *            the parsed DOM document to build tables from
 * @param autoFlattenTables
 *            whether {@link #autoFlattenTables()} is applied once the
 *            tables have been built
 */
public XmlDomDataContext(String schemaName, Document document, boolean autoFlattenTables) {
    // An in-memory document has no input source to (re-)read from
    _inputSourceRef = null;
    _schemaName = schemaName;
    _autoFlattenTables = autoFlattenTables;
    // The schema must exist before loadSchema(Document) populates it
    _schema = new MutableSchema(schemaName);
    loadSchema(document);
}
/**
 * Creates an XML DataContext strategy based on a resource. The resource is
 * not read by this constructor; reading is deferred until the schema is
 * first loaded.
 *
 * @param resource
 *            the resource to parse; its name becomes the schema name
 * @param autoFlattenTables
 *            a parameter indicating whether or not tags with only text
 *            content or a single attribute should be flattened with its
 *            parent table
 *
 * @throws IllegalArgumentException
 *             declared for callers; note that this constructor itself
 *             performs no check that the resource exists
 */
public XmlDomDataContext(Resource resource, boolean autoFlattenTables) throws IllegalArgumentException {
    _autoFlattenTables = autoFlattenTables;
    _schemaName = resource.getName();
    _inputSourceRef = createInputSourceRef(resource);
}
/**
 * Creates an XML DataContext strategy based on a file, by wrapping the file
 * in a FileResource and delegating to the resource-based constructor.
 *
 * @param file
 *            the file to parse
 * @param autoFlattenTables
 *            whether autoFlattenTables() is applied once the tables have
 *            been built
 */
public XmlDomDataContext(File file, boolean autoFlattenTables) {
this(new FileResource(file), autoFlattenTables);
}
/**
 * Creates an XML DataContext strategy based on a SAX InputSource. The
 * input source is wrapped in an ImmutableRef and is not read until the
 * schema is first loaded.
 *
 * @param inputSource
 *            the XML input to parse
 * @param schemaName
 *            the name to give the schema
 * @param autoFlattenTables
 *            whether {@link #autoFlattenTables()} is applied once the
 *            tables have been built
 */
public XmlDomDataContext(InputSource inputSource, String schemaName, boolean autoFlattenTables) {
    _autoFlattenTables = autoFlattenTables;
    _schemaName = schemaName;
    _inputSourceRef = new ImmutableRef<InputSource>(inputSource);
}
/**
 * Creates an XML DataContext strategy based on a URL, by wrapping the URL
 * in a UrlResource and delegating to the resource-based constructor.
 *
 * @param url
 *            the location of the XML content to parse
 * @param autoFlattenTables
 *            whether autoFlattenTables() is applied once the tables have
 *            been built
 * @throws IllegalArgumentException
 *             declared for callers; no validation is performed in this
 *             constructor itself
 */
public XmlDomDataContext(URL url, boolean autoFlattenTables) throws IllegalArgumentException {
this(new UrlResource(url), autoFlattenTables);
}
/**
 * Builds a reference that opens the resource anew and wraps the stream in a
 * fresh InputSource every time it is dereferenced.
 *
 * @param resource
 *            the resource to read XML from
 * @return a reference producing a new InputSource on each call to get()
 */
private static Ref<InputSource> createInputSourceRef(final Resource resource) {
    final Ref<InputSource> lazyRef = new Ref<InputSource>() {
        @Override
        public InputSource get() {
            return new InputSource(resource.read());
        }
    };
    return lazyRef;
}
/**
* Creates an XML DataContext strategy based on a file, with automatic
* flattening of tables enabled.
*
* @param file
* the file to parse
*/
public XmlDomDataContext(File file) {
this(file, true);
}
/**
 * @return whether tables are automatically flattened after the schema has
 *         been built
 */
public boolean isAutoFlattenTables() {
    return this._autoFlattenTables;
}
/**
 * Sets whether tables are automatically flattened. The flag is only
 * consulted while the schema is being built, so changing it does not alter
 * a schema that has already been loaded.
 *
 * @param autoFlattenTables
 *            true to flatten automatically when the schema is built
 */
public void setAutoFlattenTables(boolean autoFlattenTables) {
    this._autoFlattenTables = autoFlattenTables;
}
/**
 * Produces an in-memory data set holding the requested columns of a table.
 *
 * @param table
 *            the table to read
 * @param columns
 *            the columns to include, in result order
 * @param maxRows
 *            the maximum number of rows to return; 0 yields no rows and a
 *            negative value imposes no limit
 * @return a data set over the selected values
 * @throws IllegalStateException
 *             if no row data is registered under the table's name
 */
@Override
public DataSet materializeMainSchemaTable(Table table, Column[] columns, int maxRows) {
    loadSchema();

    final List<Object[]> sourceRows = _tableData.get(table.getName());
    if (sourceRows == null) {
        throw new IllegalStateException("No such table name: '" + table.getName() + "'. Valid table names are: "
                + _tableData.keySet());
    }

    final DataSetHeader header = new CachingDataSetHeader(MetaModelHelper.createSelectItems(columns));
    final List<Row> rows = new ArrayList<Row>();

    int remaining = maxRows;
    for (final Object[] sourceRow : sourceRows) {
        if (remaining == 0) {
            break;
        }
        remaining--;

        final Object[] projected = new Object[columns.length];
        for (int target = 0; target < projected.length; target++) {
            final int sourceIndex = columns[target].getColumnNumber();
            // Rows created before a column was discovered are shorter than
            // the column's number; such cells are reported as null
            projected[target] = sourceIndex < sourceRow.length ? sourceRow[sourceIndex] : null;
        }
        rows.add(new DefaultRow(header, projected));
    }

    return new InMemoryDataSet(header, rows);
}
/**
 * @return the schema name given at construction time
 */
@Override
protected String getMainSchemaName() throws MetaModelException {
    return this._schemaName;
}
/**
 * Returns the schema, loading it first if that has not happened yet.
 *
 * @return the schema built from the XML content
 */
@Override
protected Schema getMainSchema() throws MetaModelException {
    this.loadSchema();
    final Schema loaded = _schema;
    return loaded;
}
/**
 * Forces a fresh load of the schema, even though it has already been loaded.
 * Row data from the previous load is discarded before the input is parsed
 * again.
 *
 * @return this data context
 * @throws IllegalStateException
 *             if this data context was created from an already parsed
 *             Document, in which case there is no input source to re-read
 *             (the currently loaded schema is left untouched)
 */
public XmlDomDataContext reloadSchema() {
    if (_inputSourceRef == null) {
        // Fail before discarding anything: without an input source the
        // schema could never be rebuilt
        throw new IllegalStateException("Cannot reload schema: this data context was created from an "
                + "already parsed Document and has no input source to re-read");
    }
    _schema = null;
    // Drop rows of the previous load so tables that no longer exist do not linger
    _tableData.clear();
    loadSchema();
    return this;
}
/**
 * Loads the schema if it hasn't been loaded before, by parsing the XML input
 * into a DOM document and building tables from it.
 *
 * If loading fails, the partially built schema and row data are discarded so
 * that a later call attempts the load again instead of exposing an empty
 * schema. The streams of the input source are closed once parsing has
 * finished, whether or not it succeeded.
 *
 * @return this data context
 * @throws IllegalStateException
 *             if no schema is loaded and there is no input source to read
 * @throws MetaModelException
 *             if the XML content cannot be parsed or processed
 */
public XmlDomDataContext loadSchema() {
    if (_schema != null) {
        return this;
    }
    if (_inputSourceRef == null) {
        throw new IllegalStateException("Cannot load schema: this data context was created from an "
                + "already parsed Document and has no input source to read");
    }

    // Obtained before any state is touched, so a failure to open the input
    // leaves the context unloaded
    final InputSource inputSource = _inputSourceRef.get();

    // Assigned before building, because loadSchema(Document) populates
    // _schema in place
    _schema = new MutableSchema(_schemaName);
    boolean loaded = false;
    try {
        // NOTE(review): the parser is created with default settings, so DTDs
        // and external entities are processed. Consider restricting this if
        // untrusted XML can reach this class.
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setIgnoringComments(true);
        DocumentBuilder db = dbf.newDocumentBuilder();
        Document document = db.parse(inputSource);
        loadSchema(document);
        loaded = true;
    } catch (Exception e) {
        throw new MetaModelException("Error parsing XML file: " + e.getMessage(), e);
    } finally {
        if (!loaded) {
            // Do not leave a half-built schema behind
            _schema = null;
            _tableData.clear();
        }
        closeInputSource(inputSource);
    }
    return this;
}

/**
 * Closes the byte and character streams of an input source, if present.
 */
private static void closeInputSource(InputSource inputSource) {
    if (inputSource == null) {
        return;
    }
    closeQuietly(inputSource.getByteStream());
    closeQuietly(inputSource.getCharacterStream());
}

/**
 * Closes a stream, logging (rather than propagating) any failure to do so.
 */
private static void closeQuietly(java.io.Closeable closeable) {
    if (closeable == null) {
        return;
    }
    try {
        closeable.close();
    } catch (java.io.IOException e) {
        logger.debug("Failed to close XML input stream", e);
    }
}
/**
 * Builds tables and row data from the document and then tidies up the
 * result: tables without row data are removed, surrogate id columns are
 * renamed to "id" where that name is free, and text content columns are
 * either removed (when never populated) or renamed to the element name
 * (when that name is free). Finally tables are flattened if enabled.
 *
 * @param document
 *            the parsed XML document
 */
private void loadSchema(Document document) {
    loadTables(document.getDocumentElement(), "", null, 0);

    for (Table candidate : _schema.getTables()) {
        final String name = candidate.getName();
        final List<Object[]> rows = _tableData.get(name);

        if (rows == null) {
            // Typically the root node or pure XML structure
            logger.info("Remove table (no data in it): {}", name);
            _schema.removeTable(candidate);
            continue;
        }

        final MutableTable mutableTable = (MutableTable) candidate;

        // Give the surrogate id column a reasonable name (preferably "id")
        final MutableColumn surrogateId = getIdColumn(mutableTable);
        final MutableColumn existingId = (MutableColumn) candidate.getColumnByName("id");
        if (existingId == null) {
            surrogateId.setName("id");
        }

        // Determine whether any row carries text content
        final MutableColumn textColumn = (MutableColumn) getTextContentColumn(mutableTable, null);
        final int textIndex = textColumn.getColumnNumber();
        boolean populated = false;
        for (int rowIndex = 0; rowIndex < rows.size() && !populated; rowIndex++) {
            populated = rows.get(rowIndex)[textIndex] != null;
        }

        if (populated) {
            // Strip the temporary suffix if the element name is not taken
            final String temporaryName = textColumn.getName();
            final int preferredLength = temporaryName.length() - TEXT_CONTENT_TEMP_SUFFIX.length();
            final String preferredName = temporaryName.substring(0, preferredLength);
            final MutableColumn clash = (MutableColumn) candidate.getColumnByName(preferredName);
            if (clash == null) {
                textColumn.setName(preferredName);
            }
        } else {
            mutableTable.removeColumn(textColumn);
        }
    }

    if (_autoFlattenTables) {
        autoFlattenTables();
    }
}
/**
 * Recursively maps an element and its descendants to tables and rows.
 *
 * An element gets a row in a table (named by the prefix plus its node name)
 * when it has attributes, non-blank text content, or same-named siblings.
 * The table and its columns are created on first encounter and extended
 * with new attribute columns as later elements introduce them.
 *
 * @param element
 *            the element to map
 * @param tablePrefix
 *            the prefix put in front of the element's node name to form the
 *            table name
 * @param parentKeyColumn
 *            the id column that child rows should reference, or null if
 *            there is none
 * @param parentKey
 *            the id value to store in the foreign key column of the row
 *            created for this element
 */
private void loadTables(Element element, String tablePrefix, Column parentKeyColumn, int parentKey) {
Attr[] attributes = getAttributes(element);
String textContent = getTextContent(element);
String tableName = tablePrefix + element.getNodeName();
if (attributes.length > 0 || textContent != null || hasSiblings(element)) {
// We need to represent this type of node with a table
MutableTable table = (MutableTable) _schema.getTableByName(tableName);
Column idColumn;
MutableColumn foreignKeyColumn;
List<Object[]> tableRows;
if (table == null) {
// First element of this kind: create the table, its id column and,
// when a parent key column was handed down, a foreign key column
// plus the relationship to the parent
logger.info("Creating table: {}", tableName);
table = new MutableTable(tableName, TableType.TABLE, _schema);
_schema.addTable(table);
idColumn = getIdColumn(table);
tableRows = new ArrayList<Object[]>();
_tableData.put(tableName, tableRows);
if (parentKeyColumn != null) {
Table parentTable = parentKeyColumn.getTable();
foreignKeyColumn = new MutableColumn(parentTable.getName() + "_id", parentKeyColumn.getType(), table,
table.getColumnCount(), false);
foreignKeyColumn.setNativeType(NATIVE_TYPE_FOREIGN_KEY);
table.addColumn(foreignKeyColumn);
MutableRelationship.createRelationship(new Column[] { parentKeyColumn }, new Column[] { foreignKeyColumn });
} else {
foreignKeyColumn = null;
}
} else {
// Table already exists: reuse its id column, rows and (if there is
// exactly one) its foreign key column
idColumn = getIdColumn(table);
tableRows = _tableData.get(tableName);
Column[] foreignKeys = table.getForeignKeys();
if (foreignKeys.length == 1) {
foreignKeyColumn = (MutableColumn) foreignKeys[0];
} else {
foreignKeyColumn = null;
}
}
// Looks up the text content column, creating it if the table has none yet
Column textContentColumn = getTextContentColumn(table, element.getNodeName());
// Collect attribute values first, creating columns for attributes not
// seen before, so the row can be sized after all columns exist
Map<Column, String> columnValues = new HashMap<Column, String>();
for (Attr attr : attributes) {
String name = attr.getName();
MutableColumn column = (MutableColumn) table.getColumnByName(name);
if (column == null) {
logger.info("Creating column: {}.{}", tableName, name);
column = new MutableColumn(name, ColumnType.STRING, table, table.getColumnCount(), true);
column.setNativeType(NATIVE_TYPE_ATTRIBUTE);
table.addColumn(column);
}
columnValues.put(column, attr.getValue());
}
// Create a row (sized to the columns known at this point; rows created
// earlier may therefore be shorter than rows created later)
Object[] rowData = new Object[table.getColumnCount()];
// Iterate id column (ids are 1-based and follow insertion order)
int id = tableRows.size() + 1;
rowData[idColumn.getColumnNumber()] = id;
if (foreignKeyColumn != null) {
rowData[foreignKeyColumn.getColumnNumber()] = parentKey;
}
// Add value for text content (if available)
if (textContent != null) {
rowData[textContentColumn.getColumnNumber()] = textContent;
}
// Add values for attributes
for (Entry<Column, String> entry : columnValues.entrySet()) {
rowData[entry.getKey().getColumnNumber()] = entry.getValue();
}
if (logger.isDebugEnabled()) {
logger.debug("Adding data [{}] to table: {}", Arrays.toString(rowData), tableName);
}
if (!isRootElement(element)) {
// Set the parent key column to this tables id column so
// child tables can create relationship to it
parentKey = id;
parentKeyColumn = idColumn;
}
tableRows.add(rowData);
}
// The root element's name is not carried into the names of its
// descendants' tables
if (!isRootElement(element)) {
tablePrefix = tableName + "_";
}
Element[] childElements = getChildElements(element);
for (int i = 0; i < childElements.length; i++) {
loadTables(childElements[i], tablePrefix, parentKeyColumn, parentKey);
}
}
/**
 * Finds the column holding text content in a table, optionally creating it.
 *
 * @param table
 *            the table to search
 * @param preferredColumnName
 *            the base name for a newly created column, or null to only
 *            look up an existing column
 * @return the text content column, or null if there is none and no
 *         preferred name was given
 */
private Column getTextContentColumn(MutableTable table, String preferredColumnName) {
    for (Column candidate : table.getColumns()) {
        if (NATIVE_TYPE_TEXT.equals(candidate.getNativeType())) {
            return (MutableColumn) candidate;
        }
    }

    if (preferredColumnName == null) {
        return null;
    }

    logger.info("Creating text content column for table: " + table.getName());
    // The temporary suffix keeps the name from clashing with attribute
    // columns until the whole document has been read
    final MutableColumn created = new MutableColumn(preferredColumnName + TEXT_CONTENT_TEMP_SUFFIX,
            ColumnType.STRING, table, table.getColumnCount(), true);
    created.setNativeType(NATIVE_TYPE_TEXT);
    table.addColumn(created);
    return created;
}
/**
 * Finds the surrogate id column of a table, creating it if the table does
 * not have one yet.
 *
 * @param table
 *            the table to search or extend
 * @return the (possibly newly created) id column
 */
private MutableColumn getIdColumn(MutableTable table) {
    for (Column existing : table.getColumns()) {
        if (NATIVE_TYPE_PRIMARY_KEY.equals(existing.getNativeType())) {
            return (MutableColumn) existing;
        }
    }

    final String owner = table.getName();
    logger.info("Creating id column for table: " + owner);
    final MutableColumn surrogate = new MutableColumn(owner + "_metamodel_surrogate_id", ColumnType.INTEGER, table,
            table.getColumnCount(), false);
    surrogate.setNativeType(NATIVE_TYPE_PRIMARY_KEY);
    surrogate.setIndexed(true);
    table.addColumn(surrogate);
    return surrogate;
}
/**
 * Gets the trimmed text of the first text node directly below an element.
 * Only the first text node is considered; text that follows a child element
 * is ignored when an earlier text node exists.
 *
 * @param element
 *            the element to inspect
 * @return the trimmed text, or null if the element has no text node or the
 *         text is blank
 */
public static String getTextContent(Element element) {
    final NodeList children = element.getChildNodes();
    for (int idx = 0; idx < children.getLength(); idx++) {
        final Node child = children.item(idx);
        if (!(child instanceof Text)) {
            continue;
        }
        // The first text node decides the outcome
        final String whole = ((Text) child).getWholeText();
        if (whole == null) {
            return null;
        }
        final String trimmed = whole.trim();
        return trimmed.length() == 0 ? null : trimmed;
    }
    return null;
}
/**
 * Gets the attributes of an element as an array.
 *
 * @param element
 *            the element to inspect
 * @return the element's attributes, in the order reported by its attribute
 *         map; empty if the element has none
 */
public static Attr[] getAttributes(Element element) {
    final NamedNodeMap attributeMap = element.getAttributes();
    final Attr[] attrs = new Attr[attributeMap.getLength()];
    for (int pos = 0; pos < attrs.length; pos++) {
        attrs[pos] = (Attr) attributeMap.item(pos);
    }
    return attrs;
}
/**
 * Determines whether an element has at least one sibling element with the
 * same node name.
 *
 * @param element
 *            the element to inspect
 * @return true if a same-named sibling exists; always false for the root
 *         element
 */
public static boolean hasSiblings(Element element) {
    // Don't look for siblings when we are at the root element
    if (isRootElement(element)) {
        return false;
    }

    final String nodeName = element.getNodeName();
    for (Element candidate : getChildElements((Element) element.getParentNode())) {
        if (candidate != element && nodeName.equals(candidate.getNodeName())) {
            return true;
        }
    }
    return false;
}
/**
 * Gets the direct child elements of an element, skipping text, comments and
 * any other non-element nodes.
 *
 * @param element
 *            the parent element
 * @return the child elements in document order; empty if there are none
 */
public static Element[] getChildElements(Element element) {
    final List<Element> found = new ArrayList<Element>();
    for (Node cursor = element.getFirstChild(); cursor != null; cursor = cursor.getNextSibling()) {
        if (cursor instanceof Element) {
            found.add((Element) cursor);
        }
    }
    final Element[] children = new Element[found.size()];
    return found.toArray(children);
}
/**
 * Determines whether an element is a root element, i.e. its parent node is
 * not itself an element.
 *
 * @param element
 *            the element to inspect
 * @return true if the element has no element as its parent
 */
public static boolean isRootElement(Element element) {
    final Node parent = element.getParentNode();
    if (parent instanceof Element) {
        return false;
    }
    return true;
}
/**
 * Flattens the foreign table of a relationship into its primary table: the
 * data carrying columns of the foreign table are appended to the primary
 * table, the primary table's rows are replaced by a left join of the two
 * tables, and the foreign table and the relationship are removed.
 *
 * @param relationship
 *            the relationship whose foreign table is merged into its
 *            primary table
 * @return this data context
 * @throws UnsupportedOperationException
 *             if the foreign table acts as primary table in other
 *             relationships
 */
public XmlDomDataContext flattenTables(Relationship relationship) {
    MutableTable primaryTable = (MutableTable) relationship.getPrimaryTable();
    MutableTable foreignTable = (MutableTable) relationship.getForeignTable();

    // Check that foreignTable is not primary table in other relationships
    // (if so we can't flatten as that would require id-rewriting of those
    // foreign tables as well)
    Relationship[] dependentRelationships = foreignTable.getPrimaryKeyRelationships();
    if (dependentRelationships.length != 0) {
        String[] dependentNames = new String[dependentRelationships.length];
        for (int i = 0; i < dependentRelationships.length; i++) {
            dependentNames[i] = dependentRelationships[i].getForeignTable().getName();
        }
        throw new UnsupportedOperationException("Cannot flatten foreign table '" + foreignTable.getName()
                + "' as it acts as primary table for tables: " + Arrays.toString(dependentNames));
    }

    List<Column> primaryColumns = new ArrayList<Column>(Arrays.asList(primaryTable.getColumns()));
    List<Column> foreignColumns = new ArrayList<Column>(Arrays.asList(foreignTable.getColumns()));

    String primaryTableName = primaryTable.getName();
    String foreignTableName = foreignTable.getName();

    // Remove the surrogate id
    foreignColumns.remove(getIdColumn(foreignTable));

    // Remove the foreign keys
    for (Column foreignKey : foreignTable.getForeignKeys()) {
        foreignColumns.remove(foreignKey);
    }

    Query q = new Query();
    q.select(primaryColumns.toArray(new Column[primaryColumns.size()]));
    q.select(foreignColumns.toArray(new Column[foreignColumns.size()]));
    q.from(new FromItem(JoinType.LEFT, relationship));
    if (logger.isDebugEnabled()) {
        logger.debug("Setting table data for '{}' to query result: {}", primaryTableName, q.toString());
    }
    // Must be executed before any column number changes, since the existing
    // rows are read using the current numbers
    List<Object[]> tableRows = executeQuery(q).toObjectArrays();

    // The new rows are laid out in select order. Column numbers are used as
    // row indexes when reading table data, so align them with that layout.
    // This matters when a column (e.g. an unpopulated text content column)
    // was removed earlier and left a gap in the numbering; without gaps the
    // numbers are unchanged.
    int columnNumber = 0;
    for (Column primaryColumn : primaryColumns) {
        ((MutableColumn) primaryColumn).setColumnNumber(columnNumber);
        columnNumber++;
    }
    for (Column foreignColumn : foreignColumns) {
        MutableColumn newPrimaryColumn = new MutableColumn(foreignColumn.getName(), foreignColumn.getType(),
                primaryTable, columnNumber, foreignColumn.isNullable());
        newPrimaryColumn.setIndexed(foreignColumn.isIndexed());
        newPrimaryColumn.setNativeType(foreignColumn.getNativeType());
        primaryTable.addColumn(newPrimaryColumn);
        columnNumber++;
    }
    _tableData.put(primaryTableName, tableRows);

    MutableSchema mutableSchema = (MutableSchema) foreignTable.getSchema();
    mutableSchema.removeTable(foreignTable);
    _tableData.remove(foreignTableName);
    ((MutableRelationship) relationship).remove();

    if (logger.isInfoEnabled()) {
        logger.info("Tables '" + primaryTableName + "' and '" + foreignTableName + "' flattened to: " + primaryTableName);
        if (logger.isDebugEnabled()) {
            logger.debug(primaryTableName + " columns: " + Arrays.toString(primaryTable.getColumns()));
        }
    }
    return this;
}
/**
 * Automatically flattens tables that only contain a single data carrying
 * column. Data carrying columns are all columns that are not artificial
 * columns (created to enable referential integrity between tag-to-table
 * mapped tables).
 *
 * A table is flattened into its parent when it has exactly one foreign key
 * relationship, is not itself referenced by other tables, carries exactly
 * one data column, and no foreign key value occurs in more than one row.
 * The schema is loaded first if that has not happened yet.
 *
 * @return this data context
 */
public XmlDomDataContext autoFlattenTables() {
    // Guards against a missing schema when this method is called directly on
    // a context that has not been loaded yet; a no-op when called while the
    // schema is being built
    loadSchema();

    Table[] tables = _schema.getTables();
    for (Table table : tables) {
        // First check to see that this table still exist (ie. has not been
        // flattened in a previous loop)
        if (!_tableData.containsKey(table.getName())) {
            continue;
        }

        // Find all tables that represent inner tags
        Relationship[] foreignKeyRelationships = table.getForeignKeyRelationships();
        if (foreignKeyRelationships.length != 1 || table.getPrimaryKeyRelationships().length != 0) {
            continue;
        }
        Relationship foreignKeyRelationship = foreignKeyRelationships[0];

        // If there is exactly one inner tag then we can probably flatten
        // the tables, but it's only relevant if the inner tag only carry a
        // single data column
        if (!hasSingleDataColumn(table)) {
            continue;
        }

        // If the foreign key is unique for all rows, we will flatten it
        // (otherwise it means that multiple inner tags occur, which
        // requires two tables to deal with multiplicity)
        if (hasUniqueForeignKeys(table, foreignKeyRelationship)) {
            flattenTables(foreignKeyRelationship);
        }
    }
    return this;
}

/**
 * Determines whether exactly one column of a table is not an artificial
 * (primary or foreign key) column.
 */
private boolean hasSingleDataColumn(Table table) {
    int nonDataColumns = 0;
    Column[] columns = table.getColumns();
    for (Column column : columns) {
        // Use the native column type constants to determine if the column
        // is an artificial column
        String nativeType = column.getNativeType();
        if (NATIVE_TYPE_FOREIGN_KEY.equals(nativeType) || NATIVE_TYPE_PRIMARY_KEY.equals(nativeType)) {
            nonDataColumns++;
        }
    }
    return columns.length == nonDataColumns + 1;
}

/**
 * Determines whether no foreign key value of the relationship occurs in
 * more than one row of the table.
 */
private boolean hasUniqueForeignKeys(Table table, Relationship foreignKeyRelationship) {
    Column[] foreignColumns = foreignKeyRelationship.getForeignColumns();
    SelectItem countAllItem = SelectItem.getCountAllItem();
    Query q = new Query().select(foreignColumns).select(countAllItem).from(table).groupBy(foreignColumns);
    DataSet data = executeQuery(q);
    try {
        Comparable<Object> comparable = NumberComparator.getComparable(1);
        while (data.next()) {
            Object value = data.getRow().getValue(countAllItem);
            if (comparable.compareTo(value) < 0) {
                // If the value is compared larger than 1, we have several
                // inner tags
                return false;
            }
        }
        return true;
    } finally {
        // Close the data set even if reading it fails
        data.close();
    }
}
}