// blob: 264287fa10c88c2e705c97630ede20d025ca9a94 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.metamodel.fixedwidth;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.Action;
import org.apache.metamodel.util.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Object capable of reading fixed width metadata from external sources and
* thereby producing an appropriate {@link FixedWidthConfiguration} to use with
* a {@link FixedWidthDataContext}.
*/
public class FixedWidthConfigurationReader {

    private static final Logger logger = LoggerFactory.getLogger(FixedWidthConfigurationReader.class);

    /**
     * Matches a single column declaration inside an INPUT section. Group 1 is
     * the position, group 2 the column name and group 3 the width.
     *
     * example: @1 COL1 $char1.
     */
    private static final Pattern PATTERN_SAS_INPUT_LINE = Pattern.compile("\\@(\\d+) (.+) .*?(\\d+)\\.");

    /**
     * Matches a single label declaration inside a LABEL section. Group 1 is
     * the column name and group 2 the quoted label text.
     *
     * example: COL1 "Record type"
     */
    private static final Pattern PATTERN_SAS_LABEL_LINE = Pattern.compile("(.+) \\\"(.+)\\\"");

    /** Token that terminates an INPUT or LABEL statement. */
    private static final String STATEMENT_TERMINATOR = ";";

    /**
     * Reads a {@link FixedWidthConfiguration} based on a SAS 'format file',
     * <a href=
     * "http://support.sas.com/documentation/cdl/en/etlug/67323/HTML/default/viewer.htm#p0h03yig7fp1qan1arghp3lwjqi6.htm">
     * described here</a>.
     *
     * The resource is read as CSV (using a default {@link CsvConfiguration})
     * and the columns "Name", "BeginPosition" and "EndPosition" of its first
     * table are used. Each row yields one column spec whose width is
     * {@code 1 + EndPosition - BeginPosition}.
     *
     * @param encoding the encoding handed to the resulting
     *            {@link FixedWidthConfiguration}; it is not used for reading
     *            the format file itself
     * @param resource the format file resource
     * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not
     * @return a {@link FixedWidthConfiguration} object to use
     * @throws NumberFormatException if a begin or end position cannot be parsed as an integer
     */
    public FixedWidthConfiguration readFromSasFormatFile(String encoding, Resource resource,
            boolean failOnInconsistentLineWidth) {
        final List<FixedWidthColumnSpec> columnSpecs = new ArrayList<>();

        final CsvDataContext dataContext = new CsvDataContext(resource, new CsvConfiguration());
        final Table table = dataContext.getDefaultSchema().getTable(0);

        try (final DataSet dataSet = dataContext.query().from(table).select("Name", "BeginPosition", "EndPosition")
                .execute()) {
            while (dataSet.next()) {
                final String name = (String) dataSet.getRow().getValue(0);
                final int beginPosition = Integer.parseInt((String) dataSet.getRow().getValue(1));
                final int endPosition = Integer.parseInt((String) dataSet.getRow().getValue(2));
                // both positions are treated as inclusive, hence the "+ 1"
                final int width = 1 + endPosition - beginPosition;
                columnSpecs.add(new FixedWidthColumnSpec(name, width));
            }
        }

        return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth);
    }

    /**
     * Reads a {@link FixedWidthConfiguration} based on a SAS INPUT declaration.
     * The reader method also optionally will look for a LABEL definition for column naming.
     *
     * A section starts at a line consisting solely of {@code INPUT} or
     * {@code LABEL} and ends at a line that is, or ends with, {@code ;}. A
     * declaration that shares its line with the terminating {@code ;} is
     * parsed as well. Columns are emitted in the order of their INPUT
     * declarations; when a LABEL exists for a column, the label is used as the
     * column name. Lines that are not recognized are skipped and logged at
     * debug level.
     *
     * @param encoding the encoding handed to the resulting
     *            {@link FixedWidthConfiguration}; it is not used for reading
     *            the definition resource itself
     * @param resource the format file resource
     * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not
     * @return a {@link FixedWidthConfiguration} object to use
     */
    public FixedWidthConfiguration readFromSasInputDefinition(String encoding, Resource resource,
            boolean failOnInconsistentLineWidth) {

        // insertion-ordered, because the declaration order defines the column order
        final Map<String, Integer> inputWidthDeclarations = new LinkedHashMap<>();
        final Map<String, String> labelDeclarations = new HashMap<>();

        resource.read(new Action<InputStream>() {

            private boolean inInputSection = false;
            private boolean inLabelSection = false;

            @Override
            public void run(InputStream in) throws Exception {
                // NOTE(review): the reader uses the platform default charset; the
                // 'encoding' argument is not applied here - confirm whether it
                // is meant to describe the definition file or the data file.
                try (final BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
                    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                        processLine(line);
                    }
                }
            }

            /** Interprets a single line of the definition, tracking the current section. */
            private void processLine(String rawLine) {
                final String line = rawLine.trim();
                if (line.isEmpty()) {
                    return;
                }

                if (STATEMENT_TERMINATOR.equals(line)) {
                    closeSections();
                    return;
                }
                if ("INPUT".equals(line)) {
                    inInputSection = true;
                    return;
                }
                if ("LABEL".equals(line)) {
                    inLabelSection = true;
                    return;
                }

                // The terminating ';' may share a line with the last
                // declaration of a section. It is removed before matching,
                // since the patterns must match the whole declaration.
                final boolean terminatesStatement = line.endsWith(STATEMENT_TERMINATOR);
                final String declaration = terminatesStatement ? stripStatementTerminator(line) : line;

                if (inInputSection) {
                    parseInputDeclaration(line, declaration);
                } else if (inLabelSection) {
                    parseLabelDeclaration(line, declaration);
                }

                if (terminatesStatement) {
                    closeSections();
                }
            }

            /** Marks both the INPUT and the LABEL section as finished. */
            private void closeSections() {
                inInputSection = false;
                inLabelSection = false;
            }

            /** Registers the column width of an INPUT declaration, if it is recognized. */
            private void parseInputDeclaration(String line, String declaration) {
                final Matcher matcher = PATTERN_SAS_INPUT_LINE.matcher(declaration);
                if (!matcher.matches()) {
                    logger.debug("Failed to parse/recognize INPUT line \"{}\"", line);
                    return;
                }
                final String positionSpec = matcher.group(1);
                final String nameSpec = matcher.group(2);
                final int width = Integer.parseInt(matcher.group(3));
                logger.debug("Parsed INPUT line \"{}\": position={}, name={}, width={}", line, positionSpec,
                        nameSpec, width);
                inputWidthDeclarations.put(nameSpec, width);
            }

            /** Registers the label of a LABEL declaration, if it is recognized. */
            private void parseLabelDeclaration(String line, String declaration) {
                final Matcher matcher = PATTERN_SAS_LABEL_LINE.matcher(declaration);
                if (!matcher.matches()) {
                    logger.debug("Failed to parse/recognize LABEL line \"{}\"", line);
                    return;
                }
                final String nameSpec = matcher.group(1);
                final String labelSpec = matcher.group(2);
                logger.debug("Parsed LABEL line \"{}\": name={}, label={}", line, nameSpec, labelSpec);
                labelDeclarations.put(nameSpec, labelSpec);
            }
        });

        final List<FixedWidthColumnSpec> columnSpecs = new ArrayList<>();
        for (Entry<String, Integer> entry : inputWidthDeclarations.entrySet()) {
            final String columnKey = entry.getKey();
            final Integer columnWidth = entry.getValue();
            // prefer the human readable label; fall back to the declared name
            final String columnLabel = labelDeclarations.get(columnKey);
            final String columnName = columnLabel == null ? columnKey : columnLabel;
            columnSpecs.add(new FixedWidthColumnSpec(columnName, columnWidth));
        }

        return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth);
    }

    /**
     * Removes the trailing statement terminator, and any whitespace preceding
     * it, from a line that is known to end with that terminator.
     */
    private static String stripStatementTerminator(String line) {
        return line.substring(0, line.length() - STATEMENT_TERMINATOR.length()).trim();
    }
}