/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.metamodel.fixedwidth;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.Action;
import org.apache.metamodel.util.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** | |
* Object capable of reading fixed width metadata from external sources and | |
* thereby producing an appropriate {@link FixedWidthConfiguration} to use with | |
* a {@link FixedWidthDataContext}. | |
*/ | |
public class FixedWidthConfigurationReader { | |
private static final Logger logger = LoggerFactory.getLogger(FixedWidthConfigurationReader.class); | |
// example: @1 COL1 $char1. | |
private final Pattern PATTERN_SAS_INPUT_LINE = Pattern.compile("\\@(\\d+) (.+) .*?(\\d+)\\."); | |
// example: COL1 "Record type" | |
private final Pattern PATTERN_SAS_LABEL_LINE = Pattern.compile("(.+) \\\"(.+)\\\""); | |
/** | |
* Reads a {@link FixedWidthConfiguration} based on a SAS 'format file', | |
* <a href= | |
* "http://support.sas.com/documentation/cdl/en/etlug/67323/HTML/default/viewer.htm#p0h03yig7fp1qan1arghp3lwjqi6.htm"> | |
* described here</a>. | |
* | |
* @param encoding the format file encoding | |
* @param resource the format file resource | |
* @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not | |
* @return a {@link FixedWidthConfiguration} object to use | |
*/ | |
public FixedWidthConfiguration readFromSasFormatFile(String encoding, Resource resource, | |
boolean failOnInconsistentLineWidth) { | |
final List<FixedWidthColumnSpec> columnSpecs = new ArrayList<>(); | |
final CsvDataContext dataContext = new CsvDataContext(resource, new CsvConfiguration()); | |
final Table table = dataContext.getDefaultSchema().getTable(0); | |
try (final DataSet dataSet = dataContext.query().from(table).select("Name", "BeginPosition", "EndPosition") | |
.execute()) { | |
while (dataSet.next()) { | |
final String name = (String) dataSet.getRow().getValue(0); | |
final int beginPosition = Integer.parseInt((String) dataSet.getRow().getValue(1)); | |
final int endPosition = Integer.parseInt((String) dataSet.getRow().getValue(2)); | |
final int width = 1 + endPosition - beginPosition; | |
columnSpecs.add(new FixedWidthColumnSpec(name, width)); | |
} | |
} | |
return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth); | |
} | |
/** | |
* Reads a {@link FixedWidthConfiguration} based on a SAS INPUT declaration. | |
* The reader method also optionally will look for a LABEL definition for column naming. | |
* | |
* @param encoding the format file encoding | |
* @param resource the format file resource | |
* @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not | |
* @return a {@link FixedWidthConfiguration} object to use | |
*/ | |
public FixedWidthConfiguration readFromSasInputDefinition(String encoding, Resource resource, | |
boolean failOnInconsistentLineWidth) { | |
final Map<String, Integer> inputWidthDeclarations = new LinkedHashMap<>(); | |
final Map<String, String> labelDeclarations = new HashMap<>(); | |
resource.read(new Action<InputStream>() { | |
private boolean inInputSection = false; | |
private boolean inLabelSection = false; | |
@Override | |
public void run(InputStream in) throws Exception { | |
try (final BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { | |
for (String line = reader.readLine(); line != null; line = reader.readLine()) { | |
processLine(line); | |
} | |
} | |
} | |
private void processLine(String line) { | |
line = line.trim(); | |
if (line.isEmpty()) { | |
return; | |
} | |
if (";".equals(line)) { | |
inInputSection = false; | |
inLabelSection = false; | |
return; | |
} else if ("INPUT".equals(line)) { | |
inInputSection = true; | |
return; | |
} else if ("LABEL".equals(line)) { | |
inLabelSection = true; | |
return; | |
} | |
if (inInputSection) { | |
final Matcher matcher = PATTERN_SAS_INPUT_LINE.matcher(line); | |
if (matcher.matches()) { | |
final String positionSpec = matcher.group(1); | |
final String nameSpec = matcher.group(2); | |
final int width = Integer.parseInt(matcher.group(3)); | |
logger.debug("Parsed INPUT line \"{}\": position={}, name={}, width={}", line, positionSpec, | |
nameSpec, width); | |
inputWidthDeclarations.put(nameSpec, width); | |
} else { | |
logger.debug("Failed to parse/recognize INPUT line \"{}\"", line); | |
} | |
} else if (inLabelSection) { | |
final Matcher matcher = PATTERN_SAS_LABEL_LINE.matcher(line); | |
if (matcher.matches()) { | |
final String nameSpec = matcher.group(1); | |
final String labelSpec = matcher.group(2); | |
logger.debug("Parsed LABEL line \"{}\": name={}, label={}", line, nameSpec, labelSpec); | |
labelDeclarations.put(nameSpec, labelSpec); | |
} else { | |
logger.debug("Failed to parse/recognize LABEL line \"{}\"", line); | |
} | |
} | |
if (line.endsWith(";")) { | |
inInputSection = false; | |
inLabelSection = false; | |
} | |
} | |
}); | |
final List<FixedWidthColumnSpec> columnSpecs = new ArrayList<>(); | |
for (Entry<String, Integer> entry : inputWidthDeclarations.entrySet()) { | |
final String columnKey = entry.getKey(); | |
final Integer columnWidth = entry.getValue(); | |
final String columnLabel = labelDeclarations.get(columnKey); | |
final String columnName = columnLabel == null ? columnKey : columnLabel; | |
columnSpecs.add(new FixedWidthColumnSpec(columnName, columnWidth)); | |
} | |
return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth); | |
} | |
} |