| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.metamodel.fixedwidth; |
| |
| import java.io.BufferedReader; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.metamodel.csv.CsvConfiguration; |
| import org.apache.metamodel.csv.CsvDataContext; |
| import org.apache.metamodel.data.DataSet; |
| import org.apache.metamodel.schema.Table; |
| import org.apache.metamodel.util.Action; |
| import org.apache.metamodel.util.Resource; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Object capable of reading fixed width metadata from external sources and |
| * thereby producing an appropriate {@link FixedWidthConfiguration} to use with |
| * a {@link FixedWidthDataContext}. |
| */ |
| public class FixedWidthConfigurationReader { |
| |
| private static final Logger logger = LoggerFactory.getLogger(FixedWidthConfigurationReader.class); |
| |
| // example: @1 COL1 $char1. |
| private final Pattern PATTERN_SAS_INPUT_LINE = Pattern.compile("\\@(\\d+) (.+) .*?(\\d+)\\."); |
| |
| // example: COL1 "Record type" |
| private final Pattern PATTERN_SAS_LABEL_LINE = Pattern.compile("(.+) \\\"(.+)\\\""); |
| |
| /** |
| * Reads a {@link FixedWidthConfiguration} based on a SAS 'format file', |
| * <a href= |
| * "http://support.sas.com/documentation/cdl/en/etlug/67323/HTML/default/viewer.htm#p0h03yig7fp1qan1arghp3lwjqi6.htm"> |
| * described here</a>. |
| * |
| * @param encoding the format file encoding |
| * @param resource the format file resource |
| * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not |
| * @return a {@link FixedWidthConfiguration} object to use |
| */ |
| public FixedWidthConfiguration readFromSasFormatFile(String encoding, Resource resource, |
| boolean failOnInconsistentLineWidth) { |
| final List<FixedWidthColumnSpec> columnSpecs = new ArrayList<>(); |
| |
| final CsvDataContext dataContext = new CsvDataContext(resource, new CsvConfiguration()); |
| final Table table = dataContext.getDefaultSchema().getTable(0); |
| try (final DataSet dataSet = dataContext.query().from(table).select("Name", "BeginPosition", "EndPosition") |
| .execute()) { |
| while (dataSet.next()) { |
| final String name = (String) dataSet.getRow().getValue(0); |
| final int beginPosition = Integer.parseInt((String) dataSet.getRow().getValue(1)); |
| final int endPosition = Integer.parseInt((String) dataSet.getRow().getValue(2)); |
| final int width = 1 + endPosition - beginPosition; |
| columnSpecs.add(new FixedWidthColumnSpec(name, width)); |
| } |
| } |
| |
| return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth); |
| } |
| |
| /** |
| * Reads a {@link FixedWidthConfiguration} based on a SAS INPUT declaration. |
| * The reader method also optionally will look for a LABEL definition for column naming. |
| * |
| * @param encoding the format file encoding |
| * @param resource the format file resource |
| * @param failOnInconsistentLineWidth flag specifying whether inconsistent line should stop processing or not |
| * @return a {@link FixedWidthConfiguration} object to use |
| */ |
| public FixedWidthConfiguration readFromSasInputDefinition(String encoding, Resource resource, |
| boolean failOnInconsistentLineWidth) { |
| |
| final Map<String, Integer> inputWidthDeclarations = new LinkedHashMap<>(); |
| final Map<String, String> labelDeclarations = new HashMap<>(); |
| |
| resource.read(new Action<InputStream>() { |
| |
| private boolean inInputSection = false; |
| private boolean inLabelSection = false; |
| |
| @Override |
| public void run(InputStream in) throws Exception { |
| try (final BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { |
| for (String line = reader.readLine(); line != null; line = reader.readLine()) { |
| processLine(line); |
| } |
| } |
| } |
| |
| private void processLine(String line) { |
| line = line.trim(); |
| if (line.isEmpty()) { |
| return; |
| } |
| if (";".equals(line)) { |
| inInputSection = false; |
| inLabelSection = false; |
| return; |
| } else if ("INPUT".equals(line)) { |
| inInputSection = true; |
| return; |
| } else if ("LABEL".equals(line)) { |
| inLabelSection = true; |
| return; |
| } |
| |
| if (inInputSection) { |
| final Matcher matcher = PATTERN_SAS_INPUT_LINE.matcher(line); |
| if (matcher.matches()) { |
| final String positionSpec = matcher.group(1); |
| final String nameSpec = matcher.group(2); |
| final int width = Integer.parseInt(matcher.group(3)); |
| logger.debug("Parsed INPUT line \"{}\": position={}, name={}, width={}", line, positionSpec, |
| nameSpec, width); |
| inputWidthDeclarations.put(nameSpec, width); |
| } else { |
| logger.debug("Failed to parse/recognize INPUT line \"{}\"", line); |
| } |
| } else if (inLabelSection) { |
| final Matcher matcher = PATTERN_SAS_LABEL_LINE.matcher(line); |
| if (matcher.matches()) { |
| final String nameSpec = matcher.group(1); |
| final String labelSpec = matcher.group(2); |
| logger.debug("Parsed LABEL line \"{}\": name={}, label={}", line, nameSpec, labelSpec); |
| labelDeclarations.put(nameSpec, labelSpec); |
| } else { |
| logger.debug("Failed to parse/recognize LABEL line \"{}\"", line); |
| } |
| } |
| |
| if (line.endsWith(";")) { |
| inInputSection = false; |
| inLabelSection = false; |
| } |
| } |
| }); |
| |
| final List<FixedWidthColumnSpec> columnSpecs = new ArrayList<>(); |
| for (Entry<String, Integer> entry : inputWidthDeclarations.entrySet()) { |
| final String columnKey = entry.getKey(); |
| final Integer columnWidth = entry.getValue(); |
| final String columnLabel = labelDeclarations.get(columnKey); |
| final String columnName = columnLabel == null ? columnKey : columnLabel; |
| columnSpecs.add(new FixedWidthColumnSpec(columnName, columnWidth)); |
| } |
| |
| return new FixedWidthConfiguration(encoding, columnSpecs, failOnInconsistentLineWidth); |
| } |
| } |