| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.iogen; |
| |
| import java.util.ArrayList; |
| import java.util.BitSet; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.concurrent.Callable; |
| import java.util.concurrent.ExecutorService; |
| import java.util.concurrent.Future; |
| |
| import org.apache.sysds.hops.OptimizerUtils; |
| import org.apache.sysds.lops.Lop; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.frame.data.FrameBlock; |
| import org.apache.sysds.runtime.matrix.data.MatrixBlock; |
| import org.apache.sysds.runtime.matrix.data.Pair; |
| import org.apache.sysds.runtime.util.CommonThreadPool; |
| |
| public class FormatIdentifyer { |
| private int[][] mapRow; |
| private int[][] mapCol; |
| private int[][] mapLen; |
| private int actualValueCount; |
| private MappingProperties mappingProperties; |
| private RawIndex[] sampleRawIndexes; |
| private int nrows; |
| private int ncols; |
| private int nlines; |
| private int suffixStringLength = 50; |
| private ReaderMapping mappingValues; |
| private CustomProperties properties; |
| private BitSet staticColIndexes; |
| |
| public FormatIdentifyer(String raw, MatrixBlock matrix) throws Exception { |
| this.mappingValues = new ReaderMapping(raw, matrix); |
| this.runIdentification(); |
| } |
| |
| public FormatIdentifyer(String raw, FrameBlock frame) throws Exception { |
| this.mappingValues = new ReaderMapping(raw, frame); |
| this.runIdentification(); |
| } |
| |
| @SuppressWarnings("unchecked") |
| private void runIdentification() { |
| |
| /* Index properties: |
| 1. Identity: |
| 2. Exist: |
| 3. Sequential Scattered: |
| 4. Array: |
| |
| Table 1: supported formats by row and column indexes: |
| # | row | col | Value | example |
| -------------------------------------- |
| 1 | Identity | Identity | Exist | csv, JSON/XML L single-line |
| 2 | Identity | Exist | Exist | LibSVM single |
| 3 | Identity | Exist | Not-Exist | LibSVM+Pattern single |
| 4 | Exist | Exist | Exist | MM Coordinate General multi |
| 5 | Array | Array | Exist | MM Array multi |
| 6 | Exist | Exist | Partially-Exist | MM Coordinate Symmetric multi |
| 7 | Exist | Exist | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric multi |
| 8 | Exist | Exist | Not-Exist | MM Coordinate Pattern multi |
| 9 | Exist | Exist | Not-Exist+Pattern | MM Coordinate Symmetric Pattern multi |
| 10 | SEQSCATTER| Identity | Exist | JSON/XML Multi Line, AMiner multi |
| |
| strategy for checking the structure of indexes and values: |
| 1. map values: |
| 1.a values are full exist in the source |
| 1.b values are partially exist in the dataset (we have to check the Symmetric, Skew-Symmetric, and so on) |
| 1.c values are not exist in the source, in this case we have to check static value(s) |
| 2. map indexes: |
| 2.a after finding value properties the next step is looking for index maps, row index is in the first order |
| 2.b column index mapping |
| */ |
| |
| // value mapping |
| mapRow = mappingValues.getMapRow(); |
| mapCol = mappingValues.getMapCol(); |
| mapLen = mappingValues.getMapLen(); |
| mappingProperties = mappingValues.getMappingProperties(); |
| |
| // save line by line index of string(index for Int, Long, float, Double, String, Boolean) |
| sampleRawIndexes = mappingValues.getSampleRawIndexes(); |
| |
| // matrix/frame properties for analysis and create datastructures |
| nrows = mappingValues.getNrows(); |
| ncols = mappingValues.getNcols(); |
| nlines = mappingValues.getNlines(); |
| actualValueCount = mappingValues.getActualValueCount(); |
| staticColIndexes = new BitSet(ncols); |
| |
| // collect custom properties |
| // 1. properties of row-index |
| RowIndexStructure rowIndexStructure = getRowIndexStructure(); |
| |
| // 2. properties of column-index |
| ColIndexStructure colIndexStructure = getColIndexStructure(); |
| |
| properties = new CustomProperties(mappingProperties, rowIndexStructure, colIndexStructure); |
| properties.setNcols(ncols); |
| |
| // ref to Table 1: |
| if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { |
| // #1 |
| if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && |
| colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { |
| Pair<ArrayList<String>[], HashSet<String>[]> bckpsr = buildColsKeyPatternSingleRow(); |
| properties.setColKeyPatterns(bckpsr.getKey()); |
| properties.setEndWithValueStrings(bckpsr.getValue()); |
| } |
| |
| // #2 |
| else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && |
| colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { |
| // find cell-index and value separators |
| RawIndex raw = null; |
| for(int c = 0; c < ncols; c++) { |
| if(mapCol[0][c] != -1) { |
| raw = sampleRawIndexes[mapRow[0][c]]; |
| raw.cloneReservedPositions(); |
| break; |
| } |
| } |
| if(raw == null) |
| throw new DMLRuntimeException("Invalid raw"); |
| HashMap<String, Long> indexDelimCount = new HashMap<>(); |
| String valueDelim = null; |
| String indexDelim = null; |
| Long maxCount = 0L; |
| int begin = colIndexStructure.getColIndexBegin(); |
| for(int c = 0; c < ncols; c++) { |
| if(mapCol[0][c] != -1) { |
| Pair<Integer, Integer> pair = raw.findValue(c + begin); |
| String tmpIndexDelim = raw.getSubString(pair.getKey() + pair.getValue(), mapCol[0][c]); |
| if(indexDelimCount.containsKey(tmpIndexDelim)) |
| indexDelimCount.put(tmpIndexDelim, indexDelimCount.get(tmpIndexDelim) + 1); |
| else |
| indexDelimCount.put(tmpIndexDelim, 1L); |
| if(maxCount < indexDelimCount.get(tmpIndexDelim)) { |
| maxCount = indexDelimCount.get(tmpIndexDelim); |
| indexDelim = tmpIndexDelim; |
| } |
| if(valueDelim == null) { |
| int nextPos = raw.getNextNumericPosition(mapCol[0][c] + mapLen[0][c]); |
| if(nextPos < raw.getRawLength()) { |
| valueDelim = raw.getSubString(mapCol[0][c] + mapLen[0][c], nextPos); |
| } |
| } |
| } |
| } |
| // update properties |
| colIndexStructure.setIndexDelim(indexDelim); |
| colIndexStructure.setValueDelim(valueDelim); |
| } |
| |
| } |
| else { |
| // # 4, 6, 7, 8, 9 |
| if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && |
| colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { |
| |
| if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { |
| Pair<ArrayList<String>, HashSet<String>> bvkpsr = buildValueKeyPattern(); |
| HashSet<String>[] endWithValueStrings = new HashSet[1]; |
| endWithValueStrings[0] = bvkpsr.getValue(); |
| properties.setValueKeyPattern(bvkpsr.getKey()); |
| properties.setEndWithValueStrings(endWithValueStrings); |
| } |
| |
| int beginRowIndex = rowIndexStructure.getRowIndexBegin(); |
| int beginColIndex = colIndexStructure.getColIndexBegin(); |
| // build pattern for row-index |
| Pair<ArrayList<String>, HashSet<String>> rowIndexPattern = buildIndexKeyPattern(true, beginRowIndex); |
| rowIndexStructure.setKeyPattern(rowIndexPattern.getKey()); |
| rowIndexStructure.setEndWithValueString(rowIndexPattern.getValue()); |
| |
| // build pattern for col-index |
| Pair<ArrayList<String>, HashSet<String>> colIndexPattern = buildIndexKeyPattern(false, beginColIndex); |
| colIndexStructure.setKeyPattern(colIndexPattern.getKey()); |
| colIndexStructure.setEndWithValueString(colIndexPattern.getValue()); |
| |
| } |
| // #10 sequential scattered |
| if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { |
| ArrayList<Pair<String, String>> prefixSuffixBeginEndCells = extractPrefixSuffixBeginEndCells(false); |
| |
| ArrayList<Pair<String, Set<Integer>>> keys; |
| TextTrie textTrie = new TextTrie(); |
| textTrie.insert(prefixSuffixBeginEndCells.get(0).getKey(), 0); |
| char startChar = prefixSuffixBeginEndCells.get(0).getKey().charAt(0); |
| |
| int minSubStringLength = Math.min(80, prefixSuffixBeginEndCells.get(0).getKey().length()); |
| for(int i = 1; i < prefixSuffixBeginEndCells.size(); i++) { |
| String prefix = prefixSuffixBeginEndCells.get(i).getKey(); |
| for(int j = 0; j < prefix.length(); j++) { |
| if(startChar == prefix.charAt(j)) |
| textTrie.insert(prefix.substring(j, j + Math.min(minSubStringLength, prefix.length() - j)), |
| i); |
| } |
| } |
| // scoring the prefix tree |
| keys = textTrie.getAllKeys(); |
| String beginString = null; |
| String endString = null; |
| if(keys.get(0).getValue().size() == nrows) { |
| int index = keys.get(0).getKey().indexOf("\n"); |
| if(index == -1) |
| beginString = keys.get(0).getKey(); |
| else |
| beginString = keys.get(0).getKey().substring(0, index); |
| |
| // recompute suffix strings to find end of string |
| int minSuffixStringLength = prefixSuffixBeginEndCells.get(0).getValue().length(); |
| String reverseBeginString = new StringBuilder(beginString).reverse().toString(); |
| ArrayList<String> suffixes = new ArrayList<>(); |
| for(int i = 0; i < prefixSuffixBeginEndCells.size() - 1; i++) { |
| String str = new StringBuilder(prefixSuffixBeginEndCells.get(i).getValue()).reverse() |
| .toString(); |
| int indexBeginString = str.indexOf(reverseBeginString); |
| if(indexBeginString != -1) { |
| for(int j = indexBeginString + reverseBeginString.length(); j < str.length(); j++) { |
| if(str.charAt(j) == '\n') |
| indexBeginString++; |
| else |
| break; |
| } |
| minSuffixStringLength = Math.min(minSuffixStringLength, indexBeginString); |
| suffixes.add(new StringBuilder( |
| str.substring(0, indexBeginString + reverseBeginString.length())).reverse().toString()); |
| } |
| else |
| suffixes.add(str); |
| } |
| StringBuilder sbEndString = new StringBuilder(); |
| for(int i = 0; i < minSuffixStringLength; i++) { |
| if(suffixes.get(0).length() == 0) |
| break; |
| char intersectChar = suffixes.get(0).charAt(i); |
| if(intersectChar == '\n') |
| break; |
| boolean flag = true; |
| for(String ss : suffixes) { |
| if(ss.charAt(i) != intersectChar) { |
| flag = false; |
| break; |
| } |
| } |
| if(flag) |
| sbEndString.append(intersectChar); |
| else |
| break; |
| } |
| if(sbEndString.length() == 0) |
| endString = beginString; |
| else |
| endString = sbEndString.toString(); |
| updateMapsAndExtractAllSuffixStringsOfColsMultiLine(beginString, endString); |
| rowIndexStructure.setSeqBeginString(beginString); |
| rowIndexStructure.setSeqEndString(endString); |
| |
| Pair<ArrayList<String>[], HashSet<String>[]> bckpsr = buildColsKeyPatternSingleRow(); |
| properties.setColKeyPatterns(bckpsr.getKey()); |
| properties.setEndWithValueStrings(bckpsr.getValue()); |
| } |
| else { |
| // TODO: extend sequential scattered format algorithm for heterogeneous structures |
| } |
| } |
| } |
| |
| if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist || |
| colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { |
| properties.setSparse(true); |
| } |
| } |
| |
| // check row-index Exist |
| // 1. row-index exist and can be reachable with a pattern |
| // 2. row-index exist but there is no pattern for it |
| // 3. row-index exist but just not for all cells! row-index appeared when the text broken newline="\n" |
| private RowIndexStructure getRowIndexStructure() { |
| // check the row index is a prefix string in sample raw, or the sample data line number equal to the sample matrix/frame row index |
| // if the row indexes are in the prefix of values, so we need to build a key pattern to extract row indexes |
| // to understanding row indexes are in sample raw we check just 3 column of data |
| // to build a key pattern related to row indexes we just selected a row |
| // TODO: decrease the number of row/col indexes want to check(3 or 5) |
| |
| RowIndexStructure rowIndexStructure = new RowIndexStructure(); |
| |
| if(mappingProperties.getDataProperties() == MappingProperties.DataProperties.NOTEXIST) { |
| if(nlines >= this.actualValueCount) { |
| rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CellWiseExist); |
| rowIndexStructure.setRowIndexBegin(0); |
| return rowIndexStructure; |
| } |
| } |
| |
| // check row-index Identity, the identity properties available just for |
| // exist and partially exist mapped values |
| if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { |
| boolean identity = false; |
| int missedCount = 0; |
| |
| for(int r = 0; r < nrows; r++) |
| for(int c = 0; c < ncols; c++) |
| if(mapRow[r][c] != -1 && mapRow[r][c] != r) { |
| missedCount++; |
| } |
| if((float) missedCount / actualValueCount < 0.07) |
| identity = true; |
| |
| if(identity) { |
| rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.Identity); |
| return rowIndexStructure; |
| } |
| } |
| |
| BitSet[] bitSets = new BitSet[nrows]; |
| int[] rowCardinality = new int[nrows]; |
| int[] rowNZ = new int[nrows]; |
| boolean isCellWise = true; |
| boolean isSeqScatter = true; |
| boolean isExist = true; |
| |
| for(int r = 0; r < nrows; r++) { |
| bitSets[r] = new BitSet(nlines); |
| rowNZ[r] = 0; |
| for(int c = 0; c < ncols; c++) { |
| if(mapRow[r][c] != -1) { |
| bitSets[r].set(mapRow[r][c]); |
| rowNZ[r]++; |
| } |
| } |
| rowCardinality[r] = bitSets[r].cardinality(); |
| } |
| // check for Sequential: |
| for(int r = 0; r < nrows && isSeqScatter; r++) { |
| BitSet bitSet = bitSets[r]; |
| ArrayList<Integer> list = new ArrayList<>(); |
| for(int i = bitSet.nextSetBit(0); i != -1; i = bitSet.nextSetBit(i + 1)) |
| list.add(i); |
| for(int i = 0; i < list.size() - 1 && isSeqScatter; i++) |
| isSeqScatter = list.get(i) <= list.get(i + 1); |
| } |
| |
| // check for Cell Wise |
| for(int r = 0; r < nrows && isCellWise; r++) |
| isCellWise = rowCardinality[r] == rowNZ[r]; |
| |
| // check exist: |
| int begin = 0; |
| if(isCellWise) { |
| for(int c = 0; c < ncols; c++) { |
| begin = checkRowIndexesOnColumnRaw(c, 0); |
| if(begin == -1) { |
| isExist = false; |
| break; |
| } |
| } |
| if(isExist) { |
| rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CellWiseExist); |
| rowIndexStructure.setRowIndexBegin(begin); |
| return rowIndexStructure; |
| } |
| } |
| else { |
| ArrayList<RawIndex> list = new ArrayList<>(); |
| for(int r = 0; r < nrows; r++) { |
| BitSet bitSet = bitSets[r]; |
| for(int i = bitSet.nextSetBit(0); i != -1; i = bitSet.nextSetBit(i + 1)) |
| list.add(sampleRawIndexes[i]); |
| begin = checkRowIndexOnRaws(r, 0, list); |
| if(begin == -1) { |
| isExist = false; |
| break; |
| } |
| } |
| |
| if(isExist) { |
| rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.RowWiseExist); |
| rowIndexStructure.setRowIndexBegin(begin); |
| return rowIndexStructure; |
| } |
| |
| } |
| if(isSeqScatter) { |
| rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.SeqScatter); |
| return rowIndexStructure; |
| } |
| return rowIndexStructure; |
| } |
| private ColIndexStructure getColIndexStructure() { |
| ColIndexStructure colIndexStructure = new ColIndexStructure(); |
| int begin = 0; |
| boolean colIndexExist = true; |
| |
| if(mappingProperties.getDataProperties() == MappingProperties.DataProperties.NOTEXIST) { |
| if(nlines >= this.actualValueCount) { |
| colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); |
| colIndexStructure.setColIndexBegin(0); |
| return colIndexStructure; |
| } |
| } |
| |
| if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { |
| // 1. check for column index are in the record |
| for(int r = 0; r < Math.min(10, nrows); r++) { |
| int rowIndex = -1; |
| for(int c = 0; c < ncols; c++) { |
| rowIndex = mapRow[r][c]; |
| if(rowIndex != -1) |
| break; |
| } |
| begin = checkColIndexesOnRowRaw(rowIndex, 0); |
| if(begin == -1) { |
| colIndexExist = false; |
| break; |
| } |
| } |
| if(colIndexExist) { |
| colIndexStructure.setColIndexBegin(begin); |
| colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); |
| return colIndexStructure; |
| } |
| // 2. check the column index are identity |
| else { |
| colIndexStructure.setProperties(ColIndexStructure.IndexProperties.Identity); |
| return colIndexStructure; |
| } |
| } |
| else { |
| for(int r = 0; r < nrows && colIndexExist; r++) { |
| for(int c = 0; c < Math.min(10, ncols) && colIndexExist; c++) { |
| if(mapRow[r][c] != -1) { |
| begin = checkColIndexOnRowRaw(mapRow[r][c], c, begin); |
| colIndexExist = begin != -1; |
| } |
| } |
| } |
| |
| if(colIndexExist) { |
| colIndexStructure.setColIndexBegin(begin); |
| colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); |
| return colIndexStructure; |
| } |
| } |
| |
| return colIndexStructure; |
| } |
| private int checkRowIndexesOnColumnRaw(int colIndex, int beginPos) { |
| int nne = 0; |
| for(int r = 0; r < nrows; r++) { |
| if(mapRow[r][colIndex] != -1) { |
| RawIndex raw = sampleRawIndexes[mapRow[r][colIndex]]; |
| raw.cloneReservedPositions(); |
| Pair<Integer, Integer> pair = raw.findValue(r + beginPos); |
| raw.restoreReservedPositions(); |
| if(pair == null) |
| nne++; |
| } |
| } |
| |
| if(nne > 0) { |
| if(beginPos == 1) |
| return -1; |
| else |
| return checkRowIndexesOnColumnRaw(colIndex, 1); |
| } |
| else |
| return beginPos; |
| } |
| private int checkRowIndexOnRaws(int rowIndex, int beginPos, ArrayList<RawIndex> list) { |
| int nne = 0; |
| for(RawIndex raw : list) { |
| raw.cloneReservedPositions(); |
| Pair<Integer, Integer> pair = raw.findValue(rowIndex + beginPos); |
| if(pair == null) |
| nne++; |
| raw.restoreReservedPositions(); |
| } |
| |
| if(nne > list.size() * 0.3) { |
| if(beginPos == 1) |
| return -1; |
| else |
| return checkRowIndexOnRaws(rowIndex, 1, list); |
| } |
| else |
| return beginPos; |
| } |
| private int checkColIndexesOnRowRaw(int rowIndex, int beginPos) { |
| int nne = 0; |
| RawIndex raw = sampleRawIndexes[rowIndex]; |
| raw.cloneReservedPositions(); |
| for(int c = 0; c < ncols; c++) { |
| if(mapCol[rowIndex][c] != -1) { |
| Pair<Integer, Integer> pair = raw.findValue(c + beginPos); |
| if(pair == null || pair.getKey() > mapCol[rowIndex][c]) |
| nne++; |
| } |
| } |
| raw.restoreReservedPositions(); |
| if(nne > ncols * 0.05) { |
| if(beginPos == 1) |
| return -1; |
| else |
| return checkColIndexesOnRowRaw(rowIndex, 1); |
| } |
| else |
| return beginPos; |
| } |
| private int checkColIndexOnRowRaw(int rowIndex, int colIndex, int beginPos) { |
| RawIndex raw = sampleRawIndexes[rowIndex]; |
| raw.cloneReservedPositions(); |
| Pair<Integer, Integer> pair = raw.findValue(colIndex + beginPos); |
| raw.restoreReservedPositions(); |
| |
| if(pair == null) { |
| if(beginPos == 1) |
| return -1; |
| else |
| return checkColIndexOnRowRaw(rowIndex, colIndex, 1); |
| } |
| else |
| return beginPos; |
| } |
| // Extract prefix strings: |
| private ArrayList<Pair<String, String>> extractPrefixSuffixBeginEndCells(boolean reverse) { |
| |
| ArrayList<Pair<String, String>> result = new ArrayList<>(); |
| BitSet[] recordUsedLines = new BitSet[nlines]; |
| BitSet[] usedLines = new BitSet[nlines]; |
| for(int r = 0; r < nrows; r++) |
| recordUsedLines[r] = new BitSet(); |
| |
| for(int r = 0; r < nrows; r++) |
| for(int c = 0; c < ncols; c++) |
| if(mapRow[r][c] != -1) |
| recordUsedLines[r].set(mapRow[r][c]); |
| |
| for(int r = 0; r < nrows; r++) { |
| usedLines[r] = new BitSet(nlines); |
| for(int i = 0; i < nrows; i++) { |
| if(i != r) |
| usedLines[r].or(recordUsedLines[i]); |
| } |
| } |
| int lastLine = 0; |
| int lastPos = 0; |
| for(int r = 0; r < nrows; r++) { |
| int beginLine = 0; |
| int endLine = 0; |
| int beginPos = 0; |
| int endPos; |
| for(int i = 0; i < nlines; i++) |
| if(recordUsedLines[r].get(i)) { |
| beginLine = i; |
| break; |
| } |
| for(int i = nlines - 1; i >= 0; i--) |
| if(recordUsedLines[r].get(i)) { |
| endLine = i; |
| break; |
| } |
| |
| endPos = 0; |
| beginPos = sampleRawIndexes[beginLine].getRawLength(); |
| for(int c = 0; c < ncols; c++) { |
| if(mapRow[r][c] == beginLine) |
| beginPos = Math.min(beginPos, mapCol[r][c]); |
| |
| if(mapRow[r][c] == endLine) |
| endPos = Math.max(endPos, mapCol[r][c] + mapLen[r][c]); |
| } |
| StringBuilder sbPrefix = new StringBuilder(); |
| if(lastLine != beginLine) |
| sbPrefix.append(sampleRawIndexes[lastLine].getRaw().substring(lastPos)).append("\n"); |
| |
| for(int i = lastLine + 1; i < beginLine; i++) |
| sbPrefix.append(sampleRawIndexes[i].getRaw()).append("\n"); |
| sbPrefix.append(sampleRawIndexes[beginLine].getRaw().substring(0, beginPos)); |
| |
| lastLine = endLine; |
| lastPos = endPos; |
| |
| result.add(new Pair<>(sbPrefix.toString(), null)); |
| } |
| |
| // set suffix |
| for(int r = 0; r < nrows - 1; r++) { |
| result.get(r).setValue(result.get(r + 1).getKey()); |
| } |
| result.get(nrows - 1).setValue(null); |
| return result; |
| } |
| |
| public CustomProperties getFormatProperties() { |
| return properties; |
| } |
| |
| @SuppressWarnings("unchecked") |
| private Pair<ArrayList<String>, HashSet<String>> buildValueKeyPattern() { |
| int minSelectCols = Math.min(10, ncols); |
| ArrayList<String>[] prefixesRemovedReverse = new ArrayList[1]; |
| ArrayList<String>[] prefixesRemoved = new ArrayList[1]; |
| ArrayList<String>[] prefixes = new ArrayList[1]; |
| ArrayList<String>[] suffixes = new ArrayList[1]; |
| ArrayList<Pair<String, Integer>>[] prefixesRemovedReverseSort = new ArrayList[1]; |
| ArrayList<String>[] keys = new ArrayList[minSelectCols]; |
| HashSet<String>[] colSuffixes = new HashSet[minSelectCols]; |
| LongestCommonSubsequence lcs = new LongestCommonSubsequence(); |
| |
| for(int c = 0; c < minSelectCols; c++) { |
| prefixesRemovedReverse[0] = new ArrayList<>(); |
| prefixes[0] = new ArrayList<>(); |
| suffixes[0] = new ArrayList<>(); |
| } |
| |
| for(int c = 0; c < minSelectCols; c++) { |
| prefixesRemovedReverse[0].addAll(extractAllPrefixStringsOfAColSingleLine(c, true, true).getKey()); |
| prefixes[0].addAll(extractAllPrefixStringsOfAColSingleLine(c,false, false).getKey()); |
| suffixes[0].addAll(extractAllSuffixStringsOfColsSingleLine(c, true)); |
| } |
| HashSet<Integer> colIndexes = new HashSet<>(); |
| colIndexes.add(0); |
| |
| try { |
| ArrayList<BuildColsKeyPatternSingleRowTask> tasks = new ArrayList<>(); |
| tasks.add( |
| new BuildColsKeyPatternSingleRowTask(prefixesRemovedReverse, prefixesRemoved, prefixes, suffixes, |
| prefixesRemovedReverseSort, keys, colSuffixes, lcs, colIndexes)); |
| |
| //check for exceptions |
| for(Callable<Object> task : tasks) |
| task.call(); |
| } |
| catch(Exception e) { |
| throw new RuntimeException("Failed BuildValueKeyPattern.", e); |
| } |
| |
| return new Pair<>(keys[0], colSuffixes[0]); |
| } |
| |
| private String addToPrefixes(Set<String> list, String strValue, int value, boolean reverse){ |
| String str = reverse ? new StringBuilder(strValue).reverse().toString() : strValue; |
| RawIndex rawIndex = new RawIndex(str); |
| Pair<Integer, Integer> pair = rawIndex.findValue(value); |
| if(pair != null) { |
| String pstr = str.substring(0, pair.getKey()); |
| list.add(pstr); |
| return Lop.OPERAND_DELIMITOR+str.substring(pair.getKey() + pair.getValue()).replaceAll("\\d", Lop.OPERAND_DELIMITOR); |
| } |
| return null; |
| } |
| |
| @SuppressWarnings("unchecked") |
| private Pair<ArrayList<String>, HashSet<String>> buildIndexKeyPattern(boolean keyForRowIndexes, int begin) { |
| ArrayList<String>[] prefixesRemovedReverse = new ArrayList[1]; |
| ArrayList<String>[] prefixesRemoved = new ArrayList[1]; |
| ArrayList<String>[] prefixes = new ArrayList[1]; |
| ArrayList<String>[] suffixes = new ArrayList[1]; |
| ArrayList<Pair<String, Integer>>[] prefixesRemovedReverseSort = new ArrayList[1]; |
| ArrayList<String>[] keys = new ArrayList[1]; |
| HashSet<String>[] colSuffixes = new HashSet[1]; |
| LongestCommonSubsequence lcs = new LongestCommonSubsequence(); |
| |
| prefixesRemovedReverse[0] = new ArrayList<>(); |
| prefixesRemoved[0] = new ArrayList<>(); |
| prefixes[0] = new ArrayList<>(); |
| suffixes[0] = new ArrayList<>(); |
| |
| Map<Integer,ArrayList<Integer>> selectedRowColForIndexes = new HashMap<>(); |
| int maxSize = 0; |
| for(int r = 1; r < nrows && maxSize < 1000; r++) { |
| for(int c = 0; c < ncols && maxSize < 1000; c++) { |
| if(mapCol[r][c] != -1 && r != c ) { |
| selectedRowColForIndexes.computeIfAbsent(r, k -> new ArrayList<>()); |
| selectedRowColForIndexes.get(r).add(c); |
| maxSize++; |
| } |
| } |
| } |
| if(keyForRowIndexes) { |
| for(int r : selectedRowColForIndexes.keySet()) { |
| ArrayList<Integer> colSet = selectedRowColForIndexes.get(r); |
| ArrayList<String> tmpPrefixesRemovedReverse = extractAllPrefixStringsOfAColSingleLine(r, colSet, true, true).getKey(); |
| ArrayList<String> tmpPrefixesRemoved = extractAllPrefixStringsOfAColSingleLine(r, colSet, false, true).getKey(); |
| ArrayList<String> tmpPrefixes = extractAllPrefixStringsOfAColSingleLine(r, colSet, false, false).getKey(); |
| |
| Set<String> tmpSet = new HashSet<>(); |
| for(String s : tmpPrefixesRemovedReverse) { |
| String suf = addToPrefixes(tmpSet, s, r+begin, true); |
| if(suf != null) |
| suffixes[0].add(suf); |
| } |
| prefixesRemovedReverse[0].addAll(tmpSet); |
| |
| tmpSet = new HashSet<>(); |
| for(String s : tmpPrefixesRemoved) |
| addToPrefixes(tmpSet, s, r+begin, false); |
| prefixesRemoved[0].addAll(tmpSet); |
| |
| tmpSet = new HashSet<>(); |
| for(String s : tmpPrefixes) |
| addToPrefixes(tmpSet, s, r+begin, false); |
| prefixes[0].addAll(tmpSet); |
| } |
| } |
| else { |
| for(int r : selectedRowColForIndexes.keySet()) { |
| ArrayList<Integer> colSet = selectedRowColForIndexes.get(r); |
| ArrayList<String> tmpPrefixesRemovedReverse = extractAllPrefixStringsOfAColSingleLine(r, colSet, true, true).getKey(); |
| ArrayList<String> tmpPrefixesRemoved = extractAllPrefixStringsOfAColSingleLine(r, colSet, false, true).getKey(); |
| ArrayList<String> tmpPrefixes = extractAllPrefixStringsOfAColSingleLine(r, colSet, false, false).getKey(); |
| |
| Set<String> tmpSet = new HashSet<>(); |
| for(String s : tmpPrefixesRemovedReverse) { |
| for(int c: colSet) { |
| String suf = addToPrefixes(tmpSet, s, c+begin, true); |
| if(suf != null) |
| suffixes[0].add(suf); |
| } |
| } |
| prefixesRemovedReverse[0].addAll(tmpSet); |
| |
| tmpSet = new HashSet<>(); |
| for(String s : tmpPrefixesRemoved) |
| for(int c: colSet) { |
| addToPrefixes(tmpSet, s, c+begin, false); |
| } |
| prefixesRemoved[0].addAll(tmpSet); |
| |
| tmpSet = new HashSet<>(); |
| for(String s : tmpPrefixes) |
| for(int c: colSet) { |
| addToPrefixes(tmpSet, s, c+begin, false); |
| } |
| prefixes[0].addAll(tmpSet); |
| } |
| } |
| |
| HashSet<Integer> colIndexe = new HashSet<>(); |
| colIndexe.add(0); |
| |
| try { |
| ArrayList<BuildColsKeyPatternSingleRowTask> tasks = new ArrayList<>(); |
| tasks.add( |
| new BuildColsKeyPatternSingleRowTask(prefixesRemovedReverse, prefixesRemoved, prefixes, suffixes, |
| prefixesRemovedReverseSort, keys, colSuffixes, lcs, colIndexe)); |
| //check for exceptions |
| for(Callable<Object> task : tasks) |
| task.call(); |
| } |
| catch(Exception e) { |
| throw new RuntimeException("Failed BuildValueKeyPattern.", e); |
| } |
| |
| return new Pair<>(keys[0], colSuffixes[0]); |
| } |
| |
| // Get all prefix strings of a column |
| @SuppressWarnings("unchecked") |
| public Pair<ArrayList<String>[], ArrayList<Integer>[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse, boolean removesSelected) { |
| ArrayList<String>[] prefixStrings = new ArrayList[ncols]; |
| ArrayList<Integer>[] rowIndexes = new ArrayList[ncols]; |
| for(int c = 0; c < ncols; c++) { |
| Pair<ArrayList<String>, ArrayList<Integer>> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse, removesSelected); |
| prefixStrings[c] = pair.getKey(); |
| rowIndexes[c] = pair.getValue(); |
| } |
| return new Pair<>(prefixStrings, rowIndexes); |
| } |
| |
| public Pair<ArrayList<String>, ArrayList<Integer>> extractAllPrefixStringsOfAColSingleLine(int r, |
| ArrayList<Integer> colIndexes, boolean reverse, boolean removesSelected) { |
| ArrayList<String> prefixStrings = new ArrayList<>(); |
| ArrayList<Integer> rowIndexes = new ArrayList<>(); |
| for(int c : colIndexes) { |
| int rowIndex = mapRow[r][c]; |
| if(rowIndex != -1) { |
| rowIndexes.add(rowIndex); |
| String str; |
| if(removesSelected) |
| str = sampleRawIndexes[rowIndex].getRemainedTexts(0, mapCol[r][c]); |
| else |
| str = sampleRawIndexes[rowIndex].getRaw().substring(0, mapCol[r][c]); |
| if(reverse) |
| prefixStrings.add(new StringBuilder(str).reverse().toString()); |
| else |
| prefixStrings.add(str); |
| } |
| } |
| return new Pair<>(prefixStrings, rowIndexes); |
| } |
| |
| public Pair<ArrayList<String>, ArrayList<Integer>> extractAllPrefixStringsOfAColSingleLine(int colIndex, |
| boolean reverse, boolean removesSelected) { |
| ArrayList<String> prefixStrings = new ArrayList<>(); |
| ArrayList<Integer> rowIndexes = new ArrayList<>(); |
| for(int r = 0; r < nrows; r++) { |
| int rowIndex = mapRow[r][colIndex]; |
| if(rowIndex != -1) { |
| rowIndexes.add(rowIndex); |
| String str; |
| if(removesSelected) |
| str = sampleRawIndexes[rowIndex].getRemainedTexts(0, mapCol[r][colIndex]); |
| else |
| str = sampleRawIndexes[rowIndex].getRaw().substring(0, mapCol[r][colIndex]); |
| if(reverse) |
| prefixStrings.add(new StringBuilder(str).reverse().toString()); |
| else |
| prefixStrings.add(str); |
| } |
| } |
| return new Pair<>(prefixStrings, rowIndexes); |
| } |
| |
| @SuppressWarnings("unchecked") |
| private ArrayList<String>[] extractAllSuffixStringsOfColsSingleLine(boolean removeData) { |
| ArrayList<String>[] result = new ArrayList[ncols]; |
| for(int c = 0; c < ncols; c++) { |
| result[c] = new ArrayList<>(); |
| for(int r = 0; r < nrows; r++) { |
| int rowIndex = mapRow[r][c]; |
| if(rowIndex == -1) |
| continue; |
| String str; |
| if(removeData) |
| str = sampleRawIndexes[rowIndex].getRemainedTexts(mapCol[r][c] + mapLen[r][c], -1); |
| else |
| str = sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][c] + mapLen[r][c]); |
| result[c].add(str); |
| } |
| } |
| return result; |
| } |
| |
| private ArrayList<String> extractAllSuffixStringsOfColsSingleLine(int col, boolean removeData) { |
| ArrayList<String> result = new ArrayList<>(); |
| for(int r = 0; r < nrows; r++) { |
| int rowIndex = mapRow[r][col]; |
| if(rowIndex == -1) |
| continue; |
| String str; |
| if(removeData) |
| str = sampleRawIndexes[rowIndex].getRemainedTexts(mapCol[r][col] + mapLen[r][col], -1); |
| else |
| str = sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][col] + mapLen[r][col]); |
| result.add(str); |
| } |
| return result; |
| } |
| |
| @SuppressWarnings("unused") |
| private ArrayList<String> extractAllSuffixStringsOfColsSingleLine(ArrayList<Integer> rows,int col, boolean removeData) { |
| ArrayList<String> result = new ArrayList<>(); |
| for(int r: rows) { |
| int rowIndex = mapRow[r][col]; |
| if(rowIndex == -1) |
| continue; |
| String str; |
| if(removeData) |
| str = sampleRawIndexes[rowIndex].getRemainedTexts(mapCol[r][col] + mapLen[r][col], -1); |
| else |
| str = sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][col] + mapLen[r][col]); |
| result.add(str); |
| } |
| return result; |
| } |
| |
| private void updateMapsAndExtractAllSuffixStringsOfColsMultiLine(String beginString, String endString) { |
| RawIndex[] upRawIndexes = new RawIndex[nrows]; |
| ArrayList<Pair<Integer, Integer>> beginIndexes = getTokenIndexOnMultiLineRecords(beginString); |
| ArrayList<Pair<Integer, Integer>> endIndexes; |
| String endToken; |
| if(!beginString.equals(endString)) { |
| endIndexes = getTokenIndexOnMultiLineRecords(endString); |
| endToken = endString; |
| } |
| else { |
| endIndexes = new ArrayList<>(); |
| for(int i = 1; i < beginIndexes.size(); i++) |
| endIndexes.add(beginIndexes.get(i)); |
| endIndexes.add(new Pair<>(this.sampleRawIndexes.length - 1, |
| this.sampleRawIndexes[this.sampleRawIndexes.length - 1].getRawLength())); |
| endToken = ""; |
| } |
| int r = 0; |
| int i = 0; |
| int j = 0; |
| StringBuilder sb = new StringBuilder(); |
| while(i < beginIndexes.size() && j < endIndexes.size() && r < nrows) { |
| Pair<Integer, Integer> p1 = beginIndexes.get(i); |
| Pair<Integer, Integer> p2 = endIndexes.get(j); |
| int n = 0; |
| while(p1.getKey() < p2.getKey() || (p1.getKey() == p2.getKey() && p1.getValue() < p2.getValue())) { |
| n++; |
| i++; |
| if(i == beginIndexes.size()) |
| break; |
| p1 = beginIndexes.get(i); |
| } |
| j += n - 1; |
| sb.append(this.sampleRawIndexes[beginIndexes.get(i - n).getKey()].getRaw() |
| .substring(beginIndexes.get(i - n).getValue())); |
| for(int ri = beginIndexes.get(i - n).getKey() + 1; ri < endIndexes.get(j).getKey(); ri++) { |
| sb.append(this.sampleRawIndexes[ri].getRaw()); |
| } |
| sb.append(this.sampleRawIndexes[endIndexes.get(j).getKey()].getRaw() |
| .substring(0, endIndexes.get(j).getValue())).append(endToken); |
| RawIndex rawIndex = new RawIndex(); |
| rawIndex.setRaw(sb.toString()); |
| sb = new StringBuilder(); |
| j++; |
| // update mapping |
| for(int c = 0; c < ncols; c++) { |
| if(mapRow[r][c] != -1) { |
| if(mapRow[r][c] != beginIndexes.get(i - n).getKey()) |
| this.mapCol[r][c] += |
| this.sampleRawIndexes[beginIndexes.get(i - n).getKey()].getRawLength() - |
| beginIndexes.get(i - n).getValue(); |
| else |
| this.mapCol[r][c] -= beginIndexes.get(i - n).getValue(); |
| |
| for(int ci = beginIndexes.get(i - n).getKey() + 1; ci < this.mapRow[r][c]; ci++) |
| this.mapCol[r][c] += this.sampleRawIndexes[ci].getRawLength(); |
| rawIndex.setReservedPositions(mapCol[r][c], mapLen[r][c]); |
| this.mapRow[r][c] = r; |
| } |
| } |
| upRawIndexes[r] = rawIndex; |
| r++; |
| } |
| this.sampleRawIndexes = upRawIndexes; |
| } |
| |
| private ArrayList<Pair<Integer, Integer>> getTokenIndexOnMultiLineRecords(String token) { |
| ArrayList<Pair<Integer, Integer>> result = new ArrayList<>(); |
| |
| for(int ri = 0; ri < this.sampleRawIndexes.length; ri++) { |
| String raw = this.sampleRawIndexes[ri].getRaw(); |
| int index; |
| int fromIndex = 0; |
| do { |
| index = raw.indexOf(token, fromIndex); |
| if(index != -1) { |
| result.add(new Pair<>(ri, index)); |
| fromIndex = index + token.length(); |
| } |
| else |
| break; |
| } |
| while(true); |
| } |
| return result; |
| } |
| |
| @SuppressWarnings("unused") |
| private ArrayList<Pair<Integer, Integer>> getTokenIndexOnMultiLineRecords(String beginToken, String endToken) { |
| ArrayList<Pair<Integer, Integer>> result = new ArrayList<>(); |
| |
| for(int ri = 0; ri < this.sampleRawIndexes.length; ) { |
| String raw = this.sampleRawIndexes[ri].getRaw(); |
| int index; |
| int fromIndex = 0; |
| do { |
| index = raw.indexOf(endToken, fromIndex); |
| if(index != -1) { |
| if(index + endToken.length() + beginToken.length() <= raw.length()) { |
| boolean flag = true; |
| for(int i = index + endToken.length(), j = 0; |
| i < index + endToken.length() + beginToken.length() && flag; |
| i++, j++) { |
| flag = raw.charAt(i) == beginToken.charAt(j); |
| } |
| if(flag) { |
| result.add(new Pair<>(ri, index)); |
| fromIndex = index + beginToken.length() + endToken.length(); |
| } |
| else |
| fromIndex++; |
| } |
| else { |
| if(ri + 1 == this.sampleRawIndexes.length) |
| break; |
| // skip empty rows |
| do { |
| raw = this.sampleRawIndexes[++ri].getRaw(); |
| } |
| while(raw.length() == 0); |
| |
| if(raw.startsWith(beginToken)) { |
| result.add(new Pair<>(ri, 0)); |
| fromIndex = 1; |
| } |
| } |
| } |
| else |
| break; |
| } |
| while(true); |
| ri++; |
| } |
| return result; |
| } |
| |
| private Pair<Set<String>, Set<String>> getNewRefineKeys(LongestCommonSubsequence lcs, String firstKey, |
| ArrayList<String> prefixesRemoved, ArrayList<String> prefixes, Set<String> refineKeys) { |
| |
| Set<String> setRefineLCS = new HashSet<>(); |
| Set<String> newSetRefineLCS = new HashSet<>(); |
| |
| for(String refineKey : refineKeys) { |
| boolean flagRefine = true; |
| boolean isInTheMiddleOfString = false; |
| String[] lcsKey = (refineKey+Lop.OPERAND_DELIMITOR+firstKey).split(Lop.OPERAND_DELIMITOR); |
| ArrayList<String> tmpList = new ArrayList<>(); |
| for(String sk : lcsKey) |
| if(sk.length() > 0) |
| tmpList.add(sk); |
| |
| for(int i = 0; i < prefixes.size() && !isInTheMiddleOfString; i++) { |
| String str = prefixes.get(i); |
| int indexOnString = getIndexOfKeyPatternOnString(str, tmpList, 0); |
| flagRefine &= indexOnString == str.length(); |
| if(!flagRefine) |
| isInTheMiddleOfString = indexOnString != -1; |
| } |
| if(flagRefine) |
| setRefineLCS.add(refineKey); |
| else if(!isInTheMiddleOfString) { |
| for(int i = 0; i < prefixesRemoved.size() ; i++) { |
| String psStr = prefixesRemoved.get(i).substring(0, firstKey.length()); |
| ArrayList<String> list1 = lcs.getLCS(refineKey, psStr); |
| Set<String> set = new HashSet<>(); |
| set.addAll(list1); |
| |
| for(String lcsKeys : set) { |
| // TODO Removed an unlikely argument it should not be a problem. |
| if(setRefineLCS.contains(lcsKeys)) |
| continue; |
| String[] newLCSKey = (lcsKeys+Lop.OPERAND_DELIMITOR+firstKey).split(Lop.OPERAND_DELIMITOR); |
| ArrayList<String> tmpLCSKeyList = new ArrayList<>(); |
| for(String sk : newLCSKey) |
| if(sk.length() > 0) |
| tmpLCSKeyList.add(sk); |
| |
| boolean str1Check = getIndexOfKeyPatternOnString(psStr + firstKey, tmpLCSKeyList, 0) == psStr.length(); |
| if(str1Check) |
| newSetRefineLCS.add(lcsKeys); |
| } |
| } |
| } |
| } |
| return new Pair<>(setRefineLCS, newSetRefineLCS); |
| } |
| |
| private Set<String> getRefineKeysStep(LongestCommonSubsequence lcs, String string1, String string2, |
| String psString1, String psString2, String firstKey){ |
| // remove first key from end of Str1 and Str2 |
| String str1 = string1.substring(0, string1.length() - firstKey.length()); |
| String str2 = string2.substring(0, string2.length() - firstKey.length()); |
| |
| ArrayList<String> list1 = lcs.getLCS(str1, str2); |
| Set<String> setLCS = new HashSet<>(); |
| setLCS.addAll(list1); |
| |
| Set<String> refineKeysStep = new HashSet<>(); |
| for(String lcsKeys : setLCS) { |
| String[] lcsKey = (lcsKeys+Lop.OPERAND_DELIMITOR+firstKey).split(Lop.OPERAND_DELIMITOR); |
| ArrayList<String> tmpList = new ArrayList<>(); |
| for(String sk : lcsKey) |
| if(sk.length() > 0) |
| tmpList.add(sk); |
| |
| boolean str1Check = getIndexOfKeyPatternOnString(psString1, tmpList, 0) == psString1.length(); |
| boolean str2Check = getIndexOfKeyPatternOnString(psString2, tmpList, 0) == psString2.length(); |
| if(str1Check && str2Check) |
| refineKeysStep.add(lcsKeys); |
| } |
| return refineKeysStep; |
| } |
| |
| private ArrayList<String> cleanUPKey(ArrayList<String> keys, ArrayList<String> prefixes){ |
| ArrayList<String> result = new ArrayList<>(); |
| int i = keys.size() -1; |
| for(; i>=0; i--) { |
| boolean flag = true; |
| for(int j =0; j< prefixes.size() && flag; j++) { |
| // TODO find out if used: |
| // String bk = keys.get(i); |
| // int k1 = getIndexOfKeyPatternOnString(prefixes.get(j), i, keys, 0); |
| // int k2 = prefixes.get(j).length(); |
| flag = getIndexOfKeyPatternOnString(prefixes.get(j), i, keys, 0) == prefixes.get(j).length(); |
| } |
| if(flag) |
| break; |
| } |
| if( i == -1) |
| return keys; |
| else { |
| for(int index = i; index< keys.size(); index++) |
| result.add(keys.get(index)); |
| |
| } |
| return result; |
| } |
| |
| |
| private boolean checkExtraKeyForCol(ArrayList<String> keys, String extraKey , ArrayList<String> prefixes){ |
| boolean flag = true; |
| for(int i=0; i<keys.size()-1 && flag; i++) |
| flag = keys.get(i).equals(keys.get(i+1)); |
| if(!flag) |
| return false; |
| for(int j = 0; j < prefixes.size() && flag; j++) { |
| int index = prefixes.get(j).indexOf(extraKey); |
| if(index !=-1) { |
| index += extraKey.length(); |
| flag = getIndexOfKeyPatternOnString(prefixes.get(j), 0, keys, index) == prefixes.get(j).length(); |
| } |
| else |
| flag = false; |
| } |
| return flag; |
| } |
| |
| private Integer getIndexOfKeyPatternOnString(String str, ArrayList<String> key, int beginPos) { |
| return getIndexOfKeyPatternOnString(str,0, key, beginPos); |
| } |
| |
| private Integer getIndexOfKeyPatternOnString(String str, int keyFromIndex,ArrayList<String> key, int beginPos) { |
| int currPos = beginPos; |
| boolean flag = true; |
| for(int i = keyFromIndex; i < key.size(); i++) { |
| int index = str.indexOf(key.get(i), currPos); |
| if(index != -1) |
| currPos = index + key.get(i).length(); |
| else { |
| flag = false; |
| break; |
| } |
| } |
| if(flag) |
| return currPos; |
| else |
| return -1; |
| } |
| |
| @SuppressWarnings("unchecked") |
| private Pair<ArrayList<String>[], HashSet<String>[]> buildColsKeyPatternSingleRow() { |
| ArrayList<String>[] prefixesRemovedReverse = extractAllPrefixStringsOfColsSingleLine(true, true).getKey(); |
| ArrayList<String>[] prefixesRemoved = new ArrayList[ncols]; |
| ArrayList<String>[] prefixes = extractAllPrefixStringsOfColsSingleLine(false, false).getKey(); |
| ArrayList<String>[] suffixes = extractAllSuffixStringsOfColsSingleLine(true); |
| ArrayList<Pair<String, Integer>>[] prefixesRemovedReverseSort = new ArrayList[ncols]; |
| ArrayList<String>[] keys = new ArrayList[ncols]; |
| HashSet<String>[] colSuffixes = new HashSet[ncols]; |
| LongestCommonSubsequence lcs = new LongestCommonSubsequence(); |
| |
| int numThreads = OptimizerUtils.getParallelTextWriteParallelism(); |
| ExecutorService pool = CommonThreadPool.get(numThreads); |
| try { |
| ArrayList<BuildColsKeyPatternSingleRowTask> tasks = new ArrayList<>(); |
| int blklen = (int) Math.ceil((double) ncols / (numThreads * numThreads)); |
| for(int i = 0; i < numThreads; i++) { |
| HashSet<Integer> colIndexes = new HashSet<>(); |
| for(int j = 0; j < numThreads && j * numThreads * blklen + i * blklen < ncols; j++) { |
| int begin = j * numThreads * blklen + i * blklen; |
| int end = Math.min(j * numThreads * blklen + (i + 1) * blklen, ncols); |
| for(int k = begin; k < end; k++) |
| colIndexes.add(k); |
| } |
| tasks.add( |
| new BuildColsKeyPatternSingleRowTask(prefixesRemovedReverse, prefixesRemoved, prefixes, suffixes, |
| prefixesRemovedReverseSort, keys, colSuffixes, lcs, colIndexes)); |
| } |
| |
| for(Future<Object> task : pool.invokeAll(tasks)) |
| task.get(); |
| return new Pair<>(keys, colSuffixes); |
| } |
| catch(Exception e) { |
| throw new RuntimeException("Failed parallel ColsKeyPatternSingleRow.", e); |
| } |
| finally{ |
| pool.shutdown(); |
| } |
| } |
| |
| private class BuildColsKeyPatternSingleRowTask implements Callable<Object> { |
| private final ArrayList<String>[] prefixesRemovedReverse; |
| private final ArrayList<String>[] prefixesRemoved; |
| private final ArrayList<String>[] prefixes; |
| private final ArrayList<String>[] suffixes; |
| private final ArrayList<Pair<String, Integer>>[] prefixesRemovedReverseSort; |
| private final ArrayList<String>[] keys; |
| private final HashSet<String>[] colSuffixes; |
| private final LongestCommonSubsequence lcs; |
| private final HashSet<Integer> colIndexes; |
| |
| public BuildColsKeyPatternSingleRowTask(ArrayList<String>[] prefixesRemovedReverse, |
| ArrayList<String>[] prefixesRemoved, ArrayList<String>[] prefixes, ArrayList<String>[] suffixes, |
| ArrayList<Pair<String, Integer>>[] prefixesRemovedReverseSort, ArrayList<String>[] keys, |
| HashSet<String>[] colSuffixes, LongestCommonSubsequence lcs, HashSet<Integer> colIndexes) { |
| this.prefixesRemovedReverse = prefixesRemovedReverse; |
| this.prefixesRemoved = prefixesRemoved; |
| this.prefixes = prefixes; |
| this.suffixes = suffixes; |
| this.prefixesRemovedReverseSort = prefixesRemovedReverseSort; |
| this.keys = keys; |
| this.colSuffixes = colSuffixes; |
| this.lcs = lcs; |
| this.colIndexes = colIndexes; |
| } |
| |
| @Override |
| public Object call() throws Exception { |
| // Sort prefixesRemovedReverse list |
| for(int c :colIndexes){ |
| keys[c] = new ArrayList<>(); |
| Map<String, ArrayList<Integer>> mapPrefixesRemovedReverse = new HashMap<>(); |
| for(int i=0; i<prefixesRemovedReverse[c].size(); i++) { |
| StringBuilder sb = new StringBuilder(); |
| String str = prefixesRemovedReverse[c].get(i).replaceAll("\\d", Lop.OPERAND_DELIMITOR); |
| for(int j = 0; j< str.length(); j++){ |
| String charStr = str.charAt(j)+""; |
| if(!charStr.equals(Lop.OPERAND_DELIMITOR)) |
| sb.append(charStr); |
| else if(sb.length() == 0 || !(sb.charAt(sb.length() -1)+"").equals(Lop.OPERAND_DELIMITOR)) |
| sb.append(Lop.OPERAND_DELIMITOR); |
| } |
| String sbStr = sb.toString(); |
| if(!mapPrefixesRemovedReverse.containsKey(sbStr)) |
| mapPrefixesRemovedReverse.put(sbStr, new ArrayList<>()); |
| mapPrefixesRemovedReverse.get(sbStr).add(i); |
| } |
| prefixesRemovedReverse[c] = new ArrayList<>(); |
| prefixesRemoved[c] = new ArrayList<>(); |
| prefixesRemovedReverseSort[c] = new ArrayList<>(); |
| |
| for(String s: mapPrefixesRemovedReverse.keySet()){ |
| prefixesRemovedReverseSort[c].add(new Pair<>(s, mapPrefixesRemovedReverse.get(s).get(0))); |
| } |
| prefixesRemovedReverseSort[c].sort(AscendingPairStringComparator); |
| for(Pair<String, Integer> pair: prefixesRemovedReverseSort[c]){ |
| prefixesRemovedReverse[c].add(pair.getKey()); |
| prefixesRemoved[c].add(new StringBuilder(pair.getKey()).reverse().toString()); |
| } |
| } |
| |
| // build patterns: |
| for(int c :colIndexes) { |
| if(prefixesRemoved[c].size() == 1){ |
| keys[c] = new ArrayList<>(); |
| if(prefixesRemoved[c].get(0).length() == 0 || prefixesRemoved[c].get(0).equals(Lop.OPERAND_DELIMITOR)) |
| keys[c].add(""); |
| |
| String[] lcsKey = prefixesRemoved[c].get(0).split(Lop.OPERAND_DELIMITOR); |
| for(String sk : lcsKey) |
| if(sk.length() > 0) |
| keys[c].add(sk); |
| continue; |
| } |
| |
| String firstKey; |
| // STEP 1: find fist key: |
| String selectedString = prefixesRemoved[c].get(0); |
| boolean flag = true; |
| StringBuilder sbToken = new StringBuilder(); |
| sbToken.append(selectedString.charAt(selectedString.length() -1)); |
| for(int i = 2; i < selectedString.length() && flag; i++) { |
| char ch = selectedString.charAt(selectedString.length()-i); |
| for(int j = 1; j < prefixesRemoved[c].size() && flag; j++) { |
| String str = prefixesRemoved[c].get(j); |
| flag = str.charAt(str.length()-i) == ch; |
| } |
| if(flag) |
| sbToken.append(ch); |
| } |
| firstKey = sbToken.reverse().toString(); |
| flag = true; |
| |
| String[] lcsKey = firstKey.split(Lop.OPERAND_DELIMITOR); |
| ArrayList<String> tmpList = new ArrayList<>(); |
| for(String sk : lcsKey) |
| if(sk.length() > 0) |
| tmpList.add(sk); |
| |
| for(int i = 0; i < prefixes[c].size() && flag; i++) |
| flag = getIndexOfKeyPatternOnString(prefixes[c].get(i), tmpList, 0) == prefixes[c].get(i).length(); |
| |
| if(flag) { |
| keys[c] = tmpList; |
| continue; |
| } |
| // STEP 2: add another keys |
| int indexI = 0; |
| int indexJ = 0; |
| Set<String> refineKeysStep = new HashSet<>(); |
| do { |
| for(; indexI < prefixesRemovedReverseSort[c].size() - 1 && refineKeysStep.size() == 0; indexI++) { |
| String str1 = prefixesRemoved[c].get(indexI); |
| String psStr1 = prefixes[c].get(prefixesRemovedReverseSort[c].get(indexI).getValue()); |
| for(indexJ = indexI + 1; |
| indexJ < prefixesRemovedReverseSort[c].size() && refineKeysStep.size() == 0; |
| indexJ++) { |
| String str2 = prefixesRemoved[c].get(indexJ); |
| String psStr2 = prefixes[c].get(prefixesRemovedReverseSort[c].get(indexJ).getValue()); |
| refineKeysStep = getRefineKeysStep(lcs, str1, str2, psStr1, psStr2, firstKey); |
| } |
| } |
| if(indexI < prefixesRemovedReverse[c].size() -1 && indexJ < prefixesRemovedReverse[c].size()) |
| break; |
| |
| do { |
| Pair<Set<String>, Set<String>> pair = getNewRefineKeys(lcs, firstKey, prefixesRemoved[c], prefixes[c], refineKeysStep); |
| refineKeysStep = pair.getKey(); |
| if(pair.getValue().size() == 0) |
| break; |
| else |
| refineKeysStep.addAll(pair.getValue()); |
| } |
| while(true); |
| |
| } while(refineKeysStep.size() == 0); |
| |
| if(refineKeysStep.size() == 0) { |
| // TODO: we have to apply tokenizer |
| } |
| else if(refineKeysStep.size() == 1) { |
| String[] refinedLCSKey = (refineKeysStep.iterator().next()+Lop.OPERAND_DELIMITOR+firstKey).split(Lop.OPERAND_DELIMITOR); |
| keys[c] = new ArrayList<>(); |
| for(String sk : refinedLCSKey) |
| if(sk.length() > 0) |
| keys[c].add(sk); |
| } |
| else{ |
| ArrayList<String> sortedStrings = new ArrayList<>(); |
| sortedStrings.addAll(refineKeysStep); |
| Collections.sort(sortedStrings, AscendingStringLengthComparator); |
| String[] refinedLCSKey = (sortedStrings.get(sortedStrings.size()-1)+Lop.OPERAND_DELIMITOR+firstKey).split(Lop.OPERAND_DELIMITOR); |
| keys[c] = new ArrayList<>(); |
| for(String sk : refinedLCSKey) |
| if(sk.length() > 0) |
| keys[c].add(sk); |
| } |
| } |
| |
| // CleanUP keys: reduce key list if it possible |
| for(int c :colIndexes) { |
| ArrayList<String> cleanUPKeys = cleanUPKey(keys[c], prefixes[c]); |
| |
| // set static col flag |
| Boolean flagFixCol = true; |
| for(int r = 0; r < nrows && flagFixCol && prefixes[c].size() !=nrows; r++){ |
| String rawStr = sampleRawIndexes[r].getRaw(); |
| flagFixCol = getIndexOfKeyPatternOnString(rawStr, cleanUPKeys, 0) !=-1; |
| } |
| staticColIndexes.set(c, flagFixCol); |
| if(!flagFixCol && cleanUPKeys.size() < keys[c].size()){ |
| String extraKey = keys[c].get(keys[c].size()-cleanUPKeys.size()-1); |
| if(checkExtraKeyForCol(cleanUPKeys, extraKey,prefixes[c])){ |
| keys[c] = new ArrayList<>(); |
| keys[c].add(extraKey); |
| keys[c].addAll(cleanUPKeys); |
| } |
| else |
| keys[c] = cleanUPKeys; |
| } |
| else |
| keys[c] = cleanUPKeys; |
| |
| // Build suffixes |
| Set<String> setSuffix = new HashSet<>(); |
| TextTrie suffixTrie = new TextTrie(); |
| for(String su: suffixes[c]) { |
| String[] suffixesList = su.split(Lop.OPERAND_DELIMITOR, -1); |
| if(suffixesList.length > 0) { |
| if(suffixesList.length == 1 && suffixesList[0].length() == 0) |
| continue; |
| if(suffixesList[1].length() < suffixStringLength) |
| setSuffix.add(suffixesList[1]); |
| else |
| setSuffix.add(suffixesList[1].substring(0, suffixStringLength)); |
| } |
| } |
| if(setSuffix.size() == 0) { |
| colSuffixes[c] = new HashSet<>(); |
| continue; |
| } |
| int rowIndexSuffix = 0; |
| for(String ss: setSuffix){ |
| suffixTrie.insert(ss, rowIndexSuffix++); |
| } |
| HashSet<String> colSuffixe = new HashSet<>(); |
| ArrayList<Pair<String, Set<Integer>>> allSuffixes = suffixTrie.getAllKeys(); |
| if(allSuffixes.get(0).getValue().size() == setSuffix.size()) |
| colSuffixe.add(allSuffixes.get(0).getKey()); |
| else { |
| Set<Integer> coveredRowIndexes = new HashSet<>(); |
| for(Pair<String, Set<Integer>> p: allSuffixes){ |
| int currentSize = coveredRowIndexes.size(); |
| coveredRowIndexes.addAll(p.getValue()); |
| if(currentSize != coveredRowIndexes.size()) |
| colSuffixe.add(p.getKey()); |
| } |
| } |
| colSuffixes[c] = colSuffixe; |
| } |
| return new Pair<>(keys, colSuffixes); |
| } |
| } |
| public String getConflictToken(int[] cols) { |
| boolean flagStatic = true; |
| for(int c=0; c<cols.length && flagStatic ; c++){ |
| flagStatic = staticColIndexes.get(cols[c]); |
| } |
| if(flagStatic) |
| return null; |
| |
| int lastColIndex = cols[cols.length - 1]; |
| ArrayList<String> suffixesBetweenBeginEnd = new ArrayList<>(); |
| ArrayList<String> suffixesRefine = extractAllSuffixStringsOfColsSingleLine(lastColIndex, true); |
| Set<String> setSuffixesRefine = new HashSet<>(); |
| setSuffixesRefine.addAll(suffixesRefine); |
| if(setSuffixesRefine.size() == 1 && setSuffixesRefine.iterator().next().length() == 0) |
| return null; |
| |
| int rowIndex; |
| for(int r = 0; r < nrows; r++) { |
| ArrayList<Integer> filledCols = new ArrayList<>(); |
| for(int c: cols){ |
| if(mapCol[r][c] !=-1) |
| filledCols.add(c); |
| } |
| if(filledCols.size() <= 1 || (rowIndex=mapRow[r][filledCols.get(0)]) == -1) |
| continue; |
| |
| int ib = filledCols.get(0); |
| int ie = filledCols.get(filledCols.size() -1); |
| String str = sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][ib] + mapLen[r][ib], mapCol[r][ie]); |
| suffixesBetweenBeginEnd.add(str); |
| } |
| |
| ArrayList<String> containList = new ArrayList<>(); |
| int maxTokenLength = 0; |
| String selectedString = ""; |
| for(String suf : suffixesRefine) { |
| int index = suf.indexOf(Lop.OPERAND_DELIMITOR, 1); |
| if(index == -1) |
| index = suf.length(); |
| String str; |
| if((str = suf.substring(1, index)).length() > 0) { |
| containList.add(str); |
| if(maxTokenLength == 0 || maxTokenLength > str.length()) { |
| maxTokenLength = str.length(); |
| selectedString = str; |
| } |
| } |
| } |
| if(containList.size() == 0) |
| return null; |
| |
| Map<Integer, ArrayList<String>> conflicts = new HashMap<>(); |
| maxTokenLength = Math.min(maxTokenLength, 50); |
| for(int tl = 1; tl < maxTokenLength; tl++) { |
| ArrayList<String> tokens = stringTokenize(selectedString, tl); |
| conflicts.put(tl, new ArrayList<>()); |
| for(String t : tokens) { |
| boolean flag = false; |
| for(String between : suffixesBetweenBeginEnd) { |
| flag = between.contains(t); |
| if(flag) |
| break; |
| } |
| if(!flag) |
| conflicts.get(tl).add(t); |
| } |
| } |
| ArrayList<Pair<String, ArrayList<Integer>>> candidate = new ArrayList<>(); |
| ArrayList<String> tokens = new ArrayList<>(); |
| for(int i = maxTokenLength - 1; i > 0 ; i--) { |
| for(String tc : conflicts.get(i)) { |
| boolean flag = false; |
| for(String currenToken: tokens) |
| if(currenToken.startsWith(tc)){ |
| flag = true; |
| break; |
| } |
| if(flag) |
| continue; |
| else flag = true; |
| ArrayList<Integer> distances = new ArrayList<>(); |
| boolean containZero = false; |
| for(String s : containList) { |
| int index = s.indexOf(tc); |
| flag = index!=-1; |
| if(!flag) |
| break; |
| else { |
| distances.add(index); |
| containZero |= index == 0; |
| } |
| } |
| if(flag) { |
| if(containZero) |
| return tc; |
| candidate.add(new Pair<>(tc, distances)); |
| tokens.add(tc); |
| } |
| } |
| } |
| if(candidate.size() > 0) { |
| candidate.sort(AscendingPairListComparator); |
| return candidate.get(0).getKey(); |
| } |
| else |
| return null; |
| } |
| |
| public boolean isDelimAndSuffixesSame(String delim, int[] cols, String conflict){ |
| HashSet<String>[] ends = properties.endWithValueStrings(); |
| boolean flag = true; |
| for(int c = 0; c<cols.length && flag; c++){ |
| if(ends[cols[c]].size() == 0) |
| continue; |
| if(ends[cols[c]].size() != 1 || !ends[cols[c]].iterator().next().equals(delim)) |
| flag = false; |
| } |
| if(!flag){ |
| for(int r=0; r<nrows; r++){ |
| ArrayList<Integer> c = new ArrayList<>(); |
| for(int ci:cols){ |
| if(mapCol[r][ci] !=-1) |
| c.add(ci); |
| } |
| if(c.size() <=1) |
| continue; |
| int c1 = c.get(0); |
| int rowIndex = mapRow[r][c1]; |
| String str; |
| if(conflict == null) |
| str = sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][c1]); |
| else { |
| int conflictIndex = sampleRawIndexes[rowIndex].getRaw().indexOf(conflict, mapCol[r][c1]); |
| if(conflictIndex!=-1) |
| str =sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][c1], conflictIndex); |
| else |
| str = sampleRawIndexes[rowIndex].getRaw().substring(mapCol[r][c1]); |
| } |
| flag = true; |
| if(str.length() > 0) { |
| String[] strValues = str.split(delim, -1); |
| for(int ci=0; ci<c.size() && flag; ci++){ |
| if(mapCol[r][c.get(ci)]!=-1) |
| flag = mappingValues.compareCellValue(r, c.get(ci), strValues[ci]); |
| } |
| } |
| if(!flag) |
| break; |
| } |
| } |
| return flag; |
| } |
| private ArrayList<String> stringTokenize(String str, int tokenLength) { |
| ArrayList<String> result = new ArrayList<>(); |
| HashSet<String> tokenSet = new HashSet<>(); |
| for(int i = 0; i <= str.length() - tokenLength; i++) { |
| String token = str.substring(i, i + tokenLength); |
| if(!token.contains(Lop.OPERAND_DELIMITOR) && !tokenSet.contains(token)) { |
| result.add(token); |
| tokenSet.add(token); |
| } |
| } |
| return result; |
| } |
| |
| @SuppressWarnings("all") // unused and unsafe |
| private ArrayList<String> optimalKeyPattern(ArrayList<String> keys, ArrayList<String> prefixes) { |
| ArrayList<ArrayList<String>> keysList = new ArrayList<>(); |
| for(int i = 0; i < keys.size() - 1; i++) { |
| String[] keyList = keys.get(i).split("\\s+"); |
| ArrayList<String> orderedKeys = new ArrayList<>(); |
| for(int j = 0; j < keyList.length; j++) |
| orderedKeys.add(keyList[j]); |
| keysList.add(orderedKeys); |
| } |
| int lastIndex = keys.size() - 1; |
| String[] keyList = keys.get(lastIndex).split("\\s+"); |
| if(keyList.length == 0){ |
| return keys; |
| } |
| StringBuilder sbToken = new StringBuilder(keyList[keyList.length - 1]); |
| StringBuilder sbSource = new StringBuilder(keys.get(lastIndex)); |
| int index = sbSource.reverse().indexOf(sbToken.reverse().toString()); |
| keyList = keys.get(lastIndex).substring(0, keys.get(lastIndex).length() - index - sbToken.length()).split("\\s+"); |
| ArrayList<String> orderedKeys = new ArrayList<>(); |
| for(int j = 0; j < keyList.length; j++) |
| orderedKeys.add(keyList[j]); |
| if(orderedKeys.size() > 0) { |
| keysList.add(orderedKeys); |
| orderedKeys = new ArrayList<>(); |
| } |
| orderedKeys.add(sbToken.reverse().toString()); |
| keysList.add(orderedKeys); |
| |
| ArrayList<ArrayList<ArrayList<String>>> fullList = new ArrayList<>(keysList.size()); |
| for(int i = 0; i < keysList.size() - 1; i++) |
| fullList.set(i, selfPropagate(keysList.get(i))); |
| |
| ArrayList<ArrayList<String>> tmpLastKey = new ArrayList<>(); |
| tmpLastKey.add(keysList.get(keysList.size() - 1)); |
| fullList.set(keysList.size() - 1, tmpLastKey); |
| |
| ArrayList<ArrayList<String>> candidates = fullList.get(0); |
| |
| for(int i = 1; i < keysList.size(); i++) { |
| if(candidates.size() * fullList.get(i).size() > 500000) { |
| ArrayList<ArrayList<String>> tmpCandidates = new ArrayList<>(); |
| for(ArrayList<String> tmpList : candidates) { |
| ArrayList<String> tmpRemainList = new ArrayList<>(); |
| for(String s : tmpList) |
| tmpRemainList.add(s); |
| for(int j = i; j < keys.size(); j++) |
| tmpRemainList.add(keys.get(j)); |
| |
| tmpCandidates.add(tmpRemainList); |
| } |
| candidates = new ArrayList<>(); |
| ArrayList<String> tmp = new ArrayList<>(); |
| ArrayList<String> update = checkPattern(tmpCandidates, prefixes).getKey(); |
| for(int j = 0; j < update.size() - (keys.size() - i); j++) { |
| tmp.add(update.get(j)); |
| } |
| candidates.add(tmp); |
| candidates = cartesianProduct(candidates, fullList.get(i)); |
| } |
| else |
| candidates = cartesianProduct(candidates, fullList.get(i)); |
| } |
| Pair<ArrayList<String>, Boolean> update = checkPattern(candidates, prefixes); |
| if(update.getValue()) |
| return update.getKey(); |
| else |
| return keys; |
| } |
| private Pair<ArrayList<String>, Boolean> checkPattern(ArrayList<ArrayList<String>> candidates, ArrayList<String> prefixes) { |
| candidates.sort(AscendingArrayOfStringComparator); |
| int index = -1; |
| for(int i = 0; i < candidates.size(); i++){ |
| boolean tmpCheck = true; |
| for(int j = 0; j< prefixes.size() && tmpCheck; j++){ |
| tmpCheck = getIndexOfKeyPatternOnString(prefixes.get(j), candidates.get(i),0) == prefixes.get(j).length(); |
| } |
| if(tmpCheck){ |
| index = i; |
| break; |
| } |
| } |
| if(index!=-1) |
| return new Pair<>(candidates.get(index), true); |
| else |
| return new Pair<>(new ArrayList<>(), false); |
| } |
| private ArrayList<ArrayList<String>> cartesianProduct(ArrayList<ArrayList<String>> list1, ArrayList<ArrayList<String>> list2) { |
| ArrayList<ArrayList<String>> result = new ArrayList<>(); |
| for(ArrayList<String> stringArrayList : list1) { |
| for(ArrayList<String> strings : list2) { |
| ArrayList<String> tmpList = new ArrayList<>(); |
| for(String s : stringArrayList) |
| if(s.length() > 0) |
| tmpList.add(s); |
| for(String s : strings) |
| if(s.length() > 0) |
| tmpList.add(s); |
| result.add(tmpList); |
| } |
| } |
| return result; |
| } |
| private ArrayList<ArrayList<String>> selfPropagate(ArrayList<String> list) { |
| ArrayList<ArrayList<String>> result = new ArrayList<>(); |
| int n = list.size(); |
| int allMasks = (1 << n); |
| for(int i = 1; i < allMasks; i++) { |
| ArrayList<String> tmp = new ArrayList<>(); |
| for(int j = 0; j < n; j++) { |
| if((i & (1 << j)) > 0) |
| tmp.add(list.get(j)); |
| |
| } |
| result.add(tmp); |
| } |
| ArrayList<String> tmp = new ArrayList<>(); |
| tmp.add(""); |
| result.add(tmp); |
| result.sort(AscendingArrayOfStringComparator); |
| return result; |
| } |
| |
| Comparator<ArrayList<String>> AscendingArrayOfStringComparator = new Comparator<>() { |
| @Override |
| public int compare(ArrayList<String> strings, ArrayList<String> t1) { |
| return Integer.compare(strings.size(), t1.size()); |
| } |
| }; |
| Comparator<String> AscendingStringLengthComparator = new Comparator<>() { |
| @Override |
| public int compare(String s, String t1) { |
| return s.length() - t1.length(); |
| } |
| }; |
| Comparator<Pair<String, Integer>> AscendingPairStringComparator = new Comparator<>() { |
| @Override |
| public int compare(Pair<String, Integer> stringIntegerPair, Pair<String, Integer> t1) { |
| return stringIntegerPair.getKey().length() - t1.getKey().length(); |
| } |
| }; |
| |
| Comparator<Pair<String, ArrayList<Integer>>> AscendingPairListComparator = new Comparator<>() { |
| @Override |
| public int compare(Pair<String, ArrayList<Integer>> stringArrayListPair, Pair<String, ArrayList<Integer>> t1) { |
| boolean flag = true; |
| for(int i=0; i< stringArrayListPair.getValue().size() && flag; i++){ |
| flag = stringArrayListPair.getValue().get(i) > t1.getValue().get(i); |
| } |
| if(flag) |
| return 1; |
| else |
| return -1; |
| } |
| }; |
| } |