blob: 69a070ef4a780826e082eda7ae935ebb7540ad91 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.tool;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.carbondata.common.Strings;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
import org.apache.carbondata.core.datastore.impl.FileFactory;
import org.apache.carbondata.core.indexstore.blockletindex.SegmentIndexFileStore;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.format.BlockletInfo3;
import org.apache.carbondata.format.ColumnSchema;
import org.apache.carbondata.format.FileFooter3;
import org.apache.carbondata.format.IndexHeader;
/**
* A helper to collect all data files, schema file, table status file in a given folder
*/
class FileCollector {

  // running totals accumulated while collecting data files
  private long numBlock;
  private long numShard;
  private long numBlocklet;
  private long numPage;
  private long numRow;
  private long totalDataSize;

  // output lines collected for the caller; all human-readable messages go here
  // (not to stdout) so the tool's caller controls where they are printed
  private List<String> outPuts;

  // file path mapping to file object; insertion order is shard name then
  // numeric part number, established by the sort in collectFiles()
  private LinkedHashMap<String, DataFile> dataFiles = new LinkedHashMap<>();
  private CarbonFile tableStatusFile;
  private CarbonFile schemaFile;

  FileCollector(List<String> outPuts) {
    this.outPuts = outPuts;
  }

  /**
   * Scans {@code dataFolder} (recursively if it is a directory, or the single
   * file itself otherwise) and collects columnar data files, the table status
   * file and the schema file. Streaming files are not supported and are
   * skipped with a warning added to the output list.
   *
   * @param dataFolder folder or single file path to scan
   * @throws IOException if reading a data file's metadata fails
   */
  void collectFiles(String dataFolder) throws IOException {
    Set<String> shards = new HashSet<>();
    CarbonFile folder = FileFactory.getCarbonFile(dataFolder);
    List<CarbonFile> files = new ArrayList<>();
    if (folder.exists()) {
      if (folder.isDirectory()) {
        files = folder.listFiles(true);
      } else {
        files.add(folder);
      }
    }
    List<DataFile> unsortedFiles = new ArrayList<>();
    for (CarbonFile file : files) {
      if (isColumnarFile(file.getName())) {
        DataFile dataFile = new DataFile(file);
        dataFile.collectAllMeta();
        unsortedFiles.add(dataFile);
        collectNum(dataFile.getFooter());
        shards.add(dataFile.getShardName());
        totalDataSize += file.getSize();
      } else if (file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) {
        tableStatusFile = file;
      } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) {
        schemaFile = file;
      } else if (isStreamFile(file.getName())) {
        outPuts.add(("WARN: input path contains streaming file, this tool does not support it yet, "
            + "skipping it..."));
      }
    }
    // order data files by shard name, then by numeric part number within a shard
    Collections.sort(unsortedFiles, new Comparator<DataFile>() {
      @Override
      public int compare(DataFile o1, DataFile o2) {
        if (o1.getShardName().equalsIgnoreCase(o2.getShardName())) {
          // Integer.compare instead of subtraction: a - b can overflow int
          // and yield a wrong sign for extreme part numbers
          return Integer.compare(Integer.parseInt(o1.getPartNo()),
              Integer.parseInt(o2.getPartNo()));
        } else {
          return o1.getShardName().compareTo(o2.getShardName());
        }
      }
    });
    for (DataFile collectedFile : unsortedFiles) {
      this.dataFiles.put(collectedFile.getFilePath(), collectedFile);
    }
    numShard = shards.size();
  }

  /**
   * Accumulates block/blocklet/page/row counts from one data file footer.
   */
  private void collectNum(FileFooter3 footer) {
    numBlock++;
    numBlocklet += footer.blocklet_index_list.size();
    numRow += footer.num_rows;
    for (BlockletInfo3 blockletInfo3 : footer.blocklet_info_list3) {
      numPage += blockletInfo3.number_number_of_pages;
    }
  }

  /**
   * Returns true if {@code fileName} is a columnar (batch-loaded) carbondata
   * file. A carbondata file whose timestamp part is "0" is a streaming file,
   * not a columnar one.
   */
  private boolean isColumnarFile(String fileName) {
    // if the timestamp in file name is "0", it is a streaming file
    return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) &&
        !CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0");
  }

  /**
   * Returns true if {@code fileName} is a streaming carbondata file.
   */
  private boolean isStreamFile(String fileName) {
    // if the timestamp in file name is "0", it is a streaming file
    return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) &&
        CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0");
  }

  // return file path mapping to file object
  LinkedHashMap<String, DataFile> getDataFiles() {
    return dataFiles;
  }

  CarbonFile getTableStatusFile() {
    return tableStatusFile;
  }

  CarbonFile getSchemaFile() {
    return schemaFile;
  }

  int getNumDataFiles() {
    return dataFiles.size();
  }

  /**
   * Appends a summary of the collected files (block/shard/blocklet/page/row
   * counts, total size, and per-block/per-blocklet averages) to the output
   * list. Appends "no data file found" if nothing was collected.
   */
  void printBasicStats() {
    if (dataFiles.size() == 0) {
      // route through outPuts like every other message, so callers that
      // consume the output list (instead of stdout) still see it
      outPuts.add("no data file found");
      return;
    }
    outPuts.add("## Summary");
    String format = String
        .format("total: %,d blocks, %,d shards, %,d blocklets, %,d pages, %,d rows, %s", numBlock,
            numShard, numBlocklet, numPage, numRow, Strings.formatSize(totalDataSize));
    outPuts.add(format);
    // numBlock >= 1 is guaranteed by the emptiness check above; footers are
    // assumed to contain at least one blocklet so numBlocklet >= 1 as well
    String format1 = String.format("avg: %s/block, %s/blocklet, %,d rows/block, %,d rows/blocklet",
        Strings.formatSize((float) totalDataSize / numBlock),
        Strings.formatSize((float) totalDataSize / numBlocklet), numRow / numBlock,
        numRow / numBlocklet);
    outPuts.add(format1);
  }

  /**
   * Builds a comma-separated list of the sort column names, i.e. the dimension
   * columns that carry the SORT_COLUMNS property. Returns "" when there are
   * no sort columns.
   */
  private String makeSortColumnsString(List<ColumnSchema> columnList) {
    StringBuilder builder = new StringBuilder();
    for (ColumnSchema column : columnList) {
      if (column.isDimension()) {
        Map<String, String> properties = column.getColumnProperties();
        if (properties != null) {
          if (properties.get(CarbonCommonConstants.SORT_COLUMNS) != null) {
            builder.append(column.column_name).append(",");
          }
        }
      }
    }
    if (builder.length() > 1) {
      // drop the trailing comma
      return builder.substring(0, builder.length() - 1);
    } else {
      return "";
    }
  }

  /**
   * Reads all index files in {@code segmentFolder} and appends either
   * "sorted by &lt;columns&gt;" or "unsorted" to the output list. The segment
   * counts as sorted only when every index file agrees on both the sort flag
   * and the sort columns.
   *
   * @param segmentFolder path of the segment folder containing index files
   * @throws IOException if reading an index header fails
   * @throws IllegalArgumentException if the folder contains no index files
   */
  public void collectSortColumns(String segmentFolder) throws IOException {
    CarbonFile[] files = SegmentIndexFileStore.getCarbonIndexFiles(
        segmentFolder, FileFactory.getConfiguration());
    if (files.length == 0) {
      throw new IllegalArgumentException("\"" + segmentFolder + "\" is not a valid Segment Folder");
    }
    Set<Boolean> isSortSet = new HashSet<>();
    Set<String> sortColumnsSet = new HashSet<>();
    for (CarbonFile file : files) {
      IndexHeader indexHeader = SegmentIndexFileStore.readIndexHeader(
          file.getCanonicalPath(), FileFactory.getConfiguration());
      if (indexHeader != null) {
        if (indexHeader.isSetIs_sort()) {
          isSortSet.add(indexHeader.is_sort);
          if (indexHeader.is_sort) {
            sortColumnsSet.add(makeSortColumnsString(indexHeader.getTable_columns()));
          }
        } else {
          // if is_sort is not set, it will be old store and consider as local_sort by default.
          sortColumnsSet.add(makeSortColumnsString(indexHeader.getTable_columns()));
        }
      }
      // more than one distinct answer already proves inconsistency; stop early
      if (isSortSet.size() >= 2 || sortColumnsSet.size() >= 2) {
        break;
      }
    }
    // for all index files, sort_columns should be same
    if (isSortSet.size() <= 1 && sortColumnsSet.size() == 1) {
      outPuts.add("sorted by " + sortColumnsSet.iterator().next());
    } else {
      outPuts.add("unsorted");
    }
  }

  /**
   * Closes all collected data files, releasing their underlying resources.
   */
  public void close() throws IOException {
    for (DataFile file : dataFiles.values()) {
      file.close();
    }
  }
}