blob: 69a070ef4a780826e082eda7ae935ebb7540ad91 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.tool;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.carbondata.common.Strings;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
import org.apache.carbondata.core.datastore.impl.FileFactory;
import org.apache.carbondata.core.indexstore.blockletindex.SegmentIndexFileStore;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.format.BlockletInfo3;
import org.apache.carbondata.format.ColumnSchema;
import org.apache.carbondata.format.FileFooter3;
import org.apache.carbondata.format.IndexHeader;
/**
* A helper to collect all data files, schema file, table status file in a given folder
*/
class FileCollector {

  // running totals accumulated while collecting data files
  private long numBlock;
  private long numShard;
  private long numBlocklet;
  private long numPage;
  private long numRow;
  private long totalDataSize;

  // output lines collected for the caller; all human-readable messages go here
  // (not to stdout) so the tool's caller controls where they are printed
  private List<String> outPuts;

  // file path mapping to file object; insertion order is shard name then
  // numeric part number, established by the sort in collectFiles()
  private LinkedHashMap<String, DataFile> dataFiles = new LinkedHashMap<>();
  private CarbonFile tableStatusFile;
  private CarbonFile schemaFile;

  FileCollector(List<String> outPuts) {
    this.outPuts = outPuts;
  }

  /**
   * Scans {@code dataFolder} (recursively if it is a directory, or the single
   * file itself otherwise) and collects columnar data files, the table status
   * file and the schema file. Streaming files are not supported and are
   * skipped with a warning added to the output list.
   *
   * @param dataFolder folder or single file path to scan
   * @throws IOException if reading a data file's metadata fails
   */
  void collectFiles(String dataFolder) throws IOException {
    Set<String> shards = new HashSet<>();
    CarbonFile folder = FileFactory.getCarbonFile(dataFolder);
    List<CarbonFile> files = new ArrayList<>();
    if (folder.exists()) {
      if (folder.isDirectory()) {
        files = folder.listFiles(true);
      } else {
        files.add(folder);
      }
    }
    List<DataFile> unsortedFiles = new ArrayList<>();
    for (CarbonFile file : files) {
      if (isColumnarFile(file.getName())) {
        DataFile dataFile = new DataFile(file);
        dataFile.collectAllMeta();
        unsortedFiles.add(dataFile);
        collectNum(dataFile.getFooter());
        shards.add(dataFile.getShardName());
        totalDataSize += file.getSize();
      } else if (file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) {
        tableStatusFile = file;
      } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) {
        schemaFile = file;
      } else if (isStreamFile(file.getName())) {
        outPuts.add(("WARN: input path contains streaming file, this tool does not support it yet, "
            + "skipping it..."));
      }
    }
    // order data files by shard name, then by numeric part number within a shard
    Collections.sort(unsortedFiles, new Comparator<DataFile>() {
      @Override
      public int compare(DataFile o1, DataFile o2) {
        if (o1.getShardName().equalsIgnoreCase(o2.getShardName())) {
          // Integer.compare instead of subtraction: a - b can overflow int
          // and yield a wrong sign for extreme part numbers
          return Integer.compare(Integer.parseInt(o1.getPartNo()),
              Integer.parseInt(o2.getPartNo()));
        } else {
          return o1.getShardName().compareTo(o2.getShardName());
        }
      }
    });
    for (DataFile collectedFile : unsortedFiles) {
      this.dataFiles.put(collectedFile.getFilePath(), collectedFile);
    }
    numShard = shards.size();
  }

  /**
   * Accumulates block/blocklet/page/row counts from one data file footer.
   */
  private void collectNum(FileFooter3 footer) {
    numBlock++;
    numBlocklet += footer.blocklet_index_list.size();
    numRow += footer.num_rows;
    for (BlockletInfo3 blockletInfo3 : footer.blocklet_info_list3) {
      numPage += blockletInfo3.number_number_of_pages;
    }
  }

  /**
   * Returns true if {@code fileName} is a columnar (batch-loaded) carbondata
   * file. A carbondata file whose timestamp part is "0" is a streaming file,
   * not a columnar one.
   */
  private boolean isColumnarFile(String fileName) {
    // if the timestamp in file name is "0", it is a streaming file
    return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) &&
        !CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0");
  }

  /**
   * Returns true if {@code fileName} is a streaming carbondata file.
   */
  private boolean isStreamFile(String fileName) {
    // if the timestamp in file name is "0", it is a streaming file
    return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) &&
        CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0");
  }

  // return file path mapping to file object
  LinkedHashMap<String, DataFile> getDataFiles() {
    return dataFiles;
  }

  CarbonFile getTableStatusFile() {
    return tableStatusFile;
  }

  CarbonFile getSchemaFile() {
    return schemaFile;
  }

  int getNumDataFiles() {
    return dataFiles.size();
  }

  /**
   * Appends a summary of the collected files (block/shard/blocklet/page/row
   * counts, total size, and per-block/per-blocklet averages) to the output
   * list. Appends "no data file found" if nothing was collected.
   */
  void printBasicStats() {
    if (dataFiles.size() == 0) {
      // route through outPuts like every other message, so callers that
      // consume the output list (instead of stdout) still see it
      outPuts.add("no data file found");
      return;
    }
    outPuts.add("## Summary");
    String format = String
        .format("total: %,d blocks, %,d shards, %,d blocklets, %,d pages, %,d rows, %s", numBlock,
            numShard, numBlocklet, numPage, numRow, Strings.formatSize(totalDataSize));
    outPuts.add(format);
    // numBlock >= 1 is guaranteed by the emptiness check above; footers are
    // assumed to contain at least one blocklet so numBlocklet >= 1 as well
    String format1 = String.format("avg: %s/block, %s/blocklet, %,d rows/block, %,d rows/blocklet",
        Strings.formatSize((float) totalDataSize / numBlock),
        Strings.formatSize((float) totalDataSize / numBlocklet), numRow / numBlock,
        numRow / numBlocklet);
    outPuts.add(format1);
  }

  /**
   * Builds a comma-separated list of the sort column names, i.e. the dimension
   * columns that carry the SORT_COLUMNS property. Returns "" when there are
   * no sort columns.
   */
  private String makeSortColumnsString(List<ColumnSchema> columnList) {
    StringBuilder builder = new StringBuilder();
    for (ColumnSchema column : columnList) {
      if (column.isDimension()) {
        Map<String, String> properties = column.getColumnProperties();
        if (properties != null) {
          if (properties.get(CarbonCommonConstants.SORT_COLUMNS) != null) {
            builder.append(column.column_name).append(",");
          }
        }
      }
    }
    if (builder.length() > 1) {
      // drop the trailing comma
      return builder.substring(0, builder.length() - 1);
    } else {
      return "";
    }
  }

  /**
   * Reads all index files in {@code segmentFolder} and appends either
   * "sorted by &lt;columns&gt;" or "unsorted" to the output list. The segment
   * counts as sorted only when every index file agrees on both the sort flag
   * and the sort columns.
   *
   * @param segmentFolder path of the segment folder containing index files
   * @throws IOException if reading an index header fails
   * @throws IllegalArgumentException if the folder contains no index files
   */
  public void collectSortColumns(String segmentFolder) throws IOException {
    CarbonFile[] files = SegmentIndexFileStore.getCarbonIndexFiles(
        segmentFolder, FileFactory.getConfiguration());
    if (files.length == 0) {
      throw new IllegalArgumentException("\"" + segmentFolder + "\" is not a valid Segment Folder");
    }
    Set<Boolean> isSortSet = new HashSet<>();
    Set<String> sortColumnsSet = new HashSet<>();
    for (CarbonFile file : files) {
      IndexHeader indexHeader = SegmentIndexFileStore.readIndexHeader(
          file.getCanonicalPath(), FileFactory.getConfiguration());
      if (indexHeader != null) {
        if (indexHeader.isSetIs_sort()) {
          isSortSet.add(indexHeader.is_sort);
          if (indexHeader.is_sort) {
            sortColumnsSet.add(makeSortColumnsString(indexHeader.getTable_columns()));
          }
        } else {
          // if is_sort is not set, it will be old store and consider as local_sort by default.
          sortColumnsSet.add(makeSortColumnsString(indexHeader.getTable_columns()));
        }
      }
      // more than one distinct answer already proves inconsistency; stop early
      if (isSortSet.size() >= 2 || sortColumnsSet.size() >= 2) {
        break;
      }
    }
    // for all index files, sort_columns should be same
    if (isSortSet.size() <= 1 && sortColumnsSet.size() == 1) {
      outPuts.add("sorted by " + sortColumnsSet.iterator().next());
    } else {
      outPuts.add("unsorted");
    }
  }

  /**
   * Closes all collected data files, releasing their underlying resources.
   */
  public void close() throws IOException {
    for (DataFile file : dataFiles.values()) {
      file.close();
    }
  }
}