parquet-cli/src/main/java/org/apache/parquet/cli/Util.java - parquet-mr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.parquet.cli;

 import com.google.common.base.Ascii;
 import com.google.common.base.Joiner;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
 import com.google.common.hash.HashCode;

 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.Encoding;
 import org.apache.parquet.column.EncodingStats;
 import org.apache.parquet.column.statistics.Statistics;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.Type;
 import java.util.Set;

 import static org.apache.parquet.column.Encoding.BIT_PACKED;
 import static org.apache.parquet.column.Encoding.DELTA_BINARY_PACKED;
 import static org.apache.parquet.column.Encoding.DELTA_BYTE_ARRAY;
 import static org.apache.parquet.column.Encoding.PLAIN;
 import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
 import static org.apache.parquet.column.Encoding.RLE;
 import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
 import static org.apache.parquet.format.Encoding.DELTA_LENGTH_BYTE_ARRAY;


 public class Util {

   private static final long KB = 1024;
   private static final long MB = 1024 * KB;
   private static final long GB = 1024 * MB;
   private static final long TB = 1024 * GB;

   public static String humanReadable(float bytes) {
     if (bytes > TB) {
       return String.format("%.03f TB", bytes / TB);
     } else if (bytes > GB) {
       return String.format("%.03f GB", bytes / GB);
     } else if (bytes > MB) {
       return String.format("%.03f MB", bytes / MB);
     } else if (bytes > KB) {
       return String.format("%.03f kB", bytes / KB);
     } else {
       return String.format("%.02f B", bytes);
     }
   }

   public static String humanReadable(long bytes) {
     if (bytes > TB) {
       return String.format("%.03f TB", ((float) bytes) / TB);
     } else if (bytes > GB) {
       return String.format("%.03f GB", ((float) bytes) / GB);
     } else if (bytes > MB) {
       return String.format("%.03f MB", ((float) bytes) / MB);
     } else if (bytes > KB) {
       return String.format("%.03f kB", ((float) bytes) / KB);
     } else {
       return String.format("%d B", bytes);
     }
   }

   @Deprecated
   public static String minMaxAsString(Statistics stats, OriginalType annotation) {
     return minMaxAsString(stats);
   }

   public static String minMaxAsString(Statistics stats) {
     if (stats == null) {
       return "no stats";
     }
     if (!stats.hasNonNullValue()) {
       return "";
     }
     return String.format("%s / %s", humanReadable(stats.minAsString(), 30), humanReadable(stats.maxAsString(), 30));
   }

   @Deprecated
   public static String toString(Statistics stats, long count, OriginalType annotation) {
     return toString(stats, count);
   }

   public static String toString(Statistics stats, long count) {
     if (stats == null) {
       return "no stats";
     }
     return String.format("min: %s max: %s nulls: %d/%d",
         humanReadable(stats.minAsString(), 30), humanReadable(stats.maxAsString(), 30), stats.getNumNulls(), count);
   }

   public static String humanReadable(String str, int len) {
     if (str == null) {
       return "null";
     }

     StringBuilder sb = new StringBuilder();
     sb.append("\"");
     if (str.length() > len - 2) {
       sb.append(str.substring(0, len - 5)).append("...");
     } else {
       sb.append(str);
     }
     sb.append("\"");

     return sb.toString();
   }

   public static String humanReadable(byte[] bytes, int len) {
     Preconditions.checkArgument(len >= 5, "Display length must be minimum 5");
     if (bytes == null || bytes.length == 0) {
       return "null";
     }

     final String asString = HashCode.fromBytes(bytes).toString();
     return "0x" + Ascii.truncate(asString, len - 2, "...");
   }

   public static String shortCodec(CompressionCodecName codec) {
     switch (codec) {
       case UNCOMPRESSED:
         return "_";
       case SNAPPY:
         return "S";
       case GZIP:
         return "G";
       case LZO:
         return "L";
       case BROTLI:
         return "B";
       case LZ4:
         return "4";
       case ZSTD:
         return "Z";
       default:
         return "?";
     }
   }

   public static String encodingAsString(Encoding encoding, boolean isDict) {
     switch (encoding) {
       case PLAIN:
         return "_";
       case PLAIN_DICTIONARY:
         // data pages use RLE, dictionary pages use plain
         return isDict ? "_" : "R";
       case RLE_DICTIONARY:
         return "R";
       case DELTA_BINARY_PACKED:
       case DELTA_LENGTH_BYTE_ARRAY:
       case DELTA_BYTE_ARRAY:
         return "D";
       default:
         return "?";
     }
   }

   public static String encodingStatsAsString(EncodingStats encodingStats) {
     StringBuilder sb = new StringBuilder();
     if (encodingStats.hasDictionaryPages()) {
       for (Encoding encoding: encodingStats.getDictionaryEncodings()) {
         sb.append(encodingAsString(encoding, true));
       }
       sb.append(" ");
     } else {
       sb.append("  ");
     }

     Set<Encoding> encodings = encodingStats.getDataEncodings();
     if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
       sb.append("R");
     }
     if (encodings.contains(PLAIN)) {
       sb.append("_");
     }
     if (encodings.contains(DELTA_BYTE_ARRAY) ||
         encodings.contains(DELTA_BINARY_PACKED) ||
         encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
       sb.append("D");
     }

     // Check for fallback and add a flag
     if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
       sb.append(" F");
     }

     return sb.toString();
   }

   public static String encodingsAsString(Set<Encoding> encodings, ColumnDescriptor desc) {
     StringBuilder sb = new StringBuilder();
     if (encodings.contains(RLE) || encodings.contains(BIT_PACKED)) {
       sb.append(desc.getMaxDefinitionLevel() == 0 ? "B" : "R");
       sb.append(desc.getMaxRepetitionLevel() == 0 ? "B" : "R");
       if (encodings.contains(PLAIN_DICTIONARY)) {
         sb.append("R");
       }
       if (encodings.contains(PLAIN)) {
         sb.append("_");
       }
     } else {
       sb.append("RR");
       if (encodings.contains(RLE_DICTIONARY)) {
         sb.append("R");
       }
       if (encodings.contains(PLAIN)) {
         sb.append("_");
       }
       if (encodings.contains(DELTA_BYTE_ARRAY) ||
           encodings.contains(DELTA_BINARY_PACKED) ||
           encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
         sb.append("D");
       }
     }
     return sb.toString();
   }

   private static final Splitter DOT = Splitter.on('.');

   public static ColumnDescriptor descriptor(String column, MessageType schema) {
     String[] path = Iterables.toArray(DOT.split(column), String.class);
     Preconditions.checkArgument(schema.containsPath(path),
         "Schema doesn't have column: " + column);
     return schema.getColumnDescription(path);
   }

   public static String columnName(ColumnDescriptor desc) {
     return Joiner.on('.').join(desc.getPath());
   }

   public static PrimitiveType primitive(MessageType schema, String[] path) {
     Type current = schema;
     for (String part : path) {
       current = current.asGroupType().getType(part);
       if (current.isPrimitive()) {
         return current.asPrimitiveType();
       }
     }
     return null;
   }

   public static PrimitiveType primitive(String column, MessageType schema) {
     String[] path = Iterables.toArray(DOT.split(column), String.class);
     Preconditions.checkArgument(schema.containsPath(path),
         "Schema doesn't have column: " + column);
     return primitive(schema, path);
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.parquet.cli;

	import com.google.common.base.Ascii;
	import com.google.common.base.Joiner;
	import com.google.common.base.Preconditions;
	import com.google.common.base.Splitter;
	import com.google.common.collect.Iterables;
	import com.google.common.hash.HashCode;

	import org.apache.parquet.column.ColumnDescriptor;
	import org.apache.parquet.column.Encoding;
	import org.apache.parquet.column.EncodingStats;
	import org.apache.parquet.column.statistics.Statistics;
	import org.apache.parquet.hadoop.metadata.CompressionCodecName;
	import org.apache.parquet.schema.MessageType;
	import org.apache.parquet.schema.OriginalType;
	import org.apache.parquet.schema.PrimitiveType;
	import org.apache.parquet.schema.Type;
	import java.util.Set;

	import static org.apache.parquet.column.Encoding.BIT_PACKED;
	import static org.apache.parquet.column.Encoding.DELTA_BINARY_PACKED;
	import static org.apache.parquet.column.Encoding.DELTA_BYTE_ARRAY;
	import static org.apache.parquet.column.Encoding.PLAIN;
	import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
	import static org.apache.parquet.column.Encoding.RLE;
	import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
	import static org.apache.parquet.format.Encoding.DELTA_LENGTH_BYTE_ARRAY;


	public class Util {

	private static final long KB = 1024;
	private static final long MB = 1024 * KB;
	private static final long GB = 1024 * MB;
	private static final long TB = 1024 * GB;

	public static String humanReadable(float bytes) {
	if (bytes > TB) {
	return String.format("%.03f TB", bytes / TB);
	} else if (bytes > GB) {
	return String.format("%.03f GB", bytes / GB);
	} else if (bytes > MB) {
	return String.format("%.03f MB", bytes / MB);
	} else if (bytes > KB) {
	return String.format("%.03f kB", bytes / KB);
	} else {
	return String.format("%.02f B", bytes);
	}
	}

	public static String humanReadable(long bytes) {
	if (bytes > TB) {
	return String.format("%.03f TB", ((float) bytes) / TB);
	} else if (bytes > GB) {
	return String.format("%.03f GB", ((float) bytes) / GB);
	} else if (bytes > MB) {
	return String.format("%.03f MB", ((float) bytes) / MB);
	} else if (bytes > KB) {
	return String.format("%.03f kB", ((float) bytes) / KB);
	} else {
	return String.format("%d B", bytes);
	}
	}

	@Deprecated
	public static String minMaxAsString(Statistics stats, OriginalType annotation) {
	return minMaxAsString(stats);
	}

	public static String minMaxAsString(Statistics stats) {
	if (stats == null) {
	return "no stats";
	}
	if (!stats.hasNonNullValue()) {
	return "";
	}
	return String.format("%s / %s", humanReadable(stats.minAsString(), 30), humanReadable(stats.maxAsString(), 30));
	}

	@Deprecated
	public static String toString(Statistics stats, long count, OriginalType annotation) {
	return toString(stats, count);
	}

	public static String toString(Statistics stats, long count) {
	if (stats == null) {
	return "no stats";
	}
	return String.format("min: %s max: %s nulls: %d/%d",
	humanReadable(stats.minAsString(), 30), humanReadable(stats.maxAsString(), 30), stats.getNumNulls(), count);
	}

	public static String humanReadable(String str, int len) {
	if (str == null) {
	return "null";
	}

	StringBuilder sb = new StringBuilder();
	sb.append("\"");
	if (str.length() > len - 2) {
	sb.append(str.substring(0, len - 5)).append("...");
	} else {
	sb.append(str);
	}
	sb.append("\"");

	return sb.toString();
	}

	public static String humanReadable(byte[] bytes, int len) {
	Preconditions.checkArgument(len >= 5, "Display length must be minimum 5");
	if (bytes == null \|\| bytes.length == 0) {
	return "null";
	}

	final String asString = HashCode.fromBytes(bytes).toString();
	return "0x" + Ascii.truncate(asString, len - 2, "...");
	}

	public static String shortCodec(CompressionCodecName codec) {
	switch (codec) {
	case UNCOMPRESSED:
	return "_";
	case SNAPPY:
	return "S";
	case GZIP:
	return "G";
	case LZO:
	return "L";
	case BROTLI:
	return "B";
	case LZ4:
	return "4";
	case ZSTD:
	return "Z";
	default:
	return "?";
	}
	}

	public static String encodingAsString(Encoding encoding, boolean isDict) {
	switch (encoding) {
	case PLAIN:
	return "_";
	case PLAIN_DICTIONARY:
	// data pages use RLE, dictionary pages use plain
	return isDict ? "_" : "R";
	case RLE_DICTIONARY:
	return "R";
	case DELTA_BINARY_PACKED:
	case DELTA_LENGTH_BYTE_ARRAY:
	case DELTA_BYTE_ARRAY:
	return "D";
	default:
	return "?";
	}
	}

	public static String encodingStatsAsString(EncodingStats encodingStats) {
	StringBuilder sb = new StringBuilder();
	if (encodingStats.hasDictionaryPages()) {
	for (Encoding encoding: encodingStats.getDictionaryEncodings()) {
	sb.append(encodingAsString(encoding, true));
	}
	sb.append(" ");
	} else {
	sb.append(" ");
	}

	Set<Encoding> encodings = encodingStats.getDataEncodings();
	if (encodings.contains(RLE_DICTIONARY) \|\| encodings.contains(PLAIN_DICTIONARY)) {
	sb.append("R");
	}
	if (encodings.contains(PLAIN)) {
	sb.append("_");
	}
	if (encodings.contains(DELTA_BYTE_ARRAY) \|\|
	encodings.contains(DELTA_BINARY_PACKED) \|\|
	encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
	sb.append("D");
	}

	// Check for fallback and add a flag
	if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
	sb.append(" F");
	}

	return sb.toString();
	}

	public static String encodingsAsString(Set<Encoding> encodings, ColumnDescriptor desc) {
	StringBuilder sb = new StringBuilder();
	if (encodings.contains(RLE) \|\| encodings.contains(BIT_PACKED)) {
	sb.append(desc.getMaxDefinitionLevel() == 0 ? "B" : "R");
	sb.append(desc.getMaxRepetitionLevel() == 0 ? "B" : "R");
	if (encodings.contains(PLAIN_DICTIONARY)) {
	sb.append("R");
	}
	if (encodings.contains(PLAIN)) {
	sb.append("_");
	}
	} else {
	sb.append("RR");
	if (encodings.contains(RLE_DICTIONARY)) {
	sb.append("R");
	}
	if (encodings.contains(PLAIN)) {
	sb.append("_");
	}
	if (encodings.contains(DELTA_BYTE_ARRAY) \|\|
	encodings.contains(DELTA_BINARY_PACKED) \|\|
	encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
	sb.append("D");
	}
	}
	return sb.toString();
	}

	private static final Splitter DOT = Splitter.on('.');

	public static ColumnDescriptor descriptor(String column, MessageType schema) {
	String[] path = Iterables.toArray(DOT.split(column), String.class);
	Preconditions.checkArgument(schema.containsPath(path),
	"Schema doesn't have column: " + column);
	return schema.getColumnDescription(path);
	}

	public static String columnName(ColumnDescriptor desc) {
	return Joiner.on('.').join(desc.getPath());
	}

	public static PrimitiveType primitive(MessageType schema, String[] path) {
	Type current = schema;
	for (String part : path) {
	current = current.asGroupType().getType(part);
	if (current.isPrimitive()) {
	return current.asPrimitiveType();
	}
	}
	return null;
	}

	public static PrimitiveType primitive(String column, MessageType schema) {
	String[] path = Iterables.toArray(DOT.split(column), String.class);
	Preconditions.checkArgument(schema.containsPath(path),
	"Schema doesn't have column: " + column);
	return primitive(schema, path);
	}

	}