blob: e34f8faaebd079047d273e16f0f3f926f23b8e24 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.catalog;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.impala.thrift.TColumn;
import org.apache.impala.thrift.TNetworkAddress;
import org.apache.impala.thrift.TTable;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TProtocol;
import com.google.common.base.Preconditions;
import com.google.common.collect.Interner;
import com.google.common.collect.Interners;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
 * Static utility methods for interning various objects used in the catalog. In many
 * cases, there are common strings that show up in lots of objects (e.g. class names,
 * database names, user names, property names), and interning these strings can result
 * in significant memory savings.
 *
 * All methods tolerate null arguments: the in-place variants return without doing
 * anything and the transforming variants return null.
 */
public abstract class CatalogInterners {
  /**
   * Interner for the immutable network address objects handed out by
   * {@link #internNetworkAddress(TNetworkAddress)}.
   */
  private static final Interner<TNetworkAddress> NETWORK_ADDRESS_INTERNER =
      Interners.newWeakInterner();

  /**
   * Interner used for relatively-low-cardinality strings that aren't quite
   * low enough cardinality to be safe to use the JVM interner. The JVM's interner
   * uses a hashtable with a fixed number of buckets, so if we put anything
   * with unbounded cardinality in there, we risk a high number of collisions.
   *
   * See https://shipilev.net/jvm-anatomy-park/10-string-intern/ for more details
   * on those risks.
   */
  private static final Interner<String> STRING_INTERNER = Interners.newWeakInterner();

  // Do not instantiate
  private CatalogInterners() {}

  /**
   * Intern low-cardinality fields of a metastore table object in-place. This covers
   * the database name, owner, table type, table parameters, the storage descriptor
   * and the partition key schemas. Fields that are not set are left untouched.
   *
   * @param msTable the metastore table to modify; null is a no-op
   */
  public static void internFieldsInPlace(
      org.apache.hadoop.hive.metastore.api.Table msTable) {
    if (msTable == null) return;
    // Database name is typically low cardinality, but still potentially risky
    // to put in the JVM string table, so we'll use our own interner instead.
    if (msTable.isSetDbName()) {
      msTable.setDbName(STRING_INTERNER.intern(msTable.getDbName()));
    }
    if (msTable.isSetOwner()) {
      msTable.setOwner(STRING_INTERNER.intern(msTable.getOwner()));
    }
    if (msTable.isSetParameters()) {
      msTable.setParameters(internParameters(msTable.getParameters()));
    }
    if (msTable.isSetTableType()) {
      msTable.setTableType(STRING_INTERNER.intern(msTable.getTableType()));
    }
    if (msTable.isSetSd()) {
      internFieldsInPlace(msTable.getSd());
    }
    if (msTable.isSetPartitionKeys()) {
      for (FieldSchema fs : msTable.getPartitionKeys()) internFieldsInPlace(fs);
    }
  }

  /**
   * Intern low-cardinality fields of a thrift table object in-place: the names of its
   * clustering columns and columns and, if an HDFS table descriptor is set, its null
   * partition key value, null column value and network addresses.
   *
   * @param table the thrift table to modify; null is a no-op
   */
  public static void internFieldsInPlace(TTable table) {
    if (table == null) return;
    internFieldsInPlace(table.getClustering_columns());
    internFieldsInPlace(table.getColumns());
    if (table.isSetHdfs_table()) {
      // Guard against missing values so that a partially populated descriptor does
      // not fail with a NullPointerException; a missing value is simply left as-is.
      String nullPartitionKeyValue = table.hdfs_table.getNullPartitionKeyValue();
      if (nullPartitionKeyValue != null) {
        table.hdfs_table.setNullPartitionKeyValue(nullPartitionKeyValue.intern());
      }
      String nullColumnValue = table.hdfs_table.getNullColumnValue();
      if (nullColumnValue != null) {
        table.hdfs_table.setNullColumnValue(nullColumnValue.intern());
      }
      table.hdfs_table.setNetwork_addresses(internAddresses(
          table.hdfs_table.getNetwork_addresses()));
    }
  }

  /**
   * Intern the column name of every column in the given list, in-place. Null lists,
   * null elements and columns without a name are skipped.
   */
  private static void internFieldsInPlace(List<TColumn> cols) {
    if (cols == null) return;
    for (TColumn col : cols) {
      if (col != null && col.isSetColumnName()) {
        col.setColumnName(STRING_INTERNER.intern(col.getColumnName()));
      }
    }
  }

  /**
   * Intern low-cardinality fields in the given storage descriptor: its column
   * schemas, input and output format names, and its parameters.
   *
   * @param sd the storage descriptor to modify; null is a no-op
   */
  public static void internFieldsInPlace(StorageDescriptor sd) {
    if (sd == null) return;
    if (sd.isSetCols()) {
      for (FieldSchema fs : sd.getCols()) internFieldsInPlace(fs);
    }
    if (sd.isSetInputFormat()) {
      sd.setInputFormat(STRING_INTERNER.intern(sd.getInputFormat()));
    }
    if (sd.isSetOutputFormat()) {
      sd.setOutputFormat(STRING_INTERNER.intern(sd.getOutputFormat()));
    }
    if (sd.isSetParameters()) {
      sd.setParameters(internParameters(sd.getParameters()));
    }
  }

  /**
   * Intern low-cardinality fields (name and type) in the given FieldSchema.
   */
  private static void internFieldsInPlace(FieldSchema fs) {
    if (fs == null) return;
    if (fs.isSetName()) {
      fs.setName(STRING_INTERNER.intern(fs.getName()));
    }
    if (fs.isSetType()) {
      fs.setType(STRING_INTERNER.intern(fs.getType()));
    }
  }

  /**
   * Transform the given parameters map (for table or partition parameters) to intern
   * its keys and commonly-used values. The input map is not modified.
   *
   * @param parameters the map to transform; may be null
   * @return a new map with the same entries as 'parameters', or null if 'parameters'
   *     is null. Null keys and null values are carried over unchanged.
   */
  public static Map<String, String> internParameters(Map<String, String> parameters) {
    if (parameters == null) return null;
    Map<String, String> ret = Maps.newHashMapWithExpectedSize(parameters.size());
    for (Map.Entry<String, String> e : parameters.entrySet()) {
      // Assume that the keys used in the HMS have a low cardinality, even if technically
      // custom properties are allowed. A null key cannot be interned, so it is kept
      // as-is rather than failing the whole transformation.
      String key = e.getKey();
      if (key != null) key = STRING_INTERNER.intern(key);
      ret.put(key, internParameterValue(e.getValue()));
    }
    Preconditions.checkState(ret.size() == parameters.size());
    return ret;
  }

  /**
   * Returns the interned form of a parameter value if it is one that is expected to
   * be common, and the value itself (including null) otherwise.
   */
  private static String internParameterValue(String val) {
    // There is nothing to intern for a missing value.
    if (val == null) return null;
    // Intern values which we know will show up quite often. This is based on
    // the results of the following SQL query against an HMS database:
    //
    // select PARAM_VALUE, count(*) from PARTITION_PARAMS where PARAM_KEY
    // not like 'impala_%' group by PARAM_VALUE order by count(*) desc limit 100;
    //
    // In a large catalog from a production install, these represented about 68% of the
    // entries.
    if (val.isEmpty() ||
        "-1".equals(val) ||
        "0".equals(val) ||
        "true".equalsIgnoreCase(val) ||
        "false".equalsIgnoreCase(val) ||
        "TASK".equals(val) ||
        val.startsWith("impala_")) {
      return val.intern();
    }
    if (val.length() <= 2) {
      // Very short values tend to be quite common -- for example most partitions
      // have less than 99 files. But, potential cardinality is high enough that
      // we avoid the JVM interner.
      return STRING_INTERNER.intern(val);
    }
    return val;
  }

  /**
   * Returns a new list holding the interned version of each address in 'addrs', in
   * the same order, or null if 'addrs' is null.
   */
  private static List<TNetworkAddress> internAddresses(List<TNetworkAddress> addrs) {
    if (addrs == null) return null;
    List<TNetworkAddress> ret = Lists.newArrayListWithCapacity(addrs.size());
    for (TNetworkAddress addr : addrs) {
      ret.add(CatalogInterners.internNetworkAddress(addr));
    }
    return ret;
  }

  /**
   * Intern a shared string. Use this only for strings reasonably expected to be
   * reused, such as an HBase column family, which are usually shared across many
   * columns.
   *
   * @param value the string to intern; may be null
   * @return the interned string, or null if 'value' is null
   */
  public static String internString(String value) {
    if (value == null) return null;
    return STRING_INTERNER.intern(value);
  }

  /**
   * Intern the given network address object, and return an immutable version.
   *
   * @param addr the address to intern; may be null
   * @return an interned copy of 'addr' whose overridden mutators throw
   *     UnsupportedOperationException, or null if 'addr' is null
   */
  public static TNetworkAddress internNetworkAddress(TNetworkAddress addr) {
    if (addr == null) return null;
    // Intern an immutable subclass of the network address so that we don't
    // accidentally modify them. This doesn't override every mutating method
    // but it's likely someone would trip over one of these.
    return NETWORK_ADDRESS_INTERNER.intern(new TNetworkAddress(addr) {
      private static final long serialVersionUID = 1L;

      @Override
      public void clear() {
        throw new UnsupportedOperationException("immutable");
      }

      @Override
      public TNetworkAddress setHostname(String hostname) {
        throw new UnsupportedOperationException("immutable");
      }

      @Override
      public TNetworkAddress setPort(int port) {
        throw new UnsupportedOperationException("immutable");
      }

      @Override
      public void read(TProtocol iprot) throws TException {
        throw new UnsupportedOperationException("immutable");
      }
    });
  }
}