/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.io.ParquetDecodingException;

import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
/**
 * A {@link DictionaryPageReadStore} implementation that reads dictionaries from
 * an open {@link ParquetFileReader}.
 * <p>
 * Once a row group has been loaded into memory, this implementation delegates
 * dictionary reads to its {@link ColumnChunkPageReadStore} to avoid extra
 * seeks and re-reads.
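 * <p>
 * A minimal usage sketch; how the open reader and the row group are obtained
 * here is an assumption for illustration, not part of this class's contract:
 * <pre>{@code
 * ParquetFileReader fileReader = ...; // an already-open file reader
 * BlockMetaData block = fileReader.getFooter().getBlocks().get(0);
 * DictionaryPageReader dictionaries = new DictionaryPageReader(fileReader, block);
 * DictionaryPage dict = dictionaries.readDictionaryPage(columnDescriptor);
 * }</pre>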
 */
class DictionaryPageReader implements DictionaryPageReadStore {

  private final ParquetFileReader reader;
  private final Map<String, ColumnChunkMetaData> columns;
  private final Map<String, Optional<DictionaryPage>> dictionaryPageCache;
  private ColumnChunkPageReadStore rowGroup = null;
  /**
   * Instantiate a new DictionaryPageReader.
   *
   * @param reader the file reader that will be used to read dictionary pages
   * @param block metadata for the row group served by this reader
   *
   * @throws NullPointerException if {@code reader} or {@code block} is
   *           {@code null}
   */
  DictionaryPageReader(ParquetFileReader reader, BlockMetaData block) {
    this.reader = Objects.requireNonNull(reader);
    this.columns = new HashMap<>();
    this.dictionaryPageCache = new ConcurrentHashMap<>();

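    // Index this row group's column chunks by dotted path; readDictionaryPage
    // builds the same key from the ColumnDescriptor it is given.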
    for (ColumnChunkMetaData column : block.getColumns()) {
      columns.put(column.getPath().toDotString(), column);
    }
  }

  /**
   * Sets the page store for this reader's row group. When a row group is set,
   * this reader delegates to it when returning dictionary pages, which avoids
   * seeking and re-reading dictionary bytes after the row group has been
   * loaded into memory.
   *
   * @param rowGroup a ColumnChunkPageReadStore for this reader's row group
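   * <p>
   * Sketch of the intended call pattern (caller-side names are assumptions):
   * <pre>{@code
   * ColumnChunkPageReadStore pages = ...; // pages read for this row group
   * dictionaryReader.setRowGroup(pages);
   * // readDictionaryPage(...) now returns pages.readDictionaryPage(...)
   * }</pre>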
   */
  void setRowGroup(ColumnChunkPageReadStore rowGroup) {
    this.rowGroup = rowGroup;
  }

  @Override
  public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) {
    if (rowGroup != null) {
      // if the row group has already been read, use that dictionary
      return rowGroup.readDictionaryPage(descriptor);
    }

    String dotPath = String.join(".", descriptor.getPath());
    ColumnChunkMetaData column = columns.get(dotPath);
    if (column == null) {
      throw new ParquetDecodingException(
          "Failed to load dictionary, unknown column: " + dotPath);
    }

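    // ConcurrentHashMap.computeIfAbsent runs the loader atomically, at most
    // once per key, so each column's dictionary is read from the file at most
    // once even if multiple threads ask for it concurrently.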
    return dictionaryPageCache.computeIfAbsent(dotPath, key -> {
      try {
        final DictionaryPage dict =
            hasDictionaryPage(column) ? reader.readDictionary(column) : null;

        // Copy the dictionary to ensure it can be reused if it is returned
        // more than once. This can happen when a DictionaryFilter has two or
        // more predicates for the same column. Misses are cached as well, as
        // Optional.empty(), so a column without a dictionary is only checked
        // once.
        return (dict != null) ? Optional.of(reusableCopy(dict)) : Optional.empty();
      } catch (IOException e) {
        throw new ParquetDecodingException("Failed to read dictionary", e);
      }
    }).orElse(null);
  }

  private static DictionaryPage reusableCopy(DictionaryPage dict)
      throws IOException {
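    // Materialize the page's bytes into a fresh array: a BytesInput is meant
    // to be consumed right away, so wrapping a copy makes the returned page
    // safe to read repeatedly.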
    return new DictionaryPage(BytesInput.from(dict.getBytes().toByteArray()),
        dict.getDictionarySize(), dict.getEncoding());
  }

  private boolean hasDictionaryPage(ColumnChunkMetaData column) {
    EncodingStats stats = column.getEncodingStats();
    if (stats != null) {
      // ensure there is a dictionary page and that it is used to encode data pages
      return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
    }

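    // No encoding stats (e.g. files from older writers): fall back to the
    // chunk's encoding list. This is a best-effort check; PLAIN_DICTIONARY can
    // appear even when dictionary encoding later fell back to plain, so the
    // dictionary found may not cover every page in the chunk.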
    Set<Encoding> encodings = column.getEncodings();
    return encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY);
  }
}