parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Offsets.java - parquet-mr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.parquet.hadoop;

 import java.io.IOException;

 import org.apache.parquet.format.PageHeader;
 import org.apache.parquet.format.Util;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.io.SeekableInputStream;

 /**
  * Class to help gather/calculate the proper values of the dictionary/first data page offset values in a column chunk.
  * This class is used by Tools (parquet-tools/-cli) that do not support encryption so this does not support it either.
  * (In some cases this tool would read the dictionary page header which might be encrypted.)
  */
 class Offsets {

   /**
    * Returns the offset values for the column chunk to be written.
    *
    * @param input         the source input stream of the column chunk
    * @param chunk         the column chunk metadata read from the source file
    * @param newChunkStart the position of the column chunk to be written
    * @return the offset values
    * @throws IOException if any I/O error occurs during the reading of the input stream
    */
   public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData chunk, long newChunkStart)
       throws IOException {
     long firstDataPageOffset;
     long dictionaryPageOffset;
     if (chunk.hasDictionaryPage()) {
       long dictionaryPageSize;
       if (chunk.getDictionaryPageOffset() == 0 || chunk.getFirstDataPageOffset() <= chunk.getDictionaryPageOffset()) {
         /*
          * The offsets might not contain the proper values (so we need to read the dictionary page header):
          * - The dictionaryPageOffset might not be set; in this case 0 is returned
          *   (0 cannot be a valid offset because of the MAGIC bytes)
          * - The firstDataPageOffset might point to the dictionary page
          */
         dictionaryPageSize = readDictionaryPageSize(input, newChunkStart);
       } else {
         dictionaryPageSize = chunk.getFirstDataPageOffset() - chunk.getDictionaryPageOffset();
       }
       firstDataPageOffset = newChunkStart + dictionaryPageSize;
       dictionaryPageOffset = newChunkStart;
     } else {
       firstDataPageOffset = newChunkStart;
       dictionaryPageOffset = 0;
     }
     return new Offsets(firstDataPageOffset, dictionaryPageOffset);
   }

   private static long readDictionaryPageSize(SeekableInputStream in, long pos) throws IOException {
     long origPos = -1;
     try {
       origPos = in.getPos();
       PageHeader header = Util.readPageHeader(in);
       long headerSize = in.getPos() - origPos;
       return headerSize + header.getCompressed_page_size();
     } finally {
       if (origPos != -1) {
         in.seek(origPos);
       }
     }
   }

   private Offsets(long firstDataPageOffset, long dictionaryPageOffset) {
     this.firstDataPageOffset = firstDataPageOffset;
     this.dictionaryPageOffset = dictionaryPageOffset;
   }

   public final long firstDataPageOffset;
   public final long dictionaryPageOffset;
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.parquet.hadoop;

	import java.io.IOException;

	import org.apache.parquet.format.PageHeader;
	import org.apache.parquet.format.Util;
	import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
	import org.apache.parquet.io.SeekableInputStream;

	/**
	* Class to help gather/calculate the proper values of the dictionary/first data page offset values in a column chunk.
	* This class is used by Tools (parquet-tools/-cli) that do not support encryption so this does not support it either.
	* (In some cases this tool would read the dictionary page header which might be encrypted.)
	*/
	class Offsets {

	/**
	* Returns the offset values for the column chunk to be written.
	*
	* @param input the source input stream of the column chunk
	* @param chunk the column chunk metadata read from the source file
	* @param newChunkStart the position of the column chunk to be written
	* @return the offset values
	* @throws IOException if any I/O error occurs during the reading of the input stream
	*/
	public static Offsets getOffsets(SeekableInputStream input, ColumnChunkMetaData chunk, long newChunkStart)
	throws IOException {
	long firstDataPageOffset;
	long dictionaryPageOffset;
	if (chunk.hasDictionaryPage()) {
	long dictionaryPageSize;
	if (chunk.getDictionaryPageOffset() == 0 \|\| chunk.getFirstDataPageOffset() <= chunk.getDictionaryPageOffset()) {
	/*
	* The offsets might not contain the proper values (so we need to read the dictionary page header):
	* - The dictionaryPageOffset might not be set; in this case 0 is returned
	* (0 cannot be a valid offset because of the MAGIC bytes)
	* - The firstDataPageOffset might point to the dictionary page
	*/
	dictionaryPageSize = readDictionaryPageSize(input, newChunkStart);
	} else {
	dictionaryPageSize = chunk.getFirstDataPageOffset() - chunk.getDictionaryPageOffset();
	}
	firstDataPageOffset = newChunkStart + dictionaryPageSize;
	dictionaryPageOffset = newChunkStart;
	} else {
	firstDataPageOffset = newChunkStart;
	dictionaryPageOffset = 0;
	}
	return new Offsets(firstDataPageOffset, dictionaryPageOffset);
	}

	private static long readDictionaryPageSize(SeekableInputStream in, long pos) throws IOException {
	long origPos = -1;
	try {
	origPos = in.getPos();
	PageHeader header = Util.readPageHeader(in);
	long headerSize = in.getPos() - origPos;
	return headerSize + header.getCompressed_page_size();
	} finally {
	if (origPos != -1) {
	in.seek(origPos);
	}
	}
	}

	private Offsets(long firstDataPageOffset, long dictionaryPageOffset) {
	this.firstDataPageOffset = firstDataPageOffset;
	this.dictionaryPageOffset = dictionaryPageOffset;
	}

	public final long firstDataPageOffset;
	public final long dictionaryPageOffset;
	}