old_list_structure.parquet
is generated with parquet-java version 1.14.3. It contains a LIST<LIST<INT32>>
column with a single value, [[1, 2], [3, 4]],
encoded using the legacy two-level structure for the list type.
The file is created by the following Java code:
package org.example; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.hadoop.ParquetWriter; import java.io.IOException; import java.util.Arrays; import java.util.stream.Collectors; import java.util.stream.Stream; public class TwoLevelList { public static void main(String[] args) { Schema schema = new Schema.Parser().parse("{" + "\"type\":\"record\"," + "\"name\":\"my_record\"," + "\"fields\":[" + " {" + " \"name\":\"a\"," + " \"type\":{\"type\":\"array\", \"items\":{\"type\":\"array\", \"items\":\"int\"}}" + " }" + "]" + "}"); GenericRecord record = new GenericData.Record(schema); // Write [[1, 2], [3, 4]] to the avro record record.put("a", Stream.of(Arrays.asList(1, 2), Arrays.asList(3, 4)) .map(list -> { Schema innerListType = schema.getField("a").schema().getElementType(); GenericData.Array<Integer> innerList = new GenericData.Array<>(list.size(), innerListType); innerList.addAll(list); return innerList; }).collect(Collectors.toList())); Path file = new Path("/tmp/old_list_structure.parquet"); Configuration conf = new Configuration(); conf.set("parquet.avro.write-old-list-structure", "true"); // this is the default value try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(file) .withSchema(schema) .withConf(conf) .build()) { writer.write(record); } catch (IOException e) { throw new RuntimeException(e); } } }
Here is the file metadata printed by parquet-cli:
File path:  /tmp/old_list_structure.parquet
Created by: parquet-mr version 1.14.3 (build b5e376a2caee767a11e75b783512b14cf8ca90ec)
Properties:
  parquet.avro.schema: {"type":"record","name":"my_record","fields":[{"name":"a","type":{"type":"array","items":{"type":"array","items":"int"}}}]}
    writer.model.name: avro
Schema:
message my_record {
  required group a (LIST) {
    repeated group array (LIST) {
      repeated int32 array;
    }
  }
}

Row group 0:  count: 1  53.00 B records  start: 4  total(compressed): 53 B total(uncompressed):53 B
--------------------------------------------------------------------------------
               type      encodings count     avg size   nulls   min / max
a.array.array  INT32     _   _     4         13.25 B    0       "1" / "4"