dev/java/_sources/io.rst.txt - arrow-cookbook - Git at Google

 .. Licensed to the Apache Software Foundation (ASF) under one
 .. or more contributor license agreements.  See the NOTICE file
 .. distributed with this work for additional information
 .. regarding copyright ownership.  The ASF licenses this file
 .. to you under the Apache License, Version 2.0 (the
 .. "License"); you may not use this file except in compliance
 .. with the License.  You may obtain a copy of the License at

 ..   http://www.apache.org/licenses/LICENSE-2.0

 .. Unless required by applicable law or agreed to in writing,
 .. software distributed under the License is distributed on an
 .. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 .. KIND, either express or implied.  See the License for the
 .. specific language governing permissions and limitations
 .. under the License.

 .. _arrow-io:

 ========================
 Reading and writing data
 ========================

 The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats
 for serializing Arrow data: the streaming format and the file format (or random access format). Such files can
 be directly memory-mapped when read.

 .. contents::

 Writing
 =======

 Both writing file and streaming formats use the same API.

 Writing Random Access Files
 ***************************

 Write - Out to File
 -------------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import static java.util.Arrays.asList;
    import org.apache.arrow.vector.ipc.ArrowFileWriter;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;

    try (BufferAllocator allocator = new RootAllocator()) {
        Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
        Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
        Schema schemaPerson = new Schema(asList(name, age));
        try(
            VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, allocator)
        ){
            VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
            nameVector.allocateNew(3);
            nameVector.set(0, "David".getBytes());
            nameVector.set(1, "Gladis".getBytes());
            nameVector.set(2, "Juan".getBytes());
            IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
            ageVector.allocateNew(3);
            ageVector.set(0, 10);
            ageVector.set(1, 20);
            ageVector.set(2, 30);
            vectorSchemaRoot.setRowCount(3);
            File file = new File("randon_access_to_file.arrow");
            try (
                FileOutputStream fileOutputStream = new FileOutputStream(file);
                ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
            ) {
                writer.start();
                writer.writeBatch();
                writer.end();
                System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

 .. testoutput::

    Record batches written: 1. Number of rows written: 3

 Write - Out to Buffer
 ---------------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import static java.util.Arrays.asList;
    import org.apache.arrow.vector.ipc.ArrowFileWriter;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.nio.channels.Channels;

    try (BufferAllocator allocator = new RootAllocator()) {
        Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
        Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
        Schema schemaPerson = new Schema(asList(name, age));
        try(
            VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, allocator)
        ){
            VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
            nameVector.allocateNew(3);
            nameVector.set(0, "David".getBytes());
            nameVector.set(1, "Gladis".getBytes());
            nameVector.set(2, "Juan".getBytes());
            IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
            ageVector.allocateNew(3);
            ageVector.set(0, 10);
            ageVector.set(1, 20);
            ageVector.set(2, 30);
            vectorSchemaRoot.setRowCount(3);
            try (
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                 ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))
            ) {
                writer.start();
                writer.writeBatch();

                System.out.println("Record batches written: " + writer.getRecordBlocks().size() +
                        ". Number of rows written: " + vectorSchemaRoot.getRowCount());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

 .. testoutput::

    Record batches written: 1. Number of rows written: 3

 Writing Streaming Format
 ************************

 Write - Out to File
 -------------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.ipc.ArrowStreamWriter;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import static java.util.Arrays.asList;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;

    try (BufferAllocator rootAllocator = new RootAllocator()) {
        Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
        Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
        Schema schemaPerson = new Schema(asList(name, age));
        try(
            VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)
        ){
            VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
            nameVector.allocateNew(3);
            nameVector.set(0, "David".getBytes());
            nameVector.set(1, "Gladis".getBytes());
            nameVector.set(2, "Juan".getBytes());
            IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
            ageVector.allocateNew(3);
            ageVector.set(0, 10);
            ageVector.set(1, 20);
            ageVector.set(2, 30);
            vectorSchemaRoot.setRowCount(3);
            File file = new File("streaming_to_file.arrow");
            try (
                FileOutputStream fileOutputStream = new FileOutputStream(file);
                ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
            ){
                writer.start();
                writer.writeBatch();
                System.out.println("Number of rows written: " + vectorSchemaRoot.getRowCount());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

 .. testoutput::

    Number of rows written: 3

 Write - Out to Buffer
 ---------------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.ipc.ArrowStreamWriter;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import static java.util.Arrays.asList;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.nio.channels.Channels;

    try (BufferAllocator rootAllocator = new RootAllocator()) {
        Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
        Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
        Schema schemaPerson = new Schema(asList(name, age));
        try(
            VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)
        ){
            VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
            nameVector.allocateNew(3);
            nameVector.set(0, "David".getBytes());
            nameVector.set(1, "Gladis".getBytes());
            nameVector.set(2, "Juan".getBytes());
            IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
            ageVector.allocateNew(3);
            ageVector.set(0, 10);
            ageVector.set(1, 20);
            ageVector.set(2, 30);
            vectorSchemaRoot.setRowCount(3);
            try (
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
            ){
                writer.start();
                writer.writeBatch();
                System.out.println("Number of rows written: " + vectorSchemaRoot.getRowCount());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

 .. testoutput::

    Number of rows written: 3

 Reading
 =======

 Reading the random access format and streaming format both offer the same API,
 with the difference that random access files also offer access to any record batch by index.

 Reading Random Access Files
 ***************************

 Read - From File
 ----------------

 We are providing a path with auto generated arrow files for testing purposes, change that at your convenience.

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowFileReader;
    import org.apache.arrow.vector.ipc.message.ArrowBlock;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;

    File file = new File("./thirdpartydeps/arrowfiles/random_access.arrow");
    try(
        BufferAllocator rootAllocator = new RootAllocator();
        FileInputStream fileInputStream = new FileInputStream(file);
        ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
    ){
        System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
        for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
            reader.loadRecordBatch(arrowBlock);
            VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
            System.out.print(vectorSchemaRootRecover.contentToTSVString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

 .. testoutput::

    Record batches in file: 3
    name    age
    David    10
    Gladis    20
    Juan    30
    name    age
    Nidia    15
    Alexa    20
    Mara    15
    name    age
    Raul    34
    Jhon    29
    Thomy    33

 Read - From Buffer
 ------------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowFileReader;
    import org.apache.arrow.vector.ipc.SeekableReadChannel;
    import org.apache.arrow.vector.ipc.message.ArrowBlock;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;

    Path path = Paths.get("./thirdpartydeps/arrowfiles/random_access.arrow");
    try(
        BufferAllocator rootAllocator = new RootAllocator();
        ArrowFileReader reader = new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(
                                            Files.readAllBytes(path))), rootAllocator)
    ) {
        System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
        for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
            reader.loadRecordBatch(arrowBlock);
            VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
            System.out.print(vectorSchemaRootRecover.contentToTSVString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

 .. testoutput::

    Record batches in file: 3
    name    age
    David    10
    Gladis    20
    Juan    30
    name    age
    Nidia    15
    Alexa    20
    Mara    15
    name    age
    Raul    34
    Jhon    29
    Thomy    33

 Reading Streaming Format
 ************************

 Read - From File
 ----------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowStreamReader;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;

    File file = new File("./thirdpartydeps/arrowfiles/streaming.arrow");
    try(
        BufferAllocator rootAllocator = new RootAllocator();
        FileInputStream fileInputStreamForStream = new FileInputStream(file);
        ArrowStreamReader reader = new ArrowStreamReader(fileInputStreamForStream, rootAllocator)
    ) {
        while (reader.loadNextBatch()) {
            VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
            System.out.print(vectorSchemaRootRecover.contentToTSVString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

 .. testoutput::

    name    age
    David    10
    Gladis    20
    Juan    30
    name    age
    Nidia    15
    Alexa    20
    Mara    15
    name    age
    Raul    34
    Jhon    29
    Thomy    33

 Read - From Buffer
 ------------------

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowStreamReader;
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;

    Path path = Paths.get("./thirdpartydeps/arrowfiles/streaming.arrow");
    try(
        BufferAllocator rootAllocator = new RootAllocator();
        ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(
                                        Files.readAllBytes(path)), rootAllocator)
    ) {
        while(reader.loadNextBatch()){
            System.out.print(reader.getVectorSchemaRoot().contentToTSVString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

 .. testoutput::

    name    age
    David    10
    Gladis    20
    Juan    30
    name    age
    Nidia    15
    Alexa    20
    Mara    15
    name    age
    Raul    34
    Jhon    29
    Thomy    33

 Reading Parquet File
 ********************

 Please check :doc:`Dataset <./dataset>`

 Handling Data with Dictionaries
 *******************************

 Reading and writing dictionary-encoded data requires separately tracking the dictionaries.

 .. testcode::

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.FieldVector;
    import org.apache.arrow.vector.ValueVector;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.dictionary.Dictionary;
    import org.apache.arrow.vector.dictionary.DictionaryEncoder;
    import org.apache.arrow.vector.dictionary.DictionaryProvider;
    import org.apache.arrow.vector.ipc.ArrowFileReader;
    import org.apache.arrow.vector.ipc.ArrowFileWriter;
    import org.apache.arrow.vector.ipc.message.ArrowBlock;
    import org.apache.arrow.vector.types.Types;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
    import org.apache.arrow.vector.types.pojo.FieldType;

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;

    DictionaryEncoding dictionaryEncoding = new DictionaryEncoding(
            /*id=*/666L, /*ordered=*/false, /*indexType=*/
            new ArrowType.Int(8, true)
    );
    try (BufferAllocator root = new RootAllocator();
         VarCharVector countries = new VarCharVector("country-dict", root);
         VarCharVector appUserCountriesUnencoded = new VarCharVector(
                 "app-use-country-dict",
                 new FieldType(true, Types.MinorType.VARCHAR.getType(), dictionaryEncoding),
                 root)
    ) {
        countries.allocateNew(10);
        countries.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
        countries.set(1, "Cuba".getBytes(StandardCharsets.UTF_8));
        countries.set(2, "Grecia".getBytes(StandardCharsets.UTF_8));
        countries.set(3, "Guinea".getBytes(StandardCharsets.UTF_8));
        countries.set(4, "Islandia".getBytes(StandardCharsets.UTF_8));
        countries.set(5, "Malta".getBytes(StandardCharsets.UTF_8));
        countries.set(6, "Tailandia".getBytes(StandardCharsets.UTF_8));
        countries.set(7, "Uganda".getBytes(StandardCharsets.UTF_8));
        countries.set(8, "Yemen".getBytes(StandardCharsets.UTF_8));
        countries.set(9, "Zambia".getBytes(StandardCharsets.UTF_8));
        countries.setValueCount(10);

        Dictionary countriesDictionary = new Dictionary(countries, dictionaryEncoding);
        System.out.println("Dictionary: " + countriesDictionary);

        appUserCountriesUnencoded.allocateNew(5);
        appUserCountriesUnencoded.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
        appUserCountriesUnencoded.set(1, "Guinea".getBytes(StandardCharsets.UTF_8));
        appUserCountriesUnencoded.set(2, "Islandia".getBytes(StandardCharsets.UTF_8));
        appUserCountriesUnencoded.set(3, "Malta".getBytes(StandardCharsets.UTF_8));
        appUserCountriesUnencoded.set(4, "Uganda".getBytes(StandardCharsets.UTF_8));
        appUserCountriesUnencoded.setValueCount(5);
        System.out.println("Unencoded data: " + appUserCountriesUnencoded);

        File file = new File("random_access_file_with_dictionary.arrow");
        DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
        provider.put(countriesDictionary);
        try (FieldVector appUseCountryDictionaryEncoded = (FieldVector) DictionaryEncoder
                .encode(appUserCountriesUnencoded, countriesDictionary);
             VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.of(appUseCountryDictionaryEncoded);
             FileOutputStream fileOutputStream = new FileOutputStream(file);
             ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, provider, fileOutputStream.getChannel())
        ) {
            System.out.println("Dictionary-encoded data: " +appUseCountryDictionaryEncoded);
            System.out.println("Dictionary-encoded ID: " +appUseCountryDictionaryEncoded.getField().getDictionary().getId());
            writer.start();
            writer.writeBatch();
            writer.end();
            System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
            try(
                BufferAllocator rootAllocator = new RootAllocator();
                FileInputStream fileInputStream = new FileInputStream(file);
                ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
            ){
                for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
                    reader.loadRecordBatch(arrowBlock);
                    FieldVector appUseCountryDictionaryEncodedRead = reader.getVectorSchemaRoot().getVector("app-use-country-dict");
                    DictionaryEncoding dictionaryEncodingRead = appUseCountryDictionaryEncodedRead.getField().getDictionary();
                    System.out.println("Dictionary-encoded ID recovered: " + dictionaryEncodingRead.getId());
                    Dictionary appUseCountryDictionaryRead = reader.getDictionaryVectors().get(dictionaryEncodingRead.getId());
                    System.out.println("Dictionary-encoded data recovered: " + appUseCountryDictionaryEncodedRead);
                    System.out.println("Dictionary recovered: " + appUseCountryDictionaryRead);
                    try (ValueVector readVector = DictionaryEncoder.decode(appUseCountryDictionaryEncodedRead, appUseCountryDictionaryRead)) {
                        System.out.println("Decoded data: " + readVector);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

 .. testoutput::

    Dictionary: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
    Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
    Dictionary-encoded data: [0, 3, 4, 5, 7]
    Dictionary-encoded ID: 666
    Record batches written: 1. Number of rows written: 5
    Dictionary-encoded ID recovered: 666
    Dictionary-encoded data recovered: [0, 3, 4, 5, 7]
    Dictionary recovered: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
    Decoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
	.. Licensed to the Apache Software Foundation (ASF) under one
	.. or more contributor license agreements. See the NOTICE file
	.. distributed with this work for additional information
	.. regarding copyright ownership. The ASF licenses this file
	.. to you under the Apache License, Version 2.0 (the
	.. "License"); you may not use this file except in compliance
	.. with the License. You may obtain a copy of the License at

	.. http://www.apache.org/licenses/LICENSE-2.0

	.. Unless required by applicable law or agreed to in writing,
	.. software distributed under the License is distributed on an
	.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	.. KIND, either express or implied. See the License for the
	.. specific language governing permissions and limitations
	.. under the License.

	.. _arrow-io:

	========================
	Reading and writing data
	========================

	The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats
	for serializing Arrow data: the streaming format and the file format (or random access format). Such files can
	be directly memory-mapped when read.

	.. contents::

	Writing
	=======

	Both writing file and streaming formats use the same API.

	Writing Random Access Files
	***************************

	Write - Out to File
	-------------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.VarCharVector;
	import org.apache.arrow.vector.IntVector;
	import org.apache.arrow.vector.types.pojo.Field;
	import org.apache.arrow.vector.types.pojo.FieldType;
	import org.apache.arrow.vector.types.pojo.ArrowType;
	import org.apache.arrow.vector.types.pojo.Schema;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import static java.util.Arrays.asList;
	import org.apache.arrow.vector.ipc.ArrowFileWriter;
	import java.io.File;
	import java.io.FileOutputStream;
	import java.io.IOException;

	try (BufferAllocator allocator = new RootAllocator()) {
	Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
	Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
	Schema schemaPerson = new Schema(asList(name, age));
	try(
	VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, allocator)
	){
	VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
	nameVector.allocateNew(3);
	nameVector.set(0, "David".getBytes());
	nameVector.set(1, "Gladis".getBytes());
	nameVector.set(2, "Juan".getBytes());
	IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
	ageVector.allocateNew(3);
	ageVector.set(0, 10);
	ageVector.set(1, 20);
	ageVector.set(2, 30);
	vectorSchemaRoot.setRowCount(3);
	File file = new File("randon_access_to_file.arrow");
	try (
	FileOutputStream fileOutputStream = new FileOutputStream(file);
	ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
	) {
	writer.start();
	writer.writeBatch();
	writer.end();
	System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
	} catch (IOException e) {
	e.printStackTrace();
	}
	}
	}

	.. testoutput::

	Record batches written: 1. Number of rows written: 3

	Write - Out to Buffer
	---------------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.VarCharVector;
	import org.apache.arrow.vector.IntVector;
	import org.apache.arrow.vector.types.pojo.Field;
	import org.apache.arrow.vector.types.pojo.FieldType;
	import org.apache.arrow.vector.types.pojo.ArrowType;
	import org.apache.arrow.vector.types.pojo.Schema;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import static java.util.Arrays.asList;
	import org.apache.arrow.vector.ipc.ArrowFileWriter;
	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.nio.channels.Channels;

	try (BufferAllocator allocator = new RootAllocator()) {
	Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
	Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
	Schema schemaPerson = new Schema(asList(name, age));
	try(
	VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, allocator)
	){
	VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
	nameVector.allocateNew(3);
	nameVector.set(0, "David".getBytes());
	nameVector.set(1, "Gladis".getBytes());
	nameVector.set(2, "Juan".getBytes());
	IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
	ageVector.allocateNew(3);
	ageVector.set(0, 10);
	ageVector.set(1, 20);
	ageVector.set(2, 30);
	vectorSchemaRoot.setRowCount(3);
	try (
	ByteArrayOutputStream out = new ByteArrayOutputStream();
	ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))
	) {
	writer.start();
	writer.writeBatch();

	System.out.println("Record batches written: " + writer.getRecordBlocks().size() +
	". Number of rows written: " + vectorSchemaRoot.getRowCount());
	} catch (IOException e) {
	e.printStackTrace();
	}
	}
	}

	.. testoutput::

	Record batches written: 1. Number of rows written: 3

	Writing Streaming Format
	************************

	Write - Out to File
	-------------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.VarCharVector;
	import org.apache.arrow.vector.IntVector;
	import org.apache.arrow.vector.ipc.ArrowStreamWriter;
	import org.apache.arrow.vector.types.pojo.Field;
	import org.apache.arrow.vector.types.pojo.FieldType;
	import org.apache.arrow.vector.types.pojo.ArrowType;
	import org.apache.arrow.vector.types.pojo.Schema;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import static java.util.Arrays.asList;
	import java.io.File;
	import java.io.FileOutputStream;
	import java.io.IOException;

	try (BufferAllocator rootAllocator = new RootAllocator()) {
	Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
	Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
	Schema schemaPerson = new Schema(asList(name, age));
	try(
	VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)
	){
	VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
	nameVector.allocateNew(3);
	nameVector.set(0, "David".getBytes());
	nameVector.set(1, "Gladis".getBytes());
	nameVector.set(2, "Juan".getBytes());
	IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
	ageVector.allocateNew(3);
	ageVector.set(0, 10);
	ageVector.set(1, 20);
	ageVector.set(2, 30);
	vectorSchemaRoot.setRowCount(3);
	File file = new File("streaming_to_file.arrow");
	try (
	FileOutputStream fileOutputStream = new FileOutputStream(file);
	ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
	){
	writer.start();
	writer.writeBatch();
	System.out.println("Number of rows written: " + vectorSchemaRoot.getRowCount());
	} catch (IOException e) {
	e.printStackTrace();
	}
	}
	}

	.. testoutput::

	Number of rows written: 3

	Write - Out to Buffer
	---------------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.VarCharVector;
	import org.apache.arrow.vector.IntVector;
	import org.apache.arrow.vector.ipc.ArrowStreamWriter;
	import org.apache.arrow.vector.types.pojo.Field;
	import org.apache.arrow.vector.types.pojo.FieldType;
	import org.apache.arrow.vector.types.pojo.ArrowType;
	import org.apache.arrow.vector.types.pojo.Schema;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import static java.util.Arrays.asList;
	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.nio.channels.Channels;

	try (BufferAllocator rootAllocator = new RootAllocator()) {
	Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
	Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
	Schema schemaPerson = new Schema(asList(name, age));
	try(
	VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)
	){
	VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
	nameVector.allocateNew(3);
	nameVector.set(0, "David".getBytes());
	nameVector.set(1, "Gladis".getBytes());
	nameVector.set(2, "Juan".getBytes());
	IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
	ageVector.allocateNew(3);
	ageVector.set(0, 10);
	ageVector.set(1, 20);
	ageVector.set(2, 30);
	vectorSchemaRoot.setRowCount(3);
	try (
	ByteArrayOutputStream out = new ByteArrayOutputStream();
	ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
	){
	writer.start();
	writer.writeBatch();
	System.out.println("Number of rows written: " + vectorSchemaRoot.getRowCount());
	} catch (IOException e) {
	e.printStackTrace();
	}
	}
	}

	.. testoutput::

	Number of rows written: 3

	Reading
	=======

	Reading the random access format and streaming format both offer the same API,
	with the difference that random access files also offer access to any record batch by index.

	Reading Random Access Files
	***************************

	Read - From File
	----------------

	We are providing a path with auto generated arrow files for testing purposes, change that at your convenience.

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.ipc.ArrowFileReader;
	import org.apache.arrow.vector.ipc.message.ArrowBlock;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;

	File file = new File("./thirdpartydeps/arrowfiles/random_access.arrow");
	try(
	BufferAllocator rootAllocator = new RootAllocator();
	FileInputStream fileInputStream = new FileInputStream(file);
	ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
	){
	System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
	for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
	reader.loadRecordBatch(arrowBlock);
	VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
	System.out.print(vectorSchemaRootRecover.contentToTSVString());
	}
	} catch (IOException e) {
	e.printStackTrace();
	}

	.. testoutput::

	Record batches in file: 3
	name age
	David 10
	Gladis 20
	Juan 30
	name age
	Nidia 15
	Alexa 20
	Mara 15
	name age
	Raul 34
	Jhon 29
	Thomy 33

	Read - From Buffer
	------------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.ipc.ArrowFileReader;
	import org.apache.arrow.vector.ipc.SeekableReadChannel;
	import org.apache.arrow.vector.ipc.message.ArrowBlock;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
	import java.io.IOException;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.nio.file.Paths;

	Path path = Paths.get("./thirdpartydeps/arrowfiles/random_access.arrow");
	try(
	BufferAllocator rootAllocator = new RootAllocator();
	ArrowFileReader reader = new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(
	Files.readAllBytes(path))), rootAllocator)
	) {
	System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
	for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
	reader.loadRecordBatch(arrowBlock);
	VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
	System.out.print(vectorSchemaRootRecover.contentToTSVString());
	}
	} catch (IOException e) {
	e.printStackTrace();
	}

	.. testoutput::

	Record batches in file: 3
	name age
	David 10
	Gladis 20
	Juan 30
	name age
	Nidia 15
	Alexa 20
	Mara 15
	name age
	Raul 34
	Jhon 29
	Thomy 33

	Reading Streaming Format
	************************

	Read - From File
	----------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.ipc.ArrowStreamReader;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;

	File file = new File("./thirdpartydeps/arrowfiles/streaming.arrow");
	try(
	BufferAllocator rootAllocator = new RootAllocator();
	FileInputStream fileInputStreamForStream = new FileInputStream(file);
	ArrowStreamReader reader = new ArrowStreamReader(fileInputStreamForStream, rootAllocator)
	) {
	while (reader.loadNextBatch()) {
	VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
	System.out.print(vectorSchemaRootRecover.contentToTSVString());
	}
	} catch (IOException e) {
	e.printStackTrace();
	}

	.. testoutput::

	name age
	David 10
	Gladis 20
	Juan 30
	name age
	Nidia 15
	Alexa 20
	Mara 15
	name age
	Raul 34
	Jhon 29
	Thomy 33

	Read - From Buffer
	------------------

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.ipc.ArrowStreamReader;
	import java.io.ByteArrayInputStream;
	import java.io.IOException;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.nio.file.Paths;

	Path path = Paths.get("./thirdpartydeps/arrowfiles/streaming.arrow");
	try(
	BufferAllocator rootAllocator = new RootAllocator();
	ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(
	Files.readAllBytes(path)), rootAllocator)
	) {
	while(reader.loadNextBatch()){
	System.out.print(reader.getVectorSchemaRoot().contentToTSVString());
	}
	} catch (IOException e) {
	e.printStackTrace();
	}

	.. testoutput::

	name age
	David 10
	Gladis 20
	Juan 30
	name age
	Nidia 15
	Alexa 20
	Mara 15
	name age
	Raul 34
	Jhon 29
	Thomy 33

	Reading Parquet File
	********************

	Please check :doc:`Dataset <./dataset>`

	Handling Data with Dictionaries
	*******************************

	Reading and writing dictionary-encoded data requires separately tracking the dictionaries.

	.. testcode::

	import org.apache.arrow.memory.BufferAllocator;
	import org.apache.arrow.memory.RootAllocator;
	import org.apache.arrow.vector.FieldVector;
	import org.apache.arrow.vector.ValueVector;
	import org.apache.arrow.vector.VarCharVector;
	import org.apache.arrow.vector.VectorSchemaRoot;
	import org.apache.arrow.vector.dictionary.Dictionary;
	import org.apache.arrow.vector.dictionary.DictionaryEncoder;
	import org.apache.arrow.vector.dictionary.DictionaryProvider;
	import org.apache.arrow.vector.ipc.ArrowFileReader;
	import org.apache.arrow.vector.ipc.ArrowFileWriter;
	import org.apache.arrow.vector.ipc.message.ArrowBlock;
	import org.apache.arrow.vector.types.Types;
	import org.apache.arrow.vector.types.pojo.ArrowType;
	import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
	import org.apache.arrow.vector.types.pojo.FieldType;

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileNotFoundException;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.nio.charset.StandardCharsets;

	DictionaryEncoding dictionaryEncoding = new DictionaryEncoding(
	/id=/666L, /ordered=/false, /indexType=/
	new ArrowType.Int(8, true)
	);
	try (BufferAllocator root = new RootAllocator();
	VarCharVector countries = new VarCharVector("country-dict", root);
	VarCharVector appUserCountriesUnencoded = new VarCharVector(
	"app-use-country-dict",
	new FieldType(true, Types.MinorType.VARCHAR.getType(), dictionaryEncoding),
	root)
	) {
	countries.allocateNew(10);
	countries.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
	countries.set(1, "Cuba".getBytes(StandardCharsets.UTF_8));
	countries.set(2, "Grecia".getBytes(StandardCharsets.UTF_8));
	countries.set(3, "Guinea".getBytes(StandardCharsets.UTF_8));
	countries.set(4, "Islandia".getBytes(StandardCharsets.UTF_8));
	countries.set(5, "Malta".getBytes(StandardCharsets.UTF_8));
	countries.set(6, "Tailandia".getBytes(StandardCharsets.UTF_8));
	countries.set(7, "Uganda".getBytes(StandardCharsets.UTF_8));
	countries.set(8, "Yemen".getBytes(StandardCharsets.UTF_8));
	countries.set(9, "Zambia".getBytes(StandardCharsets.UTF_8));
	countries.setValueCount(10);

	Dictionary countriesDictionary = new Dictionary(countries, dictionaryEncoding);
	System.out.println("Dictionary: " + countriesDictionary);

	appUserCountriesUnencoded.allocateNew(5);
	appUserCountriesUnencoded.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
	appUserCountriesUnencoded.set(1, "Guinea".getBytes(StandardCharsets.UTF_8));
	appUserCountriesUnencoded.set(2, "Islandia".getBytes(StandardCharsets.UTF_8));
	appUserCountriesUnencoded.set(3, "Malta".getBytes(StandardCharsets.UTF_8));
	appUserCountriesUnencoded.set(4, "Uganda".getBytes(StandardCharsets.UTF_8));
	appUserCountriesUnencoded.setValueCount(5);
	System.out.println("Unencoded data: " + appUserCountriesUnencoded);

	File file = new File("random_access_file_with_dictionary.arrow");
	DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
	provider.put(countriesDictionary);
	try (FieldVector appUseCountryDictionaryEncoded = (FieldVector) DictionaryEncoder
	.encode(appUserCountriesUnencoded, countriesDictionary);
	VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.of(appUseCountryDictionaryEncoded);
	FileOutputStream fileOutputStream = new FileOutputStream(file);
	ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, provider, fileOutputStream.getChannel())
	) {
	System.out.println("Dictionary-encoded data: " +appUseCountryDictionaryEncoded);
	System.out.println("Dictionary-encoded ID: " +appUseCountryDictionaryEncoded.getField().getDictionary().getId());
	writer.start();
	writer.writeBatch();
	writer.end();
	System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
	try(
	BufferAllocator rootAllocator = new RootAllocator();
	FileInputStream fileInputStream = new FileInputStream(file);
	ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
	){
	for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
	reader.loadRecordBatch(arrowBlock);
	FieldVector appUseCountryDictionaryEncodedRead = reader.getVectorSchemaRoot().getVector("app-use-country-dict");
	DictionaryEncoding dictionaryEncodingRead = appUseCountryDictionaryEncodedRead.getField().getDictionary();
	System.out.println("Dictionary-encoded ID recovered: " + dictionaryEncodingRead.getId());
	Dictionary appUseCountryDictionaryRead = reader.getDictionaryVectors().get(dictionaryEncodingRead.getId());
	System.out.println("Dictionary-encoded data recovered: " + appUseCountryDictionaryEncodedRead);
	System.out.println("Dictionary recovered: " + appUseCountryDictionaryRead);
	try (ValueVector readVector = DictionaryEncoder.decode(appUseCountryDictionaryEncodedRead, appUseCountryDictionaryRead)) {
	System.out.println("Decoded data: " + readVector);
	}
	}
	}
	} catch (FileNotFoundException e) {
	e.printStackTrace();
	} catch (IOException e) {
	e.printStackTrace();
	}
	}

	.. testoutput::

	Dictionary: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
	Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
	Dictionary-encoded data: [0, 3, 4, 5, 7]
	Dictionary-encoded ID: 666
	Record batches written: 1. Number of rows written: 5
	Dictionary-encoded ID recovered: 666
	Dictionary-encoded data recovered: [0, 3, 4, 5, 7]
	Dictionary recovered: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
	Decoded data: [Andorra, Guinea, Islandia, Malta, Uganda]