tree: ff516b1e5e58afe7e8329aaa3a6e2bbdeac2f654 [path history] [tgz]
  1. alltypes_dictionary.avro
  2. alltypes_nulls_plain.avro
  3. alltypes_plain.avro
  4. alltypes_plain.snappy.avro
  5. binary.avro
  6. datapage_v2.snappy.avro
  7. dict-page-offset-zero.avro
  8. fixed_length_decimal.avro
  9. fixed_length_decimal_legacy.avro
  10. int32_decimal.avro
  11. int64_decimal.avro
  12. list_columns.avro
  13. nested_lists.snappy.avro
  14. nonnullable.impala.avro
  15. nullable.impala.avro
  16. nulls.snappy.avro
  17. README.md
  18. repeated_no_annotation.avro
  19. single_nan.avro
data/avro/README.md

This directory contains Avro files corresponding to the Parquet test files at https://github.com/apache/parquet-testing/blob/master/data/

These files were created with Spark, using the commands from https://gist.github.com/Igosuki/324b011f40185269d3fc552350d21744

Roughly:

import com.github.mrpowers.spark.daria.sql.DariaWriters
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import org.apache.hadoop.conf.Configuration 
import org.apache.commons.io.FilenameUtils

import scala.util.control.NonFatal

// Input glob (Parquet files to convert) and output directory, both read from the Spark conf.
val fileGlobs = sc.getConf.get("spark.driver.globs")
val dest = sc.getConf.get("spark.driver.out")

val fs = FileSystem.get(new Configuration(true))
// globStatus may return null instead of an empty array (e.g. a non-glob path that does
// not exist), so normalise it to an empty sequence before iterating.
val status = Option(fs.globStatus(new Path(fileGlobs))).map(_.toSeq).getOrElse(Seq.empty)
for (fileStatus <- status) {
    val path = fileStatus.getPath().toString()
    try {
        val dfin = spark.read.format("parquet").load(path)
        val fileName = fileStatus.getPath().getName()
        val fileNameWithOutExt = FilenameUtils.removeExtension(fileName)
        val destination = s"${dest}/${fileNameWithOutExt}.avro"
        println(s"Converting $path to avro at $destination")
        // Write the DataFrame out as one Avro file; tmpFolder is keyed by the source file name.
        DariaWriters.writeSingleFile(
            df = dfin,
            format = "avro",
            sc = spark.sparkContext,
            tmpFolder = s"/tmp/dw/${fileName}",
            filename = destination
        )
    } catch {
        // Best effort: report the failure and move on to the next file. Only non-fatal
        // errors are caught so that OutOfMemoryError, interrupts, etc. still propagate.
        case NonFatal(e) => println(s"failed to convert $path : ${e.getMessage}")
    }
}

Additional notes:

| File | Description |
| ---- | ----------- |
| alltypes_nulls_plain.avro | Contains a single row with null values for each scalar data type, i.e., `{"string_col":null,"int_col":null,"bool_col":null,"bigint_col":null,"float_col":null,"double_col":null,"bytes_col":null}`. Generated from https://gist.github.com/nenorbot/5a92e24f8f3615488f75e2a18a105c76 |