blob: ab02eefb3e853b7772dac4116a31f54be52a3626 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
from pycarbon.sdk.CarbonReader import CarbonReader
from pycarbon.sdk.CarbonSchemaReader import CarbonSchemaReader
from pycarbon.sdk.CarbonWriter import CarbonWriter
import base64
import time
import shutil
import os
import jnius_config
jnius_config.set_classpath("../../../store/sdk/target/carbondata-sdk.jar")
IMAGE_DATA_PATH = "./resources"
def test_run_write_carbon():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
for i in range(0, 10):
from jnius import autoclass
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.withBatch(1000) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
i += len(rows)
assert 10 == i
reader.close()
carbonSchemaReader = CarbonSchemaReader()
schema = carbonSchemaReader.readSchema(path)
assert 3 == schema.getFieldsLength()
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
for i in range(0, 10):
from jnius import autoclass
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
writer.write(data_list.toArray())
writer.close()
carbonSchemaReader = CarbonSchemaReader()
schema = carbonSchemaReader.readSchema(getAsBuffer=False, path=path, validateSchema=True)
assert 3 == schema.getFieldsLength()
shutil.rmtree(path)
def test_run_write_carbon_binary_base64_encode():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
with open(jpg_path, mode='rb+') as file_object:
content = file_object.read()
for i in range(0, 10):
from jnius import autoclass
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
data_list.add(base64.b64encode(content))
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.withBatch(1000) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
for row in rows:
i += 1
for column in row:
from jnius.jnius import ByteArray
if 1 == i and isinstance(column, ByteArray) and len(column) > 1000:
with open(path + "/image.jpg", 'wb+') as file_object:
file_object.write(base64.b64decode(column.tostring()))
assert 10 == i
reader.close()
shutil.rmtree(path)
# TODO: to be supported
@pytest.mark.skip("write binary to be supported")
def test_run_write_carbon_binary():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
with open(jpg_path, mode='rb+') as file_object:
content = file_object.read()
for i in range(0, 10):
from jnius import autoclass
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
data_list.add(content)
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.withBatch(1000) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
for row in rows:
i += 1
for column in row:
from jnius.jnius import ByteArray
if 1 == i and isinstance(column, ByteArray) and len(column) > 1000:
with open(path + "/image.jpg", 'wb+') as file_object:
file_object.write(column.tostring())
assert 10 == i
reader.close()
shutil.rmtree(path)
def test_run_write_carbon_binary_base64_encode_many_files():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
jpg_path = IMAGE_DATA_PATH + "/flowers"
from jnius import autoclass
sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
for i in range(0, jpg_files.size()):
jpg_path = jpg_files.get(i)
with open(jpg_path, mode='rb+') as file_object:
content = file_object.read()
with open(str(jpg_path).replace('.jpg', '.txt'), mode='r+') as file_object:
txt = file_object.read()
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
data_list.add(base64.b64encode(content))
data_list.add(txt)
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
for row in rows:
i += 1
for column in row:
from jnius.jnius import ByteArray
if isinstance(column, ByteArray) and len(column) > 1000:
with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
file_object.write(base64.b64decode(column.tostring()))
assert 3 == i
reader.close()
shutil.rmtree(path)
def test_run_write_carbon_binary_base64_encode_voc():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
jpg_path = IMAGE_DATA_PATH + "/voc"
from jnius import autoclass
sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
for i in range(0, jpg_files.size()):
jpg_path = jpg_files.get(i)
with open(jpg_path, mode='rb+') as file_object:
content = file_object.read()
with open(str(jpg_path).replace('.jpg', '.xml'), mode='r+') as file_object:
txt = file_object.read()
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
data_list.add(base64.b64encode(content))
data_list.add(txt)
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.withBatch(1000) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
for row in rows:
i += 1
for column in row:
from jnius.jnius import ByteArray
if isinstance(column, ByteArray) and len(column) > 1000:
with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
file_object.write(base64.b64decode(column.tostring()))
assert 5 == i
reader.close()
shutil.rmtree(path)
def test_run_write_carbon_binary_base64_encode_vocForSegmentationClass():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{segField:binary}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
jpg_path = IMAGE_DATA_PATH + "/vocForSegmentationClass"
from jnius import autoclass
sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.build()
for i in range(0, jpg_files.size()):
jpg_path = jpg_files.get(i)
with open(jpg_path, mode='rb+') as file_object:
content = file_object.read()
with open(str(jpg_path).replace('.jpg', '.png'), mode='rb+') as file_object:
png_data = file_object.read()
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
data_list.add(base64.b64encode(content))
data_list.add(base64.b64encode(png_data))
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.withBatch(1000) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
for row in rows:
i += 1
num = 0
for column in row:
num += 1
from jnius.jnius import ByteArray
if isinstance(column, ByteArray) and len(column) > 1000:
with open(path + "/image" + str(i) + "_" + str(num) + ".jpg", 'wb+') as file_object:
file_object.write(base64.b64decode(column.tostring()))
assert 3 == i
reader.close()
shutil.rmtree(path)
def test_run_write_carbon_binary_base64_encode_decodeInJava_many_files():
jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
path = "/tmp/data/writeCarbon" + str(time.time())
if os.path.exists(path):
shutil.rmtree(path)
jpg_path = IMAGE_DATA_PATH + "/flowers"
from jnius import autoclass
sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
writer = CarbonWriter() \
.builder() \
.outputPath(path) \
.withCsvInput(jsonSchema) \
.writtenBy("pycarbon") \
.withLoadOption("binary_decoder", "base64") \
.withPageSizeInMb(1) \
.build()
for i in range(0, jpg_files.size()):
jpg_path = jpg_files.get(i)
with open(jpg_path, mode='rb+') as file_object:
content = file_object.read()
with open(str(jpg_path).replace('.jpg', '.txt'), mode='r+') as file_object:
txt = file_object.read()
arrayListClass = autoclass("java.util.ArrayList")
data_list = arrayListClass()
data_list.add("pycarbon")
data_list.add(str(i))
data_list.add(str(i * 10))
data_list.add(base64.b64encode(content))
data_list.add(txt)
writer.write(data_list.toArray())
writer.close()
reader = CarbonReader() \
.builder() \
.withFolder(path) \
.withBatch(1000) \
.build()
i = 0
while reader.hasNext():
rows = reader.readNextBatchRow()
for row in rows:
i += 1
for column in row:
from jnius.jnius import ByteArray
if isinstance(column, ByteArray) and len(column) > 1000 and i < 20:
with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
file_object.write((column.tostring()))
assert 3 == i
reader.close()
shutil.rmtree(path)