# sdk_python/amaterasu/pandas/datasets.py - incubator-retired-amaterasu - Git at Google

 """
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 """
 import os

 import pandas as pd
 from pyhive import hive
 from typing import Type, Dict, Any

 from amaterasu.base.datasets import BaseDatasetLoader, BaseDatasetManager, DatasetTypes


 class HiveDatasetLoader(BaseDatasetLoader):
     """Load and persist pandas DataFrames from/to a Hive table through PyHive.

     Reads ``dataset_conf['host']``, ``dataset_conf['table']`` and the optional
     ``dataset_conf['port']`` (default ``10000``).

     NOTE: the table name is interpolated directly into the SQL text (an
     identifier cannot be bound as a query parameter), so it must come from
     trusted configuration.
     """

     # Rows sent per INSERT statement, so a large frame is not rendered into
     # one enormous SQL string.
     _INSERT_BATCH_SIZE = 1000

     # numpy dtype kind -> Hive column type; any other kind is stored as STRING.
     _HIVE_TYPES = {'b': 'BOOLEAN', 'i': 'BIGINT', 'u': 'BIGINT', 'f': 'DOUBLE', 'M': 'TIMESTAMP'}

     def _connect(self):
         """Open a new PyHive connection to the configured host and port."""
         return hive.connect(host=self.dataset_conf['host'], port=self.dataset_conf.get('port', 10000))

     def load_dataset(self) -> pd.DataFrame:
         """Return every row of the configured table as a DataFrame.

         The frame is built from the fetched rows only, so its columns are
         labelled positionally.
         NOTE(review): column names available on ``cursor.description`` are not
         applied, to keep the returned labels unchanged for existing callers.
         """
         connection = self._connect()
         try:
             with connection.cursor() as cursor:
                 cursor.execute("SELECT * FROM {}".format(self.dataset_conf['table']))
                 data = list(cursor.fetchall())
             return pd.DataFrame.from_records(data)
         finally:
             # Release the connection even when the query fails.
             connection.close()

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool = False, keep_index=False):
         """Write ``dataset`` into the configured Hive table.

         The table is created when it does not exist, with column types derived
         from the frame's dtypes, and the rows are appended with batched
         ``INSERT ... VALUES`` statements.

         :param dataset: the frame to store.
         :param overwrite: drop the existing table first instead of appending.
         :param keep_index: also store the frame's index as leading column(s).
         :raises ValueError: if there are no columns to store.
         """
         table = self.dataset_conf['table']
         frame = dataset.reset_index() if keep_index else dataset
         # Validate before touching Hive so a bad input can never drop the table.
         if len(frame.columns) == 0:
             raise ValueError("Cannot persist a dataset without columns to Hive table '{}'".format(table))

         column_defs = ', '.join(
             '`{}` {}'.format(name, self._HIVE_TYPES.get(getattr(dtype, 'kind', 'O'), 'STRING'))
             for name, dtype in frame.dtypes.items())
         # Plain Python objects with None for missing values, so they are
         # rendered as SQL literals / NULL rather than as 'nan'.
         values = frame.astype(object).where(frame.notna(), None)
         rows = list(values.itertuples(index=False, name=None))
         row_placeholder = '({})'.format(', '.join(['%s'] * len(frame.columns)))

         connection = self._connect()
         try:
             with connection.cursor() as cursor:
                 if overwrite:
                     # IF EXISTS: overwriting a table that is not there yet is not an error.
                     cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
                 cursor.execute('CREATE TABLE IF NOT EXISTS {} ({})'.format(table, column_defs))
                 for start in range(0, len(rows), self._INSERT_BATCH_SIZE):
                     batch = rows[start:start + self._INSERT_BATCH_SIZE]
                     statement = 'INSERT INTO TABLE {} VALUES {}'.format(
                         table, ', '.join([row_placeholder] * len(batch)))
                     # Values are bound as parameters so the driver escapes them.
                     cursor.execute(statement, [value for row in batch for value in row])
         finally:
             connection.close()


 class _FileCSVDatasetLoader(BaseDatasetLoader):
     """Read and write a pandas DataFrame as a delimited text (CSV) file.

     Uses ``dataset_conf['uri']`` as the file location and the optional
     ``dataset_conf['separator']`` (default ``','``) as the field delimiter.
     """

     def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
         """Read the configured file; extra arguments are accepted but unused."""
         separator = self.dataset_conf.get('separator', ',')
         return pd.read_csv(self.dataset_conf['uri'], sep=separator)

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
         """Write ``dataset`` to the configured file.

         :param dataset: the frame to write.
         :param overwrite: not consulted here; this method always writes.
         """
         separator = self.dataset_conf.get('separator', ',')
         # Write to 'uri', the same key load_dataset reads from, so a persisted
         # dataset can be loaded back from the same configuration.
         dataset.to_csv(self.dataset_conf['uri'], sep=separator)


 class _FileJSONDatasetLoader(BaseDatasetLoader):
     """Read and write a pandas DataFrame as a JSON file.

     Uses ``dataset_conf['uri']`` as the file location and forwards the
     optional ``dataset_conf['orient']`` (``None`` when absent) to pandas.
     """

     def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
         """Read the configured file; extra arguments are accepted but unused."""
         read_options = {'orient': self.dataset_conf.get('orient')}
         return pd.read_json(self.dataset_conf['uri'], **read_options)

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
         """Write ``dataset`` to the configured file; ``overwrite`` is not consulted here."""
         write_options = {'orient': self.dataset_conf.get('orient')}
         dataset.to_json(self.dataset_conf['uri'], **write_options)


 class _FileExcelDatasetLoader(BaseDatasetLoader):
     """Read and write a pandas DataFrame as one worksheet of an Excel file.

     Requires both ``dataset_conf['worksheet']`` and ``dataset_conf['uri']``;
     a missing key raises ``KeyError``.
     """

     def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
         """Read the configured worksheet; extra arguments are accepted but unused."""
         sheet = self.dataset_conf['worksheet']
         location = self.dataset_conf['uri']
         return pd.read_excel(location, sheet_name=sheet)

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
         """Write ``dataset`` to the configured worksheet; ``overwrite`` is not consulted here."""
         sheet = self.dataset_conf['worksheet']
         location = self.dataset_conf['uri']
         dataset.to_excel(location, sheet_name=sheet)


 class _FileParquetDatasetLoader(BaseDatasetLoader):
     """Read and write a pandas DataFrame as a Parquet file at ``dataset_conf['uri']``."""

     def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
         """Read the configured file; extra arguments are accepted but unused."""
         location = self.dataset_conf['uri']
         return pd.read_parquet(location)

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
         """Write ``dataset`` to the configured file; ``overwrite`` is not consulted here."""
         location = self.dataset_conf['uri']
         dataset.to_parquet(location)


 class _FilePickleDatasetLoader(BaseDatasetLoader):
     """Read and write a pandas DataFrame as a pickle file at ``dataset_conf['uri']``."""

     def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
         """Read the configured file; extra arguments are accepted but unused."""
         location = self.dataset_conf['uri']
         # NOTE(review): unpickling can execute arbitrary code; only point this
         # loader at files from a trusted source.
         return pd.read_pickle(location)

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
         """Write ``dataset`` to the configured file; ``overwrite`` is not consulted here."""
         location = self.dataset_conf['uri']
         dataset.to_pickle(location)


 class FileDatasetLoader(BaseDatasetLoader):
     """Dispatch loading and persisting to a format-specific file loader.

     The concrete loader is picked from ``_registered_loaders`` using
     ``dataset_conf['format']``.
     """

     # Maps the configured 'format' value to the loader class that handles it.
     _registered_loaders : Dict[str, Type[BaseDatasetLoader]] = {
         'json': _FileJSONDatasetLoader,
         'excel': _FileExcelDatasetLoader,
         'csv': _FileCSVDatasetLoader,
         'pickle': _FilePickleDatasetLoader,
         'parquet': _FileParquetDatasetLoader
     }

     def __init__(self, dataset_conf: Dict):
         """Create the concrete loader for the configured file format.

         :param dataset_conf: dataset configuration; ``'format'`` selects the loader.
         :raises NotImplementedError: if the format is missing or not registered.
         """
         super().__init__(dataset_conf)
         file_format = self.dataset_conf.get('format')
         loader_cls = self._registered_loaders.get(file_format)
         if loader_cls is None:
             # NotImplementedError is the exception class; NotImplemented is a
             # non-callable constant and "raising" it ends in a TypeError.
             raise NotImplementedError(
                 "File with format '{}' is not supported, please handle it manually".format(file_format))
         # Built outside any try block so an error raised by the concrete
         # loader itself is not mistaken for an unsupported format.
         self._concrete_loader = loader_cls(self.dataset_conf)

     def load_dataset(self) -> pd.DataFrame:
         """Load the dataset through the concrete, format-specific loader."""
         return self._concrete_loader.load_dataset()

     def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool = False):
         """Persist ``dataset`` through the concrete, format-specific loader.

         :param dataset: the frame to write.
         :param overwrite: allow replacing a file that already exists.
         :raises IOError: if the target exists and ``overwrite`` is false.
         """
         # NOTE(review): os.path.exists only recognises local filesystem paths;
         # confirm whether 'uri' may carry a scheme such as file:// or s3://.
         if not overwrite and os.path.exists(self.dataset_conf['uri']):
             raise IOError("File with path '{}' already exists.".format(self.dataset_conf['uri']))
         self._concrete_loader.persist_dataset(dataset, overwrite)


 class DatasetManager(BaseDatasetManager):
     """Dataset manager that registers the pandas-based Hive and file loaders."""

     def __init__(self, dataset_conf, spark):
         """Initialise the base manager and register the loader classes.

         :param dataset_conf: passed unchanged to the base class constructor.
         :param spark: stored as ``self.spark``; not used elsewhere in this class.
         """
         super().__init__(dataset_conf)
         self.spark = spark
         loaders = (
             (DatasetTypes.Hive, HiveDatasetLoader),
             (DatasetTypes.File, FileDatasetLoader),
         )
         for dataset_type, loader_cls in loaders:
             self._registered_datastores[dataset_type.value] = loader_cls

     def get_datastore(self, datastore_cls: Type[BaseDatasetLoader], dataset_conf: Dict):
         """Instantiate ``datastore_cls`` with ``dataset_conf`` and return it."""
         return datastore_cls(dataset_conf)
	"""
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	"""
	import os

	import pandas as pd
	from pyhive import hive
	from typing import Type, Dict, Any

	from amaterasu.base.datasets import BaseDatasetLoader, BaseDatasetManager, DatasetTypes


	class HiveDatasetLoader(BaseDatasetLoader):
	    """Load and persist pandas DataFrames from/to a Hive table through PyHive.

	    Reads ``dataset_conf['host']``, ``dataset_conf['table']`` and the optional
	    ``dataset_conf['port']`` (default ``10000``).

	    NOTE: the table name is interpolated directly into the SQL text (an
	    identifier cannot be bound as a query parameter), so it must come from
	    trusted configuration.
	    """

	    # Rows sent per INSERT statement, so a large frame is not rendered into
	    # one enormous SQL string.
	    _INSERT_BATCH_SIZE = 1000

	    # numpy dtype kind -> Hive column type; any other kind is stored as STRING.
	    _HIVE_TYPES = {'b': 'BOOLEAN', 'i': 'BIGINT', 'u': 'BIGINT', 'f': 'DOUBLE', 'M': 'TIMESTAMP'}

	    def _connect(self):
	        """Open a new PyHive connection to the configured host and port."""
	        return hive.connect(host=self.dataset_conf['host'], port=self.dataset_conf.get('port', 10000))

	    def load_dataset(self) -> pd.DataFrame:
	        """Return every row of the configured table as a DataFrame.

	        The frame is built from the fetched rows only, so its columns are
	        labelled positionally.
	        NOTE(review): column names available on ``cursor.description`` are not
	        applied, to keep the returned labels unchanged for existing callers.
	        """
	        connection = self._connect()
	        try:
	            with connection.cursor() as cursor:
	                cursor.execute("SELECT * FROM {}".format(self.dataset_conf['table']))
	                data = list(cursor.fetchall())
	            return pd.DataFrame.from_records(data)
	        finally:
	            # Release the connection even when the query fails.
	            connection.close()

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool = False, keep_index=False):
	        """Write ``dataset`` into the configured Hive table.

	        The table is created when it does not exist, with column types derived
	        from the frame's dtypes, and the rows are appended with batched
	        ``INSERT ... VALUES`` statements.

	        :param dataset: the frame to store.
	        :param overwrite: drop the existing table first instead of appending.
	        :param keep_index: also store the frame's index as leading column(s).
	        :raises ValueError: if there are no columns to store.
	        """
	        table = self.dataset_conf['table']
	        frame = dataset.reset_index() if keep_index else dataset
	        # Validate before touching Hive so a bad input can never drop the table.
	        if len(frame.columns) == 0:
	            raise ValueError("Cannot persist a dataset without columns to Hive table '{}'".format(table))

	        column_defs = ', '.join(
	            '`{}` {}'.format(name, self._HIVE_TYPES.get(getattr(dtype, 'kind', 'O'), 'STRING'))
	            for name, dtype in frame.dtypes.items())
	        # Plain Python objects with None for missing values, so they are
	        # rendered as SQL literals / NULL rather than as 'nan'.
	        values = frame.astype(object).where(frame.notna(), None)
	        rows = list(values.itertuples(index=False, name=None))
	        row_placeholder = '({})'.format(', '.join(['%s'] * len(frame.columns)))

	        connection = self._connect()
	        try:
	            with connection.cursor() as cursor:
	                if overwrite:
	                    # IF EXISTS: overwriting a table that is not there yet is not an error.
	                    cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
	                cursor.execute('CREATE TABLE IF NOT EXISTS {} ({})'.format(table, column_defs))
	                for start in range(0, len(rows), self._INSERT_BATCH_SIZE):
	                    batch = rows[start:start + self._INSERT_BATCH_SIZE]
	                    statement = 'INSERT INTO TABLE {} VALUES {}'.format(
	                        table, ', '.join([row_placeholder] * len(batch)))
	                    # Values are bound as parameters so the driver escapes them.
	                    cursor.execute(statement, [value for row in batch for value in row])
	        finally:
	            connection.close()


	class _FileCSVDatasetLoader(BaseDatasetLoader):
	    """Read and write a pandas DataFrame as a delimited text (CSV) file.

	    Uses ``dataset_conf['uri']`` as the file location and the optional
	    ``dataset_conf['separator']`` (default ``','``) as the field delimiter.
	    """

	    # Extra arguments stay optional (*args, **kwargs) so the loader can be
	    # called with no arguments at all.
	    def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
	        """Read the configured file; extra arguments are accepted but unused."""
	        separator = self.dataset_conf.get('separator', ',')
	        return pd.read_csv(self.dataset_conf['uri'], sep=separator)

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
	        """Write ``dataset`` to the configured file.

	        :param dataset: the frame to write.
	        :param overwrite: not consulted here; this method always writes.
	        """
	        separator = self.dataset_conf.get('separator', ',')
	        # Write to 'uri', the same key load_dataset reads from, so a persisted
	        # dataset can be loaded back from the same configuration.
	        dataset.to_csv(self.dataset_conf['uri'], sep=separator)


	class _FileJSONDatasetLoader(BaseDatasetLoader):
	    """Read and write a pandas DataFrame as a JSON file.

	    Uses ``dataset_conf['uri']`` as the file location and forwards the
	    optional ``dataset_conf['orient']`` (``None`` when absent) to pandas.
	    """

	    # Extra arguments stay optional (*args, **kwargs) so the loader can be
	    # called with no arguments at all.
	    def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
	        """Read the configured file; extra arguments are accepted but unused."""
	        orient = self.dataset_conf.get('orient')
	        return pd.read_json(self.dataset_conf['uri'], orient=orient)

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
	        """Write ``dataset`` to the configured file; ``overwrite`` is not consulted here."""
	        orient = self.dataset_conf.get('orient')
	        dataset.to_json(self.dataset_conf['uri'], orient=orient)


	class _FileExcelDatasetLoader(BaseDatasetLoader):
	    """Read and write a pandas DataFrame as one worksheet of an Excel file.

	    Requires both ``dataset_conf['worksheet']`` and ``dataset_conf['uri']``;
	    a missing key raises ``KeyError``.
	    """

	    # Extra arguments stay optional (*args, **kwargs) so the loader can be
	    # called with no arguments at all.
	    def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
	        """Read the configured worksheet; extra arguments are accepted but unused."""
	        worksheet = self.dataset_conf['worksheet']
	        return pd.read_excel(self.dataset_conf['uri'], sheet_name=worksheet)

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
	        """Write ``dataset`` to the configured worksheet; ``overwrite`` is not consulted here."""
	        worksheet = self.dataset_conf['worksheet']
	        dataset.to_excel(self.dataset_conf['uri'], sheet_name=worksheet)


	class _FileParquetDatasetLoader(BaseDatasetLoader):
	    """Read and write a pandas DataFrame as a Parquet file at ``dataset_conf['uri']``."""

	    # Extra arguments stay optional (*args, **kwargs) so the loader can be
	    # called with no arguments at all.
	    def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
	        """Read the configured file; extra arguments are accepted but unused."""
	        return pd.read_parquet(self.dataset_conf['uri'])

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
	        """Write ``dataset`` to the configured file; ``overwrite`` is not consulted here."""
	        dataset.to_parquet(self.dataset_conf['uri'])


	class _FilePickleDatasetLoader(BaseDatasetLoader):
	    """Read and write a pandas DataFrame as a pickle file at ``dataset_conf['uri']``."""

	    # Extra arguments stay optional (*args, **kwargs) so the loader can be
	    # called with no arguments at all.
	    def load_dataset(self, *args, **kwargs) -> pd.DataFrame:
	        """Read the configured file; extra arguments are accepted but unused."""
	        # NOTE(review): unpickling can execute arbitrary code; only point this
	        # loader at files from a trusted source.
	        return pd.read_pickle(self.dataset_conf['uri'])

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool):
	        """Write ``dataset`` to the configured file; ``overwrite`` is not consulted here."""
	        dataset.to_pickle(self.dataset_conf['uri'])


	class FileDatasetLoader(BaseDatasetLoader):
	    """Dispatch loading and persisting to a format-specific file loader.

	    The concrete loader is picked from ``_registered_loaders`` using
	    ``dataset_conf['format']``.
	    """

	    # Maps the configured 'format' value to the loader class that handles it.
	    _registered_loaders : Dict[str, Type[BaseDatasetLoader]] = {
	        'json': _FileJSONDatasetLoader,
	        'excel': _FileExcelDatasetLoader,
	        'csv': _FileCSVDatasetLoader,
	        'pickle': _FilePickleDatasetLoader,
	        'parquet': _FileParquetDatasetLoader
	    }

	    def __init__(self, dataset_conf: Dict):
	        """Create the concrete loader for the configured file format.

	        :param dataset_conf: dataset configuration; ``'format'`` selects the loader.
	        :raises NotImplementedError: if the format is missing or not registered.
	        """
	        super().__init__(dataset_conf)
	        file_format = self.dataset_conf.get('format')
	        loader_cls = self._registered_loaders.get(file_format)
	        if loader_cls is None:
	            # NotImplementedError is the exception class; NotImplemented is a
	            # non-callable constant and "raising" it ends in a TypeError.
	            raise NotImplementedError(
	                "File with format '{}' is not supported, please handle it manually".format(file_format))
	        # Built outside any try block so an error raised by the concrete
	        # loader itself is not mistaken for an unsupported format.
	        self._concrete_loader = loader_cls(self.dataset_conf)

	    def load_dataset(self) -> pd.DataFrame:
	        """Load the dataset through the concrete, format-specific loader."""
	        return self._concrete_loader.load_dataset()

	    def persist_dataset(self, dataset: pd.DataFrame, overwrite: bool = False):
	        """Persist ``dataset`` through the concrete, format-specific loader.

	        :param dataset: the frame to write.
	        :param overwrite: allow replacing a file that already exists.
	        :raises IOError: if the target exists and ``overwrite`` is false.
	        """
	        # NOTE(review): os.path.exists only recognises local filesystem paths;
	        # confirm whether 'uri' may carry a scheme such as file:// or s3://.
	        if not overwrite and os.path.exists(self.dataset_conf['uri']):
	            raise IOError("File with path '{}' already exists.".format(self.dataset_conf['uri']))
	        self._concrete_loader.persist_dataset(dataset, overwrite)


	class DatasetManager(BaseDatasetManager):
	    """Dataset manager that registers the pandas-based Hive and file loaders."""

	    def __init__(self, dataset_conf, spark):
	        """Initialise the base manager and register the loader classes.

	        :param dataset_conf: passed unchanged to the base class constructor.
	        :param spark: stored as ``self.spark``; not used elsewhere in this class.
	        """
	        super().__init__(dataset_conf)
	        self.spark = spark
	        loaders = (
	            (DatasetTypes.Hive, HiveDatasetLoader),
	            (DatasetTypes.File, FileDatasetLoader),
	        )
	        for dataset_type, loader_cls in loaders:
	            self._registered_datastores[dataset_type.value] = loader_cls

	    def get_datastore(self, datastore_cls: Type[BaseDatasetLoader], dataset_conf: Dict):
	        """Instantiate ``datastore_cls`` with ``dataset_conf`` and return it."""
	        return datastore_cls(dataset_conf)