/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.hdfs2;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hyracks.api.client.NodeControllerInfo;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.hdfs.ContextFactory;
import org.apache.hyracks.hdfs2.dataflow.FileSplitsFactory;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
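/**
* Helper class for reading XML collections from HDFS: it locates the Hadoop configuration,
* builds input splits for a collection, schedules the splits across the available node
* controllers, and creates record readers for those splits.
*/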
public class HDFSFunctions {
private Configuration conf;
private FileSystem fs;
private String confPath;
private Job job;
private InputFormat inputFormat;
private List<InputSplit> splits;
private ArrayList<ArrayList<String>> nodes;
private HashMap<Integer, String> schedule;
private static final String TEMP = "java.io.tmpdir";
private static final String DFS_PATH = "vxquery_splits_schedule.txt";
private static final String FILEPATH = System.getProperty(TEMP) + File.separator + "splits_schedule.txt";
protected static final Logger LOGGER = Logger.getLogger(HDFSFunctions.class.getName());
private final Map<String, NodeControllerInfo> nodeControllerInfos;
/**
* Create the Hadoop configuration and remember the directory whose core-site and hdfs-site files
* will later be added as resources. The HDFS FileSystem itself is initialized by getFileSystem().
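* <p>
* A minimal usage sketch (the configuration directory, input path, and tag below are
* illustrative placeholders; exception handling is omitted):
* </p>
* <pre>{@code
* HDFSFunctions hdfs = new HDFSFunctions(nodeControllerInfos, "/etc/hadoop/conf");
* FileSystem fs = hdfs.getFileSystem();
* if (fs != null) {
*     hdfs.setJob("/user/vxquery/books.xml", "book");
*     hdfs.scheduleSplits();
* }
* }</pre>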
*
* @param nodeControllerInfos
* Map of the node to its attributes
* @param hdfsConf
* Path to the directory that contains the HDFS configuration files.
*/
public HDFSFunctions(Map<String, NodeControllerInfo> nodeControllerInfos, String hdfsConf) {
this.conf = new Configuration();
this.nodeControllerInfos = nodeControllerInfos;
this.confPath = hdfsConf;
}
/**
* Create the job, input format, and input splits needed for reading the file at the given path.
* This method should run before the scheduleSplits method.
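* <p>
* Illustrative call (the path and tag are placeholders):
* </p>
* <pre>{@code
* hdfs.setJob("/user/vxquery/books.xml", "book");
* List<InputSplit> splits = hdfs.getSplits();
* }</pre>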
*
* @param filepath
* HDFS path of the XML collection to read.
* @param tag
* Element tag that delimits each item in the collection.
*/
@SuppressWarnings({ "deprecation", "unchecked" })
public void setJob(String filepath, String tag) {
try {
conf.set("start_tag", "<" + tag + ">");
conf.set("end_tag", "</" + tag + ">");
job = new Job(conf, "Read from HDFS");
Path input = new Path(filepath);
FileInputFormat.addInputPath(job, input);
job.setInputFormatClass(XmlCollectionWithTagInputFormat.class);
inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
splits = inputFormat.getSplits(job);
} catch (IOException | ClassNotFoundException | InterruptedException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
}
/**
* Returns true if the given file path exists in HDFS, or if a file with that name is found
* anywhere under the calling user's home directory (subdirectories included).
*
* @param filename
* HDFS file path.
* @return true if the file is located in HDFS; false otherwise.
* @throws IOException
* If searching for the filepath throws {@link IOException}
*/
public boolean isLocatedInHDFS(String filename) throws IllegalArgumentException, IOException {
//search file path
if (fs.exists(new Path(filename))) {
return true;
}
return searchInDirectory(fs.getHomeDirectory(), filename) != null;
}
/**
* Searches the given directory for the file.
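* <p>
* Illustrative call (the filename is a placeholder; fs is the FileSystem returned by getFileSystem()):
* </p>
* <pre>{@code
* Path found = hdfs.searchInDirectory(fs.getHomeDirectory(), "books.xml");
* }</pre>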
*
* @param directory
* Directory to search recursively.
* @param filename
* Name of the file to look for.
* @return the path of the file if it exists under the directory; otherwise null.
*/
public Path searchInDirectory(Path directory, String filename) {
//Search the files and folder in this Path to find the one matching the filename.
try {
RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
String[] parts;
Path path;
while (it.hasNext()) {
path = it.next().getPath();
parts = path.toString().split("/");
if (parts[parts.length - 1].equals(filename)) {
return path;
}
}
} catch (IOException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
return null;
}
/**
* Locate the directory that holds the HDFS configuration. If no configuration path was supplied
* to the constructor, fall back to the HADOOP_CONF_DIR environment variable.
*
* @return true if a configuration directory path is available.
*/
private boolean locateConf() {
if (this.confPath == null) {
//As a last resort, try getting the configuration from the system environment
//Some systems won't have this set.
this.confPath = System.getenv("HADOOP_CONF_DIR");
}
return this.confPath != null;
}
/**
* Upload a file or directory to HDFS. filepath is the path in the local file system and dir is
* the destination path in HDFS. Any existing destination is deleted first.
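* <p>
* Illustrative call (the local and HDFS paths are placeholders):
* </p>
* <pre>{@code
* boolean uploaded = hdfs.put("/tmp/splits_schedule.txt", "vxquery_splits_schedule.txt");
* }</pre>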
*
* @param filepath
* file to upload
* @param dir
* HDFS directory to save the file
* @return true if the upload succeeded; false otherwise.
*/
public boolean put(String filepath, String dir) {
if (this.fs != null) {
Path path = new Path(filepath);
Path dest = new Path(dir);
try {
if (fs.exists(dest)) {
fs.delete(dest, true); //recursive delete
}
} catch (IOException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
try {
fs.copyFromLocalFile(path, dest);
// Report success only after the copy completes without throwing.
return true;
} catch (IOException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
}
return false;
}
/**
* Get an instance of the HDFS file system if it is configured correctly.
* Return null if there is no instance.
*
* @return FileSystem
*/
public FileSystem getFileSystem() {
if (locateConf()) {
conf.addResource(new Path(this.confPath + "/core-site.xml"));
conf.addResource(new Path(this.confPath + "/hdfs-site.xml"));
try {
fs = FileSystem.get(conf);
return this.fs;
} catch (IOException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
} else {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe("Could not locate HDFS configuration folder.");
}
}
return null;
}
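/**
* Group the input splits by the host that stores them, using each split's location info.
*
* @return map from hostname to the indexes of the splits stored on that host.
* @throws IOException
* If reading the split location info fails.
*/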
public HashMap<String, ArrayList<Integer>> getLocationsOfSplits() throws IOException {
HashMap<String, ArrayList<Integer>> splitsMap = new HashMap<>();
ArrayList<Integer> temp;
int i = 0;
String hostname;
for (InputSplit s : this.splits) {
SplitLocationInfo[] info = s.getLocationInfo();
hostname = info[0].getLocation();
if (splitsMap.containsKey(hostname)) {
temp = splitsMap.get(hostname);
temp.add(i);
} else {
temp = new ArrayList<>();
temp.add(i);
splitsMap.put(hostname, temp);
}
i++;
}
return splitsMap;
}
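/**
* Build the split schedule: each split is first assigned to a node that stores it locally, and
* any remaining splits are distributed round-robin over the nodes that received no local splits.
*/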
public void scheduleSplits() throws IOException, ParserConfigurationException, SAXException {
schedule = new HashMap<>();
ArrayList<String> empty = new ArrayList<>();
HashMap<String, ArrayList<Integer>> splitsMap = this.getLocationsOfSplits();
readNodesFromXML();
int count = this.splits.size();
String node;
for (ArrayList<String> info : this.nodes) {
node = info.get(1);
if (splitsMap.containsKey(node)) {
for (Integer split : splitsMap.get(node)) {
schedule.put(split, node);
count--;
}
splitsMap.remove(node);
} else {
empty.add(node);
}
}
//Check if every split got assigned to a node
if (count != 0) {
ArrayList<Integer> remaining = new ArrayList<>();
// Find the splits that have not been assigned to a node yet.
for (int i = 0; i < this.splits.size(); i++) {
if (!schedule.containsKey(i)) {
remaining.add(i);
}
}
if (!empty.isEmpty()) {
int nodeNumber = 0;
for (int split : remaining) {
if (nodeNumber == empty.size()) {
nodeNumber = 0;
}
schedule.put(split, empty.get(nodeNumber));
nodeNumber++;
}
}
}
}
/**
* Collect the node id and IP address of every node controller from the node controller info map.
* Save the information inside nodes.
*/
public void readNodesFromXML() {
nodes = new ArrayList<>();
for (NodeControllerInfo ncInfo : nodeControllerInfos.values()) {
//Will this include the master node? Is that bad?
ArrayList<String> info = new ArrayList<>();
info.add(ncInfo.getNodeId());
info.add(ncInfo.getNetworkAddress().getAddress());
nodes.add(info);
}
}
/**
* Writes the schedule to a temporary file, then uploads the file to HDFS.
*
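* Each schedule entry is written as "splitIndex,nodeId", one entry per line.
*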
* @throws UnsupportedEncodingException
* If the UTF-8 encoding is not supported.
* @throws FileNotFoundException
* If the temporary schedule file cannot be created.
*/
public void addScheduleToDistributedCache() throws FileNotFoundException, UnsupportedEncodingException {
PrintWriter writer = new PrintWriter(FILEPATH, "UTF-8");
for (int split : this.schedule.keySet()) {
// Write each entry on its own line so the schedule can be parsed back entry by entry.
writer.println(split + "," + this.schedule.get(split));
}
writer.close();
// Add file to HDFS
this.put(FILEPATH, DFS_PATH);
}
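/**
* Create and initialize a RecordReader for the first input split that can be read successfully.
*
* @return an initialized reader, or null if no reader could be created.
*/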
public RecordReader getReader() {
List<FileSplit> fileSplits = new ArrayList<>();
for (int i = 0; i < splits.size(); i++) {
fileSplits.add((FileSplit) splits.get(i));
}
FileSplitsFactory splitsFactory;
try {
splitsFactory = new FileSplitsFactory(fileSplits);
List<FileSplit> inputSplits = splitsFactory.getSplits();
ContextFactory ctxFactory = new ContextFactory();
int size = inputSplits.size();
for (int i = 0; i < size; i++) {
// Read the split.
TaskAttemptContext context;
try {
context = ctxFactory.createContext(job.getConfiguration(), i);
RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
reader.initialize(inputSplits.get(i), context);
return reader;
} catch (IOException | InterruptedException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
}
} catch (HyracksDataException e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
return null;
}
/**
* @return schedule.
*/
public HashMap<Integer, String> getSchedule() {
return this.schedule;
}
/**
* Return the splits belonging to this node for the existing schedule.
*
* @param node
* Node identifier as stored in the schedule.
* @return list of the split indexes assigned to the given node.
*/
public ArrayList<Integer> getScheduleForNode(String node) {
ArrayList<Integer> nodeSchedule = new ArrayList<>();
for (int split : this.schedule.keySet()) {
if (node.equals(this.schedule.get(split))) {
nodeSchedule.add(split);
}
}
return nodeSchedule;
}
public List<InputSplit> getSplits() {
return this.splits;
}
public Job getJob() {
return this.job;
}
public InputFormat getinputFormat() {
return this.inputFormat;
}
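/**
* Parse the given XML string into a DOM Document.
*
* @param xmlStr
* XML content to parse.
* @return the parsed Document, or null if parsing fails.
*/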
public Document convertStringToDocument(String xmlStr) {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder;
try {
builder = factory.newDocumentBuilder();
Document doc = builder.parse(new InputSource(new StringReader(xmlStr)));
return doc;
} catch (Exception e) {
if (LOGGER.isLoggable(Level.SEVERE)) {
LOGGER.severe(e.getMessage());
}
}
return null;
}
}