| /* |
| * Copyright 2009-2013 by The Regents of the University of California |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
 * You may obtain a copy of the License at
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package edu.uci.ics.pregelix.dataflow; |
| |
| import java.io.DataOutput; |
| import java.io.File; |
| import java.io.IOException; |
| import java.lang.reflect.InvocationTargetException; |
| import java.nio.ByteBuffer; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Writable; |
| import org.apache.hadoop.io.WritableComparable; |
| import org.apache.hadoop.mapreduce.InputSplit; |
| import org.apache.hadoop.mapreduce.TaskAttemptContext; |
| import org.apache.hadoop.mapreduce.lib.input.FileSplit; |
| |
| import edu.uci.ics.hyracks.api.context.IHyracksTaskContext; |
| import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable; |
| import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider; |
| import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor; |
| import edu.uci.ics.hyracks.api.exceptions.HyracksDataException; |
| import edu.uci.ics.hyracks.api.exceptions.HyracksException; |
| import edu.uci.ics.hyracks.api.job.JobSpecification; |
| import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder; |
| import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender; |
| import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils; |
| import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor; |
| import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryOutputSourceOperatorNodePushable; |
| import edu.uci.ics.hyracks.hdfs.ContextFactory; |
| import edu.uci.ics.hyracks.hdfs2.dataflow.FileSplitsFactory; |
| import edu.uci.ics.pregelix.api.graph.Vertex; |
| import edu.uci.ics.pregelix.api.io.VertexInputFormat; |
| import edu.uci.ics.pregelix.api.io.VertexReader; |
| import edu.uci.ics.pregelix.api.util.BspUtils; |
| import edu.uci.ics.pregelix.dataflow.base.IConfigurationFactory; |
| import edu.uci.ics.pregelix.dataflow.util.IterationUtils; |
| |
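/**
 * Scans vertex input files and pushes one (vertexId, vertex) tuple per input
 * vertex into the dataflow. Each partition claims the input splits scheduled
 * to its node and reads them with the job's {@link VertexInputFormat}; a
 * vertex too large to fit into a tree storage page is spilled to a per-job
 * HDFS temp path instead of being carried inline.
 * <p>
 * A rough construction sketch; the record descriptor, splits, locations,
 * configuration factory, and downstream operator below are placeholders for
 * job-specific values:
 *
 * <pre>
 * JobSpecification spec = new JobSpecification();
 * VertexFileScanOperatorDescriptor scanner = new VertexFileScanOperatorDescriptor(spec, recordDescriptor,
 *         inputSplits, scheduledLocations, confFactory);
 * spec.connect(new OneToOneConnectorDescriptor(spec), scanner, 0, downstreamOp, 0);
 * </pre>
 */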
| @SuppressWarnings("rawtypes") |
| public class VertexFileScanOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor { |
| private static final long serialVersionUID = 1L; |
| private final FileSplitsFactory splitsFactory; |
| private final IConfigurationFactory confFactory; |
    private final int fieldSize = 2; // two fields per output tuple: vertex id + vertex
| private final String[] scheduledLocations; |
| private final boolean[] executed; |
| |
| /** |
| * @param spec |
| */ |
| public VertexFileScanOperatorDescriptor(JobSpecification spec, RecordDescriptor rd, List<InputSplit> splits, |
| String[] scheduledLocations, IConfigurationFactory confFactory) throws HyracksException { |
| super(spec, 0, 1); |
| List<FileSplit> fileSplits = new ArrayList<FileSplit>(); |
| for (int i = 0; i < splits.size(); i++) { |
| fileSplits.add((FileSplit) splits.get(i)); |
| } |
| this.splitsFactory = new FileSplitsFactory(fileSplits); |
| this.confFactory = confFactory; |
| this.scheduledLocations = scheduledLocations; |
| this.executed = new boolean[scheduledLocations.length]; |
| Arrays.fill(executed, false); |
| this.recordDescriptors[0] = rd; |
| } |
| |
| @Override |
| public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, |
| IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions) |
| throws HyracksDataException { |
| final List<FileSplit> splits = splitsFactory.getSplits(); |
| |
| return new AbstractUnaryOutputSourceOperatorNodePushable() { |
| private final ContextFactory ctxFactory = new ContextFactory(); |
| private String jobId; |
| |
| @Override |
| public void initialize() throws HyracksDataException { |
| try { |
| Configuration conf = confFactory.createConfiguration(ctx); |
| |
                    // the job id determines the HDFS temp directory used for spilling oversized vertices
| jobId = BspUtils.getJobId(conf); |
| |
| writer.open(); |
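                    // every partition on a node walks the full schedule; the shared
                    // "executed" flags below ensure each split is loaded exactly once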
| for (int i = 0; i < scheduledLocations.length; i++) { |
| if (scheduledLocations[i].equals(ctx.getJobletContext().getApplicationContext().getNodeId())) { |
| /** |
| * pick one from the FileSplit queue |
| */ |
| synchronized (executed) { |
| if (!executed[i]) { |
| executed[i] = true; |
| } else { |
| continue; |
| } |
| } |
| loadVertices(ctx, conf, i); |
| } |
| } |
| writer.close(); |
| } catch (Exception e) { |
| throw new HyracksDataException(e); |
| } |
| } |
| |
| /** |
| * Load the vertices |
| * |
| * @parameter IHyracks ctx |
| * @throws IOException |
| * @throws IllegalAccessException |
| * @throws InstantiationException |
| * @throws ClassNotFoundException |
| * @throws InterruptedException |
| */ |
| @SuppressWarnings("unchecked") |
| private void loadVertices(final IHyracksTaskContext ctx, Configuration conf, int splitId) |
| throws IOException, ClassNotFoundException, InterruptedException, InstantiationException, |
| IllegalAccessException, NoSuchFieldException, InvocationTargetException { |
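                // a vertex tuple must fit into half a vertex frame to be bulk-loaded
                // into tree storage; anything larger is spilled to HDFS further below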
| int treeVertexSizeLimit = IterationUtils.getVFrameSize(ctx) / 2; |
| int dataflowPageSize = ctx.getFrameSize(); |
| ByteBuffer frame = ctx.allocateFrame(); |
| FrameTupleAppender appender = new FrameTupleAppender(dataflowPageSize); |
| appender.reset(frame, true); |
| |
| VertexInputFormat vertexInputFormat = BspUtils.createVertexInputFormat(conf); |
| InputSplit split = splits.get(splitId); |
| TaskAttemptContext mapperContext = ctxFactory.createContext(conf, splitId); |
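                // resolve user classes (input format, vertex, vertex value) through the task's classloader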
| mapperContext.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader()); |
| |
| VertexReader vertexReader = vertexInputFormat.createVertexReader(split, mapperContext); |
| vertexReader.initialize(split, mapperContext); |
| Vertex readerVertex = BspUtils.createVertex(mapperContext.getConfiguration()); |
| ArrayTupleBuilder tb = new ArrayTupleBuilder(fieldSize); |
| DataOutput dos = tb.getDataOutput(); |
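                // each output tuple has two fields: field 0 holds the vertex id, field 1 the serialized vertex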
| |
| IterationUtils.setJobContext(BspUtils.getJobId(conf), ctx, mapperContext); |
| Vertex.taskContext = mapperContext; |
| |
| /** |
| * empty vertex value |
| */ |
| Writable emptyVertexValue = BspUtils.createVertexValue(conf); |
| |
| while (vertexReader.nextVertex()) { |
| readerVertex = vertexReader.getCurrentVertex(); |
| tb.reset(); |
| if (readerVertex.getVertexId() == null) { |
| throw new IllegalArgumentException("loadVertices: Vertex reader returned a vertex " |
| + "without an id! - " + readerVertex); |
| } |
| if (readerVertex.getVertexValue() == null) { |
| readerVertex.setVertexValue(emptyVertexValue); |
| } |
| WritableComparable vertexId = readerVertex.getVertexId(); |
| vertexId.write(dos); |
| tb.addFieldEndOffset(); |
| |
| readerVertex.write(dos); |
| tb.addFieldEndOffset(); |
| |
                    if (tb.getSize() >= treeVertexSizeLimit || tb.getSize() > dataflowPageSize) {
                        // the vertex cannot fit into a tree storage page (or a dataflow frame):
                        // rebuild the tuple in spilled mode so the vertex content goes to HDFS
                        String pathStr = BspUtils.TMP_DIR + jobId + File.separator + vertexId;
                        readerVertex.setSpilled(pathStr);
                        tb.reset();
                        vertexId.write(dos);
                        tb.addFieldEndOffset();
                        // in spilled mode, write(...) stores the vertex content in HDFS
                        readerVertex.write(dos);
                        tb.addFieldEndOffset();
                        readerVertex.setUnSpilled();
                    }
                    if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                        if (appender.getTupleCount() <= 0) {
                            // the frame is empty, so this single tuple is larger than a frame
                            throw new IllegalStateException("tuple too large for an empty frame!");
                        }
                        // the frame is full: flush it downstream and retry on an empty frame
                        FrameUtils.flushFrame(frame, writer);
                        appender.reset(frame, true);
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            // unreachable unless the spill logic above failed: an oversized
                            // vertex should never enter the bulkload dataflow
                            throw new IllegalStateException(
                                    "failed to append to an empty frame: an oversized vertex should have been spilled to HDFS");
                        }
                    }
| } |
| |
| vertexReader.close(); |
| if (appender.getTupleCount() > 0) { |
| FrameUtils.flushFrame(frame, writer); |
| } |
| System.gc(); |
| } |
| }; |
| } |
| } |