| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.mrql; |
| |
| import org.apache.mrql.gen.*; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.io.*; |
| import org.apache.spark.rdd.RDD; |
| import org.apache.spark.api.java.JavaRDD; |
| import org.apache.spark.streaming.*; |
| import org.apache.spark.streaming.api.java.*; |
| import org.apache.spark.streaming.dstream.*; |
| import scala.collection.Seq; |
| import scala.collection.JavaConversions; |
| import scala.runtime.AbstractFunction2; |
| import org.apache.spark.storage.RDDInfo; |
| |
| |
/** Evaluates physical plans in Apache Spark stream mode.
 *
 *  NOTE(review): this file is Gen source (MRQL's Java preprocessor dialect),
 *  not plain Java — the match/case blocks, backquote patterns (`x), sequence
 *  patterns (...args), and #<...> quasi-quotes are expanded to Java by the
 *  Gen tool before compilation.
 */
public class SparkStreaming extends SparkEvaluator {
    // ClassTag required by the Scala-facing Spark APIs to construct RDD<MRData>
    private final static scala.reflect.ClassTag<MRData> classtag = scala.reflect.ClassTag$.MODULE$.apply(MRData.class);
    // input DStreams discovered at the top of the query plan (parallel to stream_names)
    private static ArrayList<JavaInputDStream<MRData>> streams;
    // the plan variable name bound to each input stream (parallel to streams)
    private static ArrayList<String> stream_names;
    // count of processed batches; used only to stop after Config.stream_tries (see below)
    private static int tries = 0;

    /** Translate a stream-source plan node into a Spark input DStream.
     *  Handles three node shapes: BinaryStream (binary file source),
     *  ParsedStream (text file source with a named parser), and
     *  SocketStream (network source with a named parser).
     *  @param source the source plan node
     *  @param env    environment used to evaluate the node's argument expressions
     *  @return the Spark input stream delivering MRData batches
     *  @throws Error if the parser is unknown, the source directory cannot be
     *                dumped, or the source node shape is unrecognized
     */
    private static JavaInputDStream<MRData> stream_source ( Tree source, Environment env ) {
        match source {
        case BinaryStream(`file,_):
            String path = ((MR_string)evalE(file,env)).get();
            // constructed for its side effects only (presumably registers the
            // data source with the plan state — the instance is discarded)
            new BinaryDataSource(path,Plan.conf);
            return new SparkFileInputStream(stream_context,path,true);
        case ParsedStream(`parser,`file,...args):
            String path = ((MR_string)evalE(file,env)).get();
            // parsers are looked up by name in a global directory
            Class<? extends Parser> p = DataSource.parserDirectory.get(parser.toString());
            if (p == null)
                throw new Error("Unknown parser: "+parser);
            // constructed for its side effects only (instance is discarded)
            new ParsedDataSource(path,p,args,Plan.conf);
            try {
                dump_source_dir();
            } catch (IOException ex) {
                throw new Error("Cannot dump source directory");
            };
            return new SparkFileInputStream(stream_context,path,false);
        case SocketStream(`parser,`hostname,`port,...args):
            String h = ((MR_string)evalE(hostname,env)).get();
            int p = ((MR_int)evalE(port,env)).get();
            return new SparkSocketStream(stream_context,h,p,parser.toString(),args);
        };
        throw new Error("Unknown stream source: "+print_query(source));
    }

    /** Peel Stream(lambda(v,b),source) bindings off the top of the plan,
     *  recording each input stream in {@code streams} and its bound variable
     *  name in {@code stream_names} (kept in lockstep).
     *  @param plan the (possibly Stream-wrapped) physical plan
     *  @param env  environment used to evaluate stream-source arguments
     *  @return the residual plan underneath all Stream bindings
     */
    private static Tree get_streams ( Tree plan, Environment env ) {
        match plan {
        case Stream(lambda(`v,`b),`source):
            streams.add(stream_source(source,env));
            stream_names.add(v.toString());
            return get_streams(b,env);
        };
        return plan;
    }

    // the active evaluator, used to evaluate plan fragments into RDDs
    private final static SparkEvaluator ef = (SparkEvaluator)Evaluator.evaluator;

    /** bind the pattern variables to values */
    // Recursively destructures tuple(...) patterns against tuple(...) sources
    // (pairing components positionally), then evaluates the source expression
    // to an RDD and publishes it in the global RDD environment under the
    // pattern's name — replacing any previous binding for that name.
    // Variables in repeat_variables additionally get a distributed binding.
    private static void bind_list ( Tree pattern, Tree src, Environment env, Environment rdd_env ) {
        match pattern {
        case tuple(...ps):
            int i = 0;
            match src {
            case tuple(...es):
                for ( Tree p: ps )
                    bind_list(p,es.nth(i++),env,rdd_env);
            }
        };
        // NOTE(review): for a tuple pattern this still evaluates and binds the
        // whole src under the pattern's printed name, after binding components
        MRData value = new MR_rdd(ef.eval(src,env,rdd_env));
        if (variable_lookup(pattern.toString(),global_rdds) == null)
            global_rdds = new Environment(pattern.toString(),value,global_rdds);
        else global_rdds.replace(pattern.toString(),value);
        if (repeat_variables.member(pattern))
            Interpreter.new_distributed_binding(pattern.toString(),value);
    }

    /** Register a transformed DStream that, for every batch, binds each input
     *  RDD to its stream variable, evaluates the plan, and applies {@code f}
     *  to the result.
     *  @param plan        residual plan; lambda(pat,e) triggers the incremental
     *                     path via bind_list, anything else is evaluated whole
     *  @param env         variable environment for plan evaluation
     *  @param dataset_env base RDD environment extended with the batch RDDs
     *  @param f           continuation applied per batch (null arg on the
     *                     incremental path, the evaluated value otherwise)
     *  @throws Error if no input streams were collected from the query
     */
    private static void stream_processing ( final Tree plan, final Environment env,
                                            final Environment dataset_env, final Function f ) {
        if (streams.size() == 0)
            throw new Error("No input streams in query");
        ArrayList<DStream<?>> rdds = new ArrayList<DStream<?>>();
        for ( JavaDStream<MRData> jd: streams )
            rdds.add(jd.dstream());
        final ArrayList<String> vars = stream_names;
        // Scala-side batch callback: receives one RDD per input stream plus the
        // batch time; its MRData return value is unused (always an empty RDD)
        final AbstractFunction2<Seq<RDD<?>>,Time,RDD<MRData>> fnc =
              new AbstractFunction2<Seq<RDD<?>>,Time,RDD<MRData>>() {
            public RDD<MRData> apply ( Seq<RDD<?>> rdds, Time tm ) {
                long t = System.currentTimeMillis();
                Environment rdd_env = dataset_env;
                int i = 0;
                for ( RDD<?> rdd: JavaConversions.seqAsJavaList(rdds) ) {
                    try {
                        // skip the whole batch if any input is empty;
                        // count() may also throw on an unavailable RDD
                        if (rdd.count() == 0)
                            return SparkEvaluator.spark_context.sc().emptyRDD(classtag);
                    } catch (Exception ex) {
                        return SparkEvaluator.spark_context.sc().emptyRDD(classtag);
                    };
                    // bind this batch's RDD both globally and in the local env
                    MRData d = new MR_rdd(new JavaRDD(rdd,classtag));
                    global_rdds = new Environment(vars.get(i),d,global_rdds);
                    rdd_env = new Environment(vars.get(i++),d,rdd_env);
                };
                match plan {
                case lambda(`pat,`e):
                    // incremental streaming: rebind pattern vars, then signal f
                    bind_list(pat,e,env,rdd_env);
                    f.eval(null);
                case _: // non-incremental streaming
                    f.eval(ef.evalE(plan,env));
                };
                if (!Config.quiet_execution)
                    System.out.println("Run time: "+(System.currentTimeMillis()-t)/1000.0+" secs");
                // release the batch RDDs eagerly to bound memory use
                for ( RDD<?> rdd: JavaConversions.seqAsJavaList(rdds) )
                    rdd.unpersist(true);
                GC_cached_rdds();
                System.gc();
                // anything still cached at this point is a leak — report it
                for ( RDDInfo rdi: SparkEvaluator.spark_context.sc().getRDDStorageInfo() )
                    System.err.println("Warning: cached RDD "+rdi);
                if (++tries >= Config.stream_tries) { // for performance experiments only
                    tries = 0;
                    SparkEvaluator.stream_context.stop(true,true);
                    System.exit(0);
                };
                return SparkEvaluator.spark_context.sc().emptyRDD(classtag);
            }
        };
        // register() attaches the transformed stream as an output so Spark schedules it
        new TransformedDStream<MRData>(JavaConversions.asScalaBuffer(rdds).toSeq(),fnc,classtag).register();
    }

    /** evaluate plan in stream mode: evaluate each batch of data and apply the function f */
    // Collects the input streams from the plan, wires up the per-batch
    // processing, then starts the streaming context and blocks until it
    // terminates. A top-level lambda(p,b) selects the incremental path
    // (the lambda is re-wrapped around the stripped body via quasi-quotation).
    final public static void evaluate ( Tree plan, Environment env, Environment dataset_env, Function f ) {
        streams = new ArrayList<JavaInputDStream<MRData>>();
        stream_names = new ArrayList<String>();
        match plan {
        case lambda(`p,`b):
            b = get_streams(b,env);
            stream_processing(#<lambda(`p,`b)>,env,null,f);
        case _: // non-incremental streaming
            plan = get_streams(plan,env);
            stream_processing(plan,env,dataset_env,f);
        };
        stream_context.start();
        stream_context.awaitTermination();   // blocks the caller for the stream's lifetime
    }
}