| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.pig.scripting.jruby; |
| |
| import java.util.Iterator; |
| |
| import org.apache.pig.backend.executionengine.ExecException; |
| import org.apache.pig.data.BagFactory; |
| import org.apache.pig.data.DataBag; |
| import org.apache.pig.data.Tuple; |
| import org.apache.pig.data.TupleFactory; |
| |
| import org.jruby.Ruby; |
| import org.jruby.RubyArray; |
| import org.jruby.RubyBoolean; |
| import org.jruby.RubyClass; |
| import org.jruby.RubyEnumerator; |
| import org.jruby.RubyFixnum; |
| import org.jruby.RubyModule; |
| import org.jruby.RubyObject; |
| import org.jruby.RubyString; |
| import org.jruby.RubySymbol; |
| import org.jruby.anno.JRubyClass; |
| import org.jruby.anno.JRubyMethod; |
| import org.jruby.runtime.Block; |
| import org.jruby.runtime.ObjectAllocator; |
| import org.jruby.runtime.ThreadContext; |
| import org.jruby.runtime.builtin.IRubyObject; |
| |
| //TODO: need to fix the enumerator piece! |
| //TODO: need to fix the flatten semantics |
| |
| /** |
| * This provides a Ruby-esque way to interact with DataBag objects. It encapsulates |
| * a bag object, and provides an easy to use interface. One difference between the |
| * Ruby and the the Java API on DataBag is that in Ruby you iterate on the bag directly. |
| * <p> |
| * The RubyDataBag class uses JRuby's API for the defintion Ruby class using Java code. |
| * The comments in this class will more extensively explain the annotations for those not |
| * familiar with JRuby. |
| * <p> |
| * In JRuby, the annotations are provided for convenience, and are detected and used |
| * by the "defineAnnotatedMethods" method. The JRubyClass annotation sets the class name |
| * as it will be seen in the Ruby runtime, and alows you to include any modules. In the |
| * case of the RubyDataBag, within Ruby we just want it to be called DataBag, and we |
| * want it to be enumerable. |
| */ |
| @JRubyClass(name = "DataBag", include = "Enumerable") |
| public class RubyDataBag extends RubyObject implements Iterable<Tuple> { |
| private static final long serialVersionUID = 1L; |
| private static TupleFactory mTupleFactory = TupleFactory.getInstance(); |
| private static BagFactory mBagFactory = BagFactory.getInstance(); |
| |
| private DataBag internalDB; // The encapsulated bag object |
| |
| public DataBag getBag() { |
| return internalDB; |
| } |
| |
| /** |
| * This is an object allocator which is necessary for the define method. |
| * Given a runtime and a klass object, it instantiates the default object. |
| */ |
| private static final ObjectAllocator ALLOCATOR = new ObjectAllocator() { |
| public IRubyObject allocate(Ruby runtime, RubyClass klass) { |
| return new RubyDataBag(runtime, klass); |
| } |
| }; |
| |
| /** |
| * This method registers the class with the given runtime. It is not necessary to do this here, |
| * but it is simpler to associate the methods necessary to register the class with the class |
| * itself, so on the Library side it is possible to just specify "RubyDataBag.define(runtime)". |
| * |
| * @param runtime an instance of the Ruby runtime |
| * @return a RubyClass object with metadata about the registered class |
| */ |
| public static RubyClass define(Ruby runtime) { |
| // This generates the class object associated with DataBag, and registers it with the |
| // runtime. The RubyClass object has all the metadata associated with a Class itself. |
| RubyClass result = runtime.defineClass("DataBag", runtime.getObject(), ALLOCATOR); |
| |
| // This registers a method which can be used to know whether a module is an |
| // instance of the class. |
| result.kindOf = new RubyModule.KindOf() { |
| public boolean isKindOf(IRubyObject obj, RubyModule type) { |
| return obj instanceof RubyDataBag; |
| } |
| }; |
| |
| // This includes the Enumerable module that we specified. |
| result.includeModule(runtime.getEnumerable()); |
| |
| // This method actually reads the annotations we placed and registers |
| // all of the methods. |
| result.defineAnnotatedMethods(RubyDataBag.class); |
| |
| // This returns the RubyClass object with all the new metadata. |
| return result; |
| } |
| |
| /** |
| * This constructor encapsulated an empty bag. |
| * |
| * @param ruby an instance of the ruby runtime |
| * @param rc an instance of the class object with meatadata |
| */ |
| protected RubyDataBag(final Ruby ruby, RubyClass rc) { |
| super(ruby,rc); |
| internalDB = mBagFactory.newDefaultBag(); |
| } |
| |
| /** |
| * This constructor encapsulates the bag that is passed to it. Note: |
| * the resultant RubyDataBag will encapsulated that bag directly, not |
| * a copy. |
| * |
| * @param ruby an instance of the ruby runtime |
| * @param rc an instance of the class object with meatadata |
| * @param db a DataBag to encapsulate |
| */ |
| protected RubyDataBag(final Ruby ruby, RubyClass rc, DataBag db) { |
| super(ruby,rc); |
| internalDB = db; |
| } |
| |
| /** |
| * The initialize method is the method used on the Ruby side to construct |
| * the RubyDataBag object. The default is just an empty bag. |
| * |
| * @return the initialized RubyDataBag |
| */ |
| @JRubyMethod |
| @SuppressWarnings("deprecation") |
| public RubyDataBag initialize() { |
| internalDB = mBagFactory.newDefaultBag(); |
| return this; |
| } |
| |
| /** |
| * The initialize method can optionally receive a DataBag. In the case of |
| * a RubyDataBag, a RubyDataBag will be returned that directly encapsulates it. |
| * |
| * @param arg an IRubyObject that is a RubyDataBag to encapsulate |
| * @return the initialized RubyDataBag |
| */ |
| @JRubyMethod |
| public RubyDataBag initialize(IRubyObject arg) { |
| if (arg instanceof RubyDataBag) { |
| internalDB = ((RubyDataBag)arg).getBag(); |
| } else { |
| throw new IllegalArgumentException("Bag argument passed to DataBag initializer"); |
| } |
| return this; |
| } |
| |
| /** |
| * This method deletes all of the entries in the underlying DataBag. |
| */ |
| @JRubyMethod |
| public void clear() { |
| internalDB.clear(); |
| } |
| |
| /** |
| * This returns whether the encapsulated DatBag is distinct, per the distinct setting. |
| * |
| * @param context the context the method is being executed in |
| * @return true if it the encapsulated is distinct, false otherwise |
| */ |
| @JRubyMethod(name = {"distinct?", "is_distinct?"}) |
| public RubyBoolean isDistinct(ThreadContext context) { |
| return RubyBoolean.newBoolean(context.getRuntime(), internalDB.isDistinct()); |
| } |
| |
| /** |
| * This returns whether the encapsulated DatBag is distinct, per the sorted setting. |
| * |
| * @param context the context the method is being executed in |
| * @return true if it the encapsulated is sorted, false otherwise |
| */ |
| @JRubyMethod(name = {"sorted?", "is_sorted?"}) |
| public RubyBoolean isSorted(ThreadContext context) { |
| return RubyBoolean.newBoolean(context.getRuntime(), internalDB.isSorted()); |
| } |
| |
| /** |
| * This returns the size of the encapsulated DataBag. |
| * |
| * @param context the context the method is being executed in |
| * @return the size of the encapsulated DataBag |
| */ |
| @JRubyMethod(name={"size","length"}) |
| public RubyFixnum size(ThreadContext context) { |
| return RubyFixnum.newFixnum(context.getRuntime(), internalDB.size()); |
| } |
| |
| /** |
| * The add method accepts a varargs argument; each argument can be either a random |
| * object, a DataBag, or a RubyArray. In the case of a random object, that object |
| * will be converted to a Pig object and put into a Tuple. In the case of a |
| * RubyArray, it will be treated as a Tuple and added. In the case of a DataBag, |
| * it will iterate over the DataBag and add all of the elements to the element |
| * encapsulated by RubyDataBag. |
| * |
| * @param context the context the method is being executed in |
| * @param args varargs passed to add. Each argument can be a RubyDataBag, whose |
| contents will be copied; a RubyArray, which will be treated as a |
| Tuple, or another object which will be converted over per |
| {@link PigJrubyLibrary#rubyToPig}. |
| */ |
| @JRubyMethod(required = 1, rest = true) |
| public void add(ThreadContext context, IRubyObject[] args) throws ExecException { |
| for (IRubyObject arg : args) { |
| if (arg instanceof RubyDataBag) { |
| for (Tuple t : (RubyDataBag)arg) |
| internalDB.add(t); |
| } else if (arg instanceof RubyArray) { |
| internalDB.add(PigJrubyLibrary.rubyToPig((RubyArray)arg)); |
| } else { |
| internalDB.add(mTupleFactory.newTuple(PigJrubyLibrary.rubyToPig(arg))); |
| } |
| } |
| } |
| |
| /** |
| * This method returns a copy of the encapsulated DataBag. |
| * |
| * @param context the context the method is being executed in |
| * @return the copied RubyDataBag |
| */ |
| //TODO see if a deepcopy is necessary as well (and consider adding to DataBag and Tuple) |
| @JRubyMethod |
| public RubyDataBag clone(ThreadContext context) { |
| DataBag b = mBagFactory.newDefaultBag(); |
| for (Tuple t : this) |
| b.add(t); |
| Ruby runtime = context.getRuntime(); |
| return new RubyDataBag(runtime, runtime.getClass("DataBag"), b); |
| } |
| |
| /** |
| * This method returns whether or not the encapsulated DataBag is empty. |
| * |
| * @param context the context the method is being executed in |
| i @return true if the encapsulated DAtaBag is empty, false otherwise |
| */ |
| @JRubyMethod(name = "empty?") |
| public RubyBoolean isEmpty(ThreadContext context) { |
| return RubyBoolean.newBoolean(context.getRuntime(), internalDB.size() == 0); |
| } |
| |
| /** |
| * This method returns a string representation of the RubyDataBag. If given an optional |
| * argument, then if that argument is true, the contents of the bag will also be printed. |
| * |
| * @param context the context the method is being executed in |
| * @param args optional true/false argument passed to inspect |
| * @return string representation of the RubyDataBag |
| */ |
| @JRubyMethod(name = {"inspect", "to_s", "to_string"}, optional = 1) |
| public RubyString inspect(ThreadContext context, IRubyObject[] args) { |
| Ruby runtime = context.getRuntime(); |
| StringBuilder sb = new StringBuilder(); |
| sb.append("[DataBag: size: ").append(internalDB.size()); |
| if (args.length > 0 && args[0].isTrue()) |
| sb.append(" = ").append(internalDB.toString()); |
| sb.append("]"); |
| return RubyString.newString(runtime, sb); |
| } |
| |
| public Iterator<Tuple> iterator() { |
| return internalDB.iterator(); |
| } |
| |
| /** |
| * This is an implementation of the each method which opens up the Enumerable interface, |
| * and makes it very convenient to iterate over the elements of a DataBag. Note that currently, |
| * due to a deficiency in JRuby, it is not possible to call each without a block given. |
| * |
| * @param context the context the method is being executed in |
| * @param block a block to call on the elements of the bag |
| * @return enumerator object if null block given, nil otherwise |
| */ |
| @JRubyMethod |
| public IRubyObject each(ThreadContext context, Block block) throws ExecException{ |
| Ruby runtime = context.getRuntime(); |
| |
| if (!block.isGiven()) |
| return PigJrubyLibrary.enumeratorize(runtime, this, "each"); |
| /* In a future release of JRuby when enumeratorize is made public (which is planned), should replace the above with the below |
| if (!block.isGiven()) |
| return RubyEnumerator.enumeratorize(context.getRuntime(), this, "each"); |
| */ |
| |
| for (Tuple t : this) |
| block.yield(context, PigJrubyLibrary.pigToRuby(runtime, t)); |
| |
| return context.nil; |
| } |
| |
| //TODO let them specify which element will be returned, or if it will just iterate over each ie a true flatten |
| /** |
| * This is a convenience method which will run the given block on the first element |
| * of each tuple contained. |
| * |
| * @param context the context the method is being executed in |
| * @param block a block to call on the elements of the bag |
| * @return enumerator object if null block given, nil otherwise |
| */ |
| @JRubyMethod(name = {"flat_each", "flatten"}) |
| public IRubyObject flatten(ThreadContext context, Block block) throws ExecException { |
| Ruby runtime = context.getRuntime(); |
| |
| if (!block.isGiven()) |
| return PigJrubyLibrary.enumeratorize(runtime, this, "flatten"); |
| /* In a future release of JRuby when enumeratorize is made public (which is planned), should replace the above with the below |
| if (!block.isGiven()) |
| return RubyEnumerator.enumeratorize(context.getRuntime(), this, "flatten"); |
| */ |
| |
| for (Tuple t : this) |
| block.yield(context, PigJrubyLibrary.pigToRuby(runtime, t.get(0))); |
| |
| return context.nil; |
| } |
| } |