<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">

<document>
  <header>
    <title>Pig Latin Basics</title>
  </header>
  <body>

<!-- CONVENTIONS -->
<section>
<title>Conventions</title>
   <p>Conventions for the syntax and code examples in the Pig Latin Reference Manual are described here.</p>
   <table>
      <tr>
            <td>
               <p>Convention</p>
            </td>
            <td>
               <p>Description</p>
            </td>
            <td>
               <p>Example</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>( )</p>
            </td>
            <td>
               <p>Parentheses enclose one or more items.</p>
               <p>Parentheses are also used to indicate the tuple data type.</p>
            </td>
            <td>
               <p>Multiple items:</p>
               <p>(1, abc, (2,4,6) )</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>[ ]</p>
            </td>
            <td>
               <p>Straight brackets enclose one or more optional items.</p>
               <p>Straight brackets are also used to indicate the map data type. In this case &lt;&gt; is used to indicate optional items.</p>
            </td>
            <td>
               <p>Optional items:</p>
               <p>[INNER | OUTER]</p>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>{ }</p>
            </td>
            <td>
               <p>Curly brackets enclose two or more items, one of which is required. </p>
               <p>Curly brackets are also used to indicate the bag data type. In this case &lt;&gt; is used to indicate required items.</p>
            </td>
            <td>
               <p>Two items, one required:</p>
               <p>{ block | nested_block }</p>
              <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>…</p>
            </td>
            <td>
               <p>Horizontal ellipsis points indicate that you can repeat a portion of the code.</p>
            </td>
            <td>
               <p>Pig Latin syntax statement:</p>
               <p>cat path [path …]</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>UPPERCASE</p>
               <p></p>
               <p>lowercase</p>
            </td>
            <td>
               <p>In general, uppercase type indicates elements the system supplies.</p>
               <p>In general, lowercase type indicates elements that you supply.</p>
               <p>(These conventions are not strictly adhered to in all examples.)</p>
               <p>See <a href="#case-sensitivity">Case Sensitivity</a></p>
            </td>
            <td>
               <p>Pig Latin statement:</p>
               <p>a = LOAD 'data' AS (f1:int);</p>
               <p></p>
               <ul>
                  <li>
                     <p>LOAD, AS - Pig keywords</p>
                  </li>
                  <li>
                     <p>a, f1 - aliases you supply</p>
                  </li>
                  <li>
                     <p>'data' - data source you supply</p>
                  </li>
               </ul>
            </td>
         </tr>
   </table>
   </section>   


 <!-- KEYWORDS -->
   <section id="reserved-keywords">
   <title>Reserved Keywords</title>
   <p>Pig reserved keywords are listed here.</p>
   <table>
         <tr>
            <td> <p>-- A </p> </td>
            <td> <p>assert, and, any, all, arrange, as, asc, AVG</p> </td>
         </tr>      
      
         <tr>
            <td> <p>-- B </p> </td>
            <td> <p>bag, BinStorage, by, bytearray, BIGINTEGER, BIGDECIMAL</p> </td>
         </tr>   

         <tr>
            <td> <p>-- C </p> </td>
            <td> <p>cache, CASE, cat, cd, chararray, cogroup, CONCAT, copyFromLocal, copyToLocal, COUNT, cp, cross</p> </td>
         </tr>
         
         <tr>
            <td> <p>-- D </p> </td>
            <td> <p>datetime, %declare, %default, define, dense, desc, describe, DIFF, distinct, double, du, dump</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- E </p> </td>
            <td> <p>e, E, eval, exec, explain</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- F </p> </td>
            <td> <p>f, F, filter, flatten, float, foreach, full</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- G </p> </td>
            <td> <p>generate, group</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- H </p> </td>
            <td> <p>help</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- I </p> </td>
            <td> <p>if, illustrate, import, inner, input, int, into, is</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- J </p> </td>
            <td> <p>join</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- K </p> </td>
            <td> <p>kill</p> </td>
         </tr>   
         
         <tr>
            <td> <p>-- L </p> </td>
            <td> <p>l, L, left, limit, load, long, ls</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- M </p> </td>
            <td> <p>map, matches, MAX, MIN, mkdir, mv </p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- N </p> </td>
            <td> <p>not, null</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- O </p> </td>
            <td> <p>onschema, or, order, outer, output</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- P </p> </td>
            <td> <p>parallel, pig, PigDump, PigStorage, pwd</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- Q </p> </td>
            <td> <p>quit</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- R </p> </td>
            <td> <p>register, returns, right, rm, rmf, rollup, run</p> </td>
         </tr>  

         <tr>
            <td> <p>-- S </p> </td>
            <td> <p>sample, set, ship, SIZE, split, stderr, stdin, stdout, store, stream, SUM</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- T </p> </td>
            <td> <p>TextLoader, TOKENIZE, through, tuple</p> </td>
         </tr>  
         
         <tr>
            <td> <p>-- U </p> </td>
            <td> <p>union, using</p> </td>
         </tr>  

         <tr>
            <td> <p>-- V, W, X, Y, Z </p> </td>
            <td> <p>void</p> </td>
         </tr>  
            
   </table>
   </section>
   
   
 <!-- ++++++++++++++++++++++++++++++++++ --> 
 <!-- CASE SENSITIVITY -->
   <section id="case-sensitivity">
   <title>Case Sensitivity</title>
   <p>The names (aliases) of relations and fields are case sensitive. The names of Pig Latin functions are case sensitive. 
   The names of parameters (see <a href="cont.html#Parameter-Sub">Parameter Substitution</a>) and all other Pig Latin keywords (see <a href="#reserved-keywords">Reserved Keywords</a>) are case insensitive.</p>
   <p>In the example below, note the following:</p>
   <ul>
      <li>
         <p>The names (aliases) of relations A, B, and C are case sensitive.</p>
      </li>
      <li>
         <p>The names (aliases) of fields f1, f2, and f3 are case sensitive.</p>
      </li>
      <li>
         <p>Function names PigStorage and COUNT are case sensitive.</p>
      </li>
      <li>
         <p>Keywords LOAD, USING, AS, GROUP, BY, FOREACH, GENERATE, and DUMP are case insensitive. 
         They can also be written as load, using, as, group, by, etc.</p>
      </li>
      <li>
         <p>In the FOREACH statement, the field in relation B is referred to by positional notation ($0).</p>
      </li>
   </ul>
   <p/>

<source>
grunt> A = LOAD 'data' USING PigStorage() AS (f1:int, f2:int, f3:int);
grunt> B = GROUP A BY f1;
grunt> C = FOREACH B GENERATE COUNT ($0);
grunt> DUMP C;
</source>
</section>
  
 <!-- ++++++++++++++++++++++++++++++++++ -->   
<!-- DATA TYPES AND MORE-->
<section>
<title>Data Types and More</title>

<!-- IDENTIFIERS-->
<section id="identifiers">
<title>Identifiers</title>
<p>Identifiers include the names of relations (aliases), fields, variables, and so on. 
In Pig, identifiers start with a letter and can be followed by any number of letters, digits, or underscores.</p>

<p>Valid identifiers:</p>
<source>
A
A123
abc_123_BeX_
</source>
<p></p>
<p>Invalid identifiers: </p>
<source>
_A123
abc_$
A!B
</source>


</section>


<!-- RELATIONS, BAGS, TUPLES, FIELDS-->
   <section id="relations">
   <title>Relations, Bags, Tuples, Fields</title>
      <p><a href="start.html#pl-statements">Pig Latin statements</a> work with relations. A relation can be defined as follows:</p>
   <ul>
      <li>
         <p>A relation is a bag (more specifically, an outer bag).</p>
      </li>
      <li>
         <p>A bag is a collection of tuples. </p>
      </li>
      <li>
         <p>A tuple is an ordered set of fields.</p>
      </li>
      <li>
         <p>A field is a piece of data.</p>
      </li>
   </ul>
   <p></p>
   <p>A Pig relation is a bag of tuples. A Pig relation is similar to a table in a relational database, where the tuples in the bag correspond to the rows in a table. Unlike a relational table, however, Pig relations don't require that every tuple contain the same number of fields or that the fields in the same position (column) have the same type.</p>
   <p>Also note that relations are unordered which means there is no guarantee that tuples are processed in any particular order. Furthermore, processing may be parallelized in which case tuples are not processed according to any total ordering.</p>
   
   <section id="ref-relation">
   <title>Referencing Relations</title>
   <p>Relations are referred to by name (or alias). Names are assigned by you as part of the Pig Latin statement. In this example the name (alias) of the relation is A.</p>
   
   <source>
A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
DUMP A;
(John,18,4.0F)
(Mary,19,3.8F)
(Bill,20,3.9F)
(Joe,18,3.8F)
</source>

  <p>You can assign an alias to another alias. The new alias can be used in place of the original alias to refer to the original relation. </p>
  <source>
  A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
  B = A;
  DUMP B;
  </source>
</section>
   
   
   <!-- +++++++++++++++++++++++++++++++++++++++++++++++ -->
   <section id="ref-field">
   <title>Referencing Fields</title>
   <p>Fields are referred to by positional notation or by name (alias). </p>
   <ul>
      <li>
         <p>Positional notation is generated by the system. Positional notation is indicated with the dollar sign ($) and begins with zero (0); for example, $0, $1, $2. </p>
      </li>
      <li>
         <p>Names are assigned by you using schemas (or, in the case of the GROUP operator and some functions, by the system). You can use any name that is not a Pig keyword (see <a href="#identifiers">Identifiers</a> for valid name examples).</p>
      </li>
   </ul>
   <p>Given relation A above, the three fields are separated out in this table. </p>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>First Field</p>
            </td>
            <td>
               <p>Second Field</p>
            </td>
            <td>
               <p>Third Field </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Data type</p>
            </td>
            <td>
               <p>chararray</p>
            </td>
            <td>
               <p>int</p>
            </td>
            <td>
               <p>float</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Positional notation (generated by system) </p>
            </td>
            <td>
               <p>$0</p>
            </td>
            <td>
               <p>$1</p>
            </td>
            <td>
               <p>$2</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Possible name (assigned by you using a schema)</p>
            </td>
            <td>
               <p>name</p>
            </td>
            <td>
               <p>age</p>
            </td>
            <td>
               <p>gpa</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Field value (for the first tuple)</p>
            </td>
            <td>
               <p>John</p>
            </td>
            <td>
               <p>18</p>
            </td>
            <td>
               <p>4.0</p>
            </td>
         </tr>
   </table>
   
   <p>As shown in this example, when you assign names to fields (using the AS schema clause), you can still refer to the fields using positional notation. However, for debugging purposes and ease of comprehension, it is better to use field names.</p>
<source>
A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
X = FOREACH A GENERATE name,$2;
DUMP X;
(John,4.0F)
(Mary,3.8F)
(Bill,3.9F)
(Joe,3.8F)
</source>   
   
   <p>In this example an error is generated because the requested column ($3) is outside of the declared schema (positional notation begins with $0). Note that the error is caught before the statements are executed.</p>
   
   <source>
A = LOAD 'data' AS (f1:int,f2:int,f3:int);
B = FOREACH A GENERATE $3;
DUMP B;
2009-01-21 23:03:46,715 [main] ERROR org.apache.pig.tools.grunt.GruntParser - java.io.IOException: 
Out of bound access. Trying to access non-existent  : 3. Schema {f1: bytearray,f2: bytearray,f3: bytearray} has 3 column(s). 
<em>etc ... </em></source>
</section>
   
   
   <!-- +++++++++++++++++++++++++++++++++++++++++++++++ -->
   <section id="ref-field-complex">
   <title>Referencing Fields that are Complex Data Types</title>
   <p>As noted, the fields in a tuple can be any data type, including the complex data types: bags, tuples, and maps. </p>
   <ul>
      <li>
         <p>Use the schemas for complex data types to name fields that are complex data types. </p>
      </li>
      <li>
         <p>Use the dereference operators to reference and work with fields that are complex data types.</p>
      </li>
   </ul>
   <p>In this example the data file contains tuples. A schema for complex data types (in this case, tuples) is used to load the data. Then, dereference operators (the dot in t1.t1a and t2.$0) are used to access the fields in the tuples. Note that when you assign names to fields you can still refer to these fields using positional notation.</p>

   
   <source>
cat data;
(3,8,9) (4,5,6)
(1,4,7) (3,7,5)
(2,5,8) (9,5,8)

A = LOAD 'data' AS (t1:tuple(t1a:int, t1b:int,t1c:int),t2:tuple(t2a:int,t2b:int,t2c:int));

DUMP A;
((3,8,9),(4,5,6))
((1,4,7),(3,7,5))
((2,5,8),(9,5,8))

X = FOREACH A GENERATE t1.t1a,t2.$0;

DUMP X;
(3,4)
(1,3)
(2,9)
</source>
</section>
</section>   

<!-- ++++++++++++++++++++++++++++++++++ --> 
<section id="data-types">
<title>Data Types</title>

<section>
<title>Simple and Complex</title>
<p></p>

   <table>
      <tr>
            <td>
               <p><strong>Simple Types</strong></p>
            </td>
            <td>
               <p>Description</p>
            </td>
            <td>
               <p>Example </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int</p>
            </td>
            <td>
               <p>Signed 32-bit integer</p>
            </td>
            <td>
               <p>10</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long</p>
            </td>
            <td>
               <p>Signed 64-bit integer</p>
            </td>
            <td>
               <p>Data:     10L or 10l </p>
               <p>Display: 10L </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float</p>
            </td>
            <td>
               <p>32-bit floating point</p>
            </td>
            <td>
               <p>Data:     10.5F or 10.5f or 10.5e2f or 10.5E2F</p>
               <p>Display: 10.5F or 1050.0F</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double</p>
            </td>
            <td>
               <p>64-bit floating point</p>
            </td>
            <td>
               <p>Data:     10.5 or 10.5e2 or 10.5E2</p>
               <p>Display: 10.5 or 1050.0</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray</p>
            </td>
            <td>
               <p>Character array (string) in Unicode UTF-8 format</p>
            </td>
            <td>
               <p>hello world</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray</p>
            </td>
            <td>
               <p>Byte array (blob)</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
                  <tr>
            <td>
               <p>boolean</p>
            </td>
            <td>
               <p>boolean</p>
            </td>
            <td>
               <p>true/false (case insensitive)</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>datetime</p>
            </td>
            <td>
               <p>datetime</p>
            </td>
            <td>
               <p>1970-01-01T00:00:00.000+00:00</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>biginteger</p>
            </td>
            <td>
               <p>Java BigInteger</p>
            </td>
            <td>
               <p>200000000000</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bigdecimal</p>
            </td>
            <td>
               <p>Java BigDecimal</p>
            </td>
            <td>
               <p>33.456783321323441233442</p>
            </td>
         </tr>
         <tr>
            <td>
               <p><strong>Complex Types</strong></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple</p>
            </td>
            <td>
               <p>An ordered set of fields.</p>
            </td>
            <td>
               <p>(19,2)</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag</p>
            </td>
            <td>
               <p>A collection of tuples.</p>
            </td>
            <td>
               <p>{(19,2), (18,1)}</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map</p>
            </td>
            <td>
               <p>A set of key value pairs.</p>
            </td>
            <td>
               <p>[open#apache]</p>
            </td>
         </tr>
   </table>
   
   <p>Note the following general observations about data types:</p>
   <ul>
      <li>
         <p>Use schemas to assign types to fields.  If you don't assign types, fields default to type bytearray and implicit conversions are applied to the data depending on the context in which that data is used. For example, in relation B, f1 is converted to integer because 5 is integer. In relation C, f1 and f2 are converted to double because we don't know the type of either f1 or f2.</p>
      <source>
A = LOAD 'data' AS (f1,f2,f3);
B = FOREACH A GENERATE f1 + 5;
C = FOREACH A generate f1 + f2;
</source>
      </li>
   </ul>

   <ul>
      <li>
         <p>If a schema is defined as part of a load statement, the load function will attempt to enforce the schema. If the data does not conform to the schema, the loader will generate a null value or an error.</p>
      <source>
A = LOAD 'data' AS (name:chararray, age:int, gpa:float);
</source>
      </li>
   </ul>
   <p></p>
   <ul>
      <li>
         <p>If an explicit cast is not supported, an error will occur. For example, you cannot cast a chararray to int.</p>
         <source>
A = LOAD 'data' AS (name:chararray, age:int, gpa:float);
B = FOREACH A GENERATE (int)name;

This will cause an error …</source>
      </li>
   </ul>

   <p></p>
   <ul>
      <li>
         <p>If Pig cannot resolve incompatible types through implicit casts, an error will occur. For example, you cannot add chararray and float (see the <a href="#types-table-add">Types Table for addition and subtraction</a>).</p>
      <source>
A = LOAD 'data' AS (name:chararray, age:int, gpa:float);
B = FOREACH A GENERATE name + gpa;

This will cause an error …</source>
      </li>
   </ul>
   <p></p>
   <p>All data types have corresponding <a href="#schemas">schemas</a>.</p>
   </section>
   
   <section id ="tuple">
   <title>Tuple</title>
   <p>A tuple is an ordered set of fields.</p>
   
   <section>
   <title>Syntax </title>
   <table>
        <tr>
            <td>
               <p>( field [, field …] )  </p>
            </td>
        </tr>
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>(  )</p>
            </td>
            <td>
               <p>A tuple is enclosed in parentheses ( ).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>field</p>
            </td>
            <td>
               <p>A piece of data. A field can be any data type (including tuple and bag).</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p>You can think of a tuple as a row with one or more fields, where each field can be any data type and any field may or may not have data. If a field has no data, then the following happens:</p>
   <ul>
      <li>
         <p>In a load statement, the loader will inject null into the tuple. The actual value that is substituted for null is loader specific; for example, PigStorage substitutes an empty field for null.</p>
      </li>
      <li>
         <p>In a non-load statement, if a requested field is missing from a tuple, Pig will inject null.</p>
      </li>
   </ul>
   <p></p>
   <p>Also see <a href="#tuple-schema">tuple schemas</a>.</p>
   </section>
   
   <section>
   <title>Example</title>
   <p>In this example the tuple contains three fields.</p>
   <source>(John,18,4.0F)</source>
   </section></section>
   
   <section id="bag">
   <title>Bag</title>
   <p>A bag is a collection of tuples.</p>
   
   <section>
   <title>Syntax: Inner bag</title>
   <table>
      <tr>
            <td>
               <p>{ tuple [, tuple …] }</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>{  }</p>
            </td>
            <td>
               <p>An inner bag is enclosed in curly brackets { }.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple</p>
            </td>
            <td>
               <p>A tuple.</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Usage </title>
   <p>Note the following about bags:</p>
   <ul>
      <li>
         <p>A bag can have duplicate tuples.</p>
      </li>
      <li>
         <p>A bag can have tuples with differing numbers of fields. However, if Pig tries to access a field that does not exist, a null value is substituted.</p>
      </li>
      <li>
         <p>A bag can have tuples with fields that have different data types. However, for Pig to effectively process bags, the schemas of the tuples within those bags should be the same. For example, if half of the tuples include chararray fields while the other half include float fields, only half of the tuples will participate in any kind of computation because the chararray fields will be converted to null.</p>
         <p></p>
         <p>Bags have two forms: outer bag (or relation) and inner bag.</p>
      </li>
   </ul>
   <p></p>
   <p>Also see <a href="#bag-schema">bag schemas</a>.</p>
   </section>
   
   <section>
   <title>Example: Outer Bag</title>
   <p>In this example A is a relation or bag of tuples. You can think of this bag as an outer bag.</p>
<source>
A = LOAD 'data' as (f1:int, f2:int, f3:int);
DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
</source>
   </section>
   
   <section>
   <title>Example: Inner Bag</title>
   <p>Now, suppose we group relation A by the first field to form relation X. </p>
   <p>In this example X is a relation or bag of tuples. The tuples in relation X have two fields. The first field is type int. The second field is type bag; you can think of this bag as an inner bag.</p>
<source>
X = GROUP A BY f1;
DUMP X;
(1,{(1,2,3)})
(4,{(4,2,1),(4,3,3)})
(8,{(8,3,4)})
</source>
   </section>
   </section>
   
   <section id="map">
   <title>Map</title>
   <p>A map is a set of key/value pairs.</p>
   
   <section>
   <title>Syntax (&lt;&gt; denotes optional)</title>
   <table>
      <tr>
            <td>
               <p>[ key#value &lt;, key#value …&gt; ]</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>[ ]</p>
            </td>
            <td>
               <p>Maps are enclosed in straight brackets [ ].</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>#</p>
            </td>
            <td>
               <p>The key and value in each pair are separated by the pound sign #.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>key</p>
            </td>
            <td>
               <p>Must be chararray data type. Must be a unique value.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>value</p>
            </td>
            <td>
               <p>Any data type (the default is bytearray).</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p>Key values within a relation must be unique.</p>

   <p>Also see <a href="#map-schema">map schemas</a>.</p>
   </section>
   
   <section>
   <title>Example</title>
   <p>In this example the map includes two key value pairs.</p>
<source>[name#John,phone#5551212]</source>
</section></section></section> 
   
   <!-- ++++++++++++++++++++++++++++++++++ --> 
   <section id="nulls">
   <title>Nulls and Pig Latin</title>
   <p>In Pig Latin, nulls are implemented using the SQL definition of null as unknown or non-existent. Nulls can occur naturally in data or can be the result of an operation. </p>
   <section id="nulls-ops">
   <title>Nulls, Operators, and Functions</title>
   <p>Pig Latin operators and functions interact with nulls as shown in this table.</p>
   <table>
      <tr>
            <td>
               <p>Operator </p>
            </td>
            <td>
               <p>Interaction </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Comparison operators:</p>
               <p>==, !=</p>
               <p>&gt;, &lt;</p>
               <p>&gt;=, &lt;=</p>
            </td>
            <td>
               <p>If either subexpression is null, the result is null.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Comparison operator:</p>
               <p>matches </p>
            </td>
            <td>
               <p>If either the string being matched against or the string defining the match is null, the result is null.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Arithmetic operators:</p>
               <p> + , -, *, /</p>
               <p>% modulo</p>
               <p>? : bincond</p>
               <p>CASE : case</p>
            </td>
            <td>
               <p>If either subexpression is null, the resulting expression is null.</p>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Null operator:</p>
               <p>is null </p>
            </td>
            <td>
               <p>If the tested value is null, returns true; otherwise, returns false (see  <a href="#null_operators">Null Operators</a>).</p>
              
            </td>
         </tr>
         <tr>
            <td>
               <p>Null operator:</p>
               <p>is not null</p>
            </td>
            <td>
               <p>If the tested value is not null, returns true; otherwise, returns false (see  <a href="#null_operators">Null Operators</a>).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Dereference operators:</p>
               <p>tuple (.) or map (#)</p>
            </td>
            <td>
               <p>If the de-referenced tuple or map is null, returns null.</p>
            </td>
         </tr>
                           <tr>
            <td>
               <p>Operators:</p>
               <p>COGROUP, GROUP, JOIN</p>
            </td>
            <td>
               <p>These operators handle nulls differently (see examples below).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Function:</p>
               <p>COUNT_STAR</p>
            </td>
            <td>
               <p>This function counts all values, including nulls.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Cast operator</p>
            </td>
            <td>
               <p>Casting a null from one type to another type results in a null.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Functions:</p>
               <p>AVG, MIN, MAX, SUM, COUNT</p>
            </td>
            <td>
               <p>These functions ignore nulls. </p>
            </td>
         </tr>

         <tr>
            <td>
               <p>Function:</p>
               <p>CONCAT</p>
            </td>
            <td>
               <p>If either subexpression is null, the resulting expression is null.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>Function:</p>
               <p>SIZE</p>
            </td>
            <td>
               <p>If the tested object is null, returns null.</p>
            </td>
         </tr>
   </table>
   
   <p>For Boolean subexpressions, note the results when nulls are used with these operators:</p>
   <ul>
      <li>
         <p>FILTER operator – If a filter expression results in a null value, the filter does not pass it through (if X is null, !X is also null, and the filter will reject both).</p>
      </li>
      <li>
      <p>Bincond operator – If a Boolean subexpression results in a null value, the resulting expression is null (see the interactions above for Arithmetic operators).</p>
      </li>
   </ul>
   </section>
   
   
   <!-- ++++++++++++++++++++++++++++++++++ --> 
   <section id="nulls-constants">
   <title>Nulls and Constants</title>
   <p>Nulls can be used as constant expressions in place of expressions of any type.</p>
   <p>In this example a and null are projected.</p>
<source>
A = LOAD 'data' AS (a, b, c);
B = FOREACH A GENERATE a, null;
</source>
  
   <p>In this example of an outer join, if the join key is missing from a table it is replaced by null.</p>
<source>
A = LOAD 'student' AS (name: chararray, age: int, gpa: float);
B = LOAD 'votertab10k' AS (name: chararray, age: int, registration: chararray, donation: float);
C = COGROUP A BY name, B BY name;
D = FOREACH C GENERATE FLATTEN((IsEmpty(A) ? null : A)), FLATTEN((IsEmpty(B) ? null : B));
</source>
   
   <p>Like any other expression, null constants can be implicitly or explicitly cast. </p>
   <p>In this example both a and null will be implicitly cast to double.</p>
<source>
A = LOAD 'data' AS (a, b, c);
B = FOREACH A GENERATE a + null;
</source>
   
   <p>In this example  both a and null will be cast to int, a implicitly, and null explicitly.</p>
<source>
A = LOAD 'data' AS (a, b, c);
B = FOREACH A GENERATE a + (int)null;
</source>
   </section>
   
      <!-- ++++++++++++++++++++++++++++++++++ -->  
   <section id="nulls-ops-produce">
   <title>Operations That Produce Nulls</title>
   <p>As noted, nulls can be the result of an operation. These operations can produce null values: </p>
   <ul>
      <li>
         <p>Division by zero</p>
      </li>
      <li>
         <p>Returns from user defined functions (UDFs) </p>
      </li>
      <li>
         <p>Dereferencing a field that does not exist.</p>
      </li>
      <li>
         <p>Dereferencing a key that does not exist in a map. For example, given a map, info, containing [name#john, phone#5551212] if a user tries to use info#address a null is returned.</p>
      </li>
      <li>
         <p>Accessing a field that does not exist in a tuple.</p>
      </li>
   </ul>
   
   <section>
   <title>Example: Accessing a field that does not exist in a tuple</title>
   <p>In this example nulls are injected if fields do not have data.</p>
<source>
cat data;
    2   3
4   
7   8   9

A = LOAD 'data' AS (f1:int,f2:int,f3:int);

DUMP A;
(,2,3)
(4,,)
(7,8,9)

B = FOREACH A GENERATE f1,f2;

DUMP B;
(,2)
(4,)
(7,8)
</source>
   
   </section></section>
   
   
      <!-- ++++++++++++++++++++++++++++++++++ -->  
   <section id="nulls-load">
   <title>Nulls and Load Functions</title>
   <p>As noted, nulls can occur naturally in the data. If nulls are part of the data, it is the responsibility of the load function to handle them correctly. Keep in mind that what is considered a null value is loader-specific; however, the load function should always communicate null values to Pig by producing Java nulls.</p>
   <p>The Pig Latin load functions (for example, PigStorage and TextLoader) produce null values wherever data is missing. For example, empty strings (chararrays) are not loaded; instead, they are replaced by nulls.</p>
   
   <p>PigStorage is the default load function for the LOAD operator. In this example the is not null operator is used to filter names with null values.</p>

 <source>
A = LOAD 'student' AS (name, age, gpa); 
B = FILTER A BY name is not null;
</source>  
   </section>
   
   <section id="nulls_group">
   <title>Nulls and GROUP/COGROUP Operators</title>
   <p>When using the GROUP operator with a single relation, records with a null group key are grouped together.</p>
   <source>
A = load 'student' as (name:chararray, age:int, gpa:float);
dump A;
(joe,18,2.5)
(sam,,3.0)
(bob,,3.5)

X = group A by age;
dump X;
(18,{(joe,18,2.5)})
(,{(sam,,3.0),(bob,,3.5)})
   </source>
   
<p>When using the GROUP (COGROUP) operator with multiple relations, records with a null group key from different relations are considered different and are grouped separately. In the example below note that there are two tuples in the output corresponding to the null group key: one that contains tuples from relation A (but not relation B) and one that contains tuples from relation B (but not relation A).</p>
   
<source>
A = load 'student' as (name:chararray, age:int, gpa:float);
B = load 'student' as (name:chararray, age:int, gpa:float);
dump B;
(joe,18,2.5)
(sam,,3.0)
(bob,,3.5)

X = cogroup A by age, B by age;
dump X;
(18,{(joe,18,2.5)},{(joe,18,2.5)})
(,{(sam,,3.0),(bob,,3.5)},{})
(,{},{(sam,,3.0),(bob,,3.5)})
</source>
   </section>
   
      <!-- ++++++++++++++++++++++++++++++++++ -->  
   <section id="nulls_join">
   <title>Nulls and JOIN Operator</title>
   <p>The JOIN operator - when performing inner joins - adheres to the SQL standard and disregards (filters out) null values. 
   (See also <a href="perf.html#nulls">Drop Nulls Before a Join</a>.)</p>
<source>
A = load 'student' as (name:chararray, age:int, gpa:float);
B = load 'student' as (name:chararray, age:int, gpa:float);
dump B;
(joe,18,2.5)
(sam,,3.0)
(bob,,3.5)
  
X = join A by age, B by age;
dump X;
(joe,18,2.5,joe,18,2.5)
</source>
   </section>
   <section id="nulls_flatten">
   <title>Nulls and FLATTEN Operator</title>
   <p>The FLATTEN operator handles null value differently based on its schema.</p>
   <p>For null tuples, FLATTEN(null) produces multiple nulls based on the number of elements in the schema for that field.
   If the tuple has no schema, FLATTEN(null) simply returns a single null. </p>
   <p>For null bags, we would have liked to discard the row just like we do with flatten of an empty bag.
   However, it was too late by the time we noticed this inconsistency.
   In order to preserve backward compatibility, FLATTEN(null) for a bag produces multiple nulls
   based on the number of elements defined for the schema of this bag.
   If no schema, a single null is returned. </p>
   <p>For bags containing some null tuples, it follows the same rule as flatten of null tuples described above. </p>
   <p>For null maps, FLATTEN(null) produces 2 nulls to represent the key and the value.</p>
   <p>For null with other types, FLATTEN(null) simply returns a single null.</p>
   </section>
   
   </section>
  
  
   <!-- ++++++++++++++++++++++++++++++++++ -->  
   <section id="constants">
   <title>Constants</title>
   <p>Pig provides constant representations for all data types except bytearrays.</p>
   <table>
      <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p><strong>Constant Example</strong></p>
            </td>
            <td>
               <p><strong>Notes</strong></p>
            </td>
         </tr>
         <tr>
            <td>
               <p><strong>Simple Data Types</strong></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int</p>
            </td>
            <td>
               <p>19</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long</p>
            </td>
            <td>
               <p>19L</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float</p>
            </td>
            <td>
               <p>19.2F or 1.92e2f</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double</p>
            </td>
            <td>
               <p>19.2 or 1.92e2</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray</p>
            </td>
            <td>
               <p>'hello world'</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray</p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>Not applicable.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>boolean</p>
            </td>
            <td>
               <p>true/false</p>
            </td>
            <td>
               <p>Case insensitive.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>biginteger</p>
            </td>
            <td>
               <p>19211921192119211921BI</p>
            </td>
            <td>
            </td>
         </tr>
         <tr>
            <td>
               <p>bigdecimal</p>
            </td>
            <td>
               <p>192119211921.192119211921BD</p>
            </td>
            <td>
            </td>
         </tr>
         <tr>
            <td>
               <p><strong>Complex Data Types</strong></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple</p>
            </td>
            <td>
               <p>(19, 2, 1)</p>
            </td>
            <td>
               <p>A constant in this form creates a tuple.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag</p>
            </td>
            <td>
               <p>{ (19, 2), (1, 2) }</p>
            </td>
            <td>
               <p>A constant in this form creates a bag.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map</p>
            </td>
            <td>
               <p>[ 'name' # 'John', 'ext' # 5555 ]</p>
            </td>
            <td>
               <p>A constant in this form creates a map.</p>
            </td>
         </tr>
   </table>
   <p></p>
   <p>Please note the following:</p>
   <ul>
      <li>
         <p>On UTF-8 systems you can specify string constants consisting of printable ASCII characters such as 'abc'; you can specify control characters such as '\t'; and, you can specify a character in Unicode by starting it with '\u', for instance, '\u0001' represents Ctrl-A in hexadecimal (see Wikipedia <a href="http://en.wikipedia.org/wiki/ASCII">ASCII</a>, <a href="http://en.wikipedia.org/wiki/Unicode">Unicode</a>, and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>). In theory, you should be able to specify non-UTF-8 constants on non-UTF-8 systems but as far as we know this has not been tested.</p>
      </li>
      <li>
         <p>To specify a long constant, l or L must be appended to the number (for example, 12345678L). If the l or L is not specified, but the number is too large to fit into an int, the problem will be detected at parse time and the processing is terminated. </p>
      </li>
      <li>
         <p>Any numeric constant with decimal point (for example, 1.5) and/or exponent (for example, 5e+1) is treated as double unless it ends with the following characters:</p>
         <ul>
            <li>
               <p>f or F in which case it is assigned type float (for example,  1.5f)</p>
            </li>
            <li>
               <p>BD or bd in which case it is assigned type BigDecimal (for example,  12345678.12345678BD)</p>
            </li>
         </ul>
      </li>
      <li>
         <p>BigIntegers can be specified by supplying BI or bi at the end of the number (for example, 123456789123456BI)</p>
      </li>
      <li>
         <p>There is no native constant type for datetime field. You can use a ToDate udf with chararray constant as argument to generate a datetime value. </p>
      </li>

   </ul>
   <p></p>
   <p>The data type definitions for tuples, bags, and maps apply to constants:</p>
   <ul>
      <li>
         <p>A tuple can contain fields of any data type</p>
      </li>
      <li>
         <p>A bag is a collection of tuples</p>
      </li>
      <li>
         <p>A map key must be a chararray; a map value can be any data type</p>
      </li>
   </ul>
   <p></p>
   <p>Complex constants (either with or without values) can be used in the same places scalar constants can be used; that is, in FILTER and GENERATE statements.</p>

<source>
A = LOAD 'data' USING MyStorage() AS (T: tuple(name:chararray, age: int));
B = FILTER A BY T == ('john', 25);
D = FOREACH B GENERATE T.name, [25#5.6], {(1, 5, 18)};
</source>
   </section>
   
   
    <!-- ++++++++++++++++++++++++++++++++++ --> 
   <section id="expressions">
   <title>Expressions</title>
   <p>In Pig Latin, expressions are language constructs used with the FILTER, FOREACH, GROUP, and SPLIT operators as well as the eval functions.</p>
   <p>Expressions are written in conventional mathematical infix notation and are adapted to the UTF-8 character set. Depending on the context, expressions can include:</p>
   <ul>
      <li>
         <p>Any Pig data type (simple data types, complex data types)</p>
      </li>
      <li>
         <p>Any Pig operator (arithmetic, comparison, null, boolean, dereference, sign, and cast)</p>
      </li>
      <li>
         <p>Any Pig built in function.</p>
      </li>
      <li>
         <p>Any user defined function (UDF) written in Java. </p>
       </li>
        </ul>
        <p></p>
       <p>In Pig Latin,</p>
        <ul>
       <li>
         <p>An arithmetic expression could look like this:</p>
         <source>
X = GROUP A BY f2*f3;
</source>
      </li>

      <li>
         <p></p>
         <p>A string expression could look like this, where a and b are both chararrays:</p>
         <source>
X = FOREACH A GENERATE CONCAT(a,b);
</source>
      </li>

      <li>
         <p></p>
         <p>A boolean expression could look like this:</p>
         <source>
X = FILTER A BY (f1==8) OR (NOT (f2+f3 &gt; f1));
</source>
      </li>
   </ul>

   <!-- ++++++++++++++++++++++++++++++++++ --> 
      <section id="fexp">
          <title>Field Expressions</title>
          <p>Field expressions represent a field or a <a href="#deref">dereference operator</a> applied to a field.</p>
      </section>

   <!-- ++++++++++++++++++++++++++++++++++ --> 
      <section id="sexp">
          <title>Star Expressions</title>
          <p>Star expressions ( * ) can be used to represent all the fields of a tuple. It is equivalent to writing out the fields explicitly. In the following example the definitions of B and C are exactly the same, and MyUDF will be invoked with exactly the same arguments in both cases.</p>
          <source>
A = LOAD 'data' USING MyStorage() AS (name:chararray, age: int);
B = FOREACH A GENERATE *, MyUDF(name, age);
C = FOREACH A GENERATE name, age, MyUDF(*);
          </source>
          <p>A common error when using the star expression is shown below. In this example, the programmer really wants to count the number of elements in the bag in the second field: COUNT($1).</p>
          <source>
G = GROUP A BY $0;
C = FOREACH G GENERATE COUNT(*);
          </source>
        
<p>There are some restrictions on use of the star expression when the input schema is unknown (null):</p>
<ul>
<li>For GROUP/COGROUP, you can't include a star expression in a GROUP BY column. </li>
<li>For ORDER BY, if you have project-star as ORDER BY column, you can’t have any other ORDER BY column in that statement. </li>
</ul>
      </section>

   <!-- ++++++++++++++++++++++++++++++++++ --> 
<section id="prexp">
<title>Project-Range Expressions</title>
<p>Project-range ( .. ) expressions can be used to project a range of columns from input. For example:</p>
<ul>
<li>.. $x : projects columns $0 through $x, inclusive </li>
<li>$x .. : projects columns $x through end, inclusive </li>
<li>$x .. $y : projects columns $x through $y, inclusive </li>
</ul>
<p></p>

<p>If the input relation has a schema, you can refer to columns by alias rather than by column position. You can also combine aliases and column positions in an expression; for example, "col1 .. $5" is valid. </p>

<p>Project-range can be used in all cases where the <a href="#sexp">star expression</a> ( * ) is allowed.</p>

<p>Project-range can be used in the following statements:
<a href="#foreach">FOREACH</a>,
<a href="#join-inner">JOIN</a>,
<a href="#group">GROUP</a>,
<a href="#cogroup">COGROUP</a>, and
<a href="#order-by">ORDER BY</a> (also when ORDER BY is used within a nested FOREACH block).</p>

<p>A few examples are shown here:</p>
<source>
..... 
grunt> F = foreach IN generate (int)col0, col1 .. col3; 
grunt> describe F; 
F: {col0: int,col1: bytearray,col2: bytearray,col3: bytearray} 
..... 
..... 
grunt> SORT = order IN by col2 .. col3, col0, col4 ..; 
..... 
..... 
J = join IN1 by $0 .. $3, IN2 by $0 .. $3; 
..... 
..... 
g = group l1 by b .. c; 
..... 
</source>

<p>There are some restrictions on the use of the project-to-end form of project-range (e.g., "x .. ") when the input schema is unknown (null): </p>
<ul>
<li>For GROUP/COGROUP, the project-to-end form of project-range is not allowed.</li>
<li>For ORDER BY, the project-to-end form of project-range is supported only as the last sort column.
<source>
..... 
grunt> describe IN; 
Schema for IN unknown. 

/* This statement is supported */
SORT = order IN by $2 .. $3, $6 ..; 

/* This statement is NOT supported */ 
SORT = order IN by $2 .., $6; 
..... 
</source>


</li>
</ul>
</section>
      
      <!-- ++++++++++++++++++++++++++++++++++ -->    
      <section id="bexp">
          <title>Boolean Expressions</title>
          <p>Boolean expressions can be made up of UDFs that return a boolean value or boolean operators 
          (see <a href="#boolops">Boolean Operators</a>). 
          </p>
      </section>

   <!-- ++++++++++++++++++++++++++++++++++ -->            
      <section id="texp">
          <title>Tuple Expressions</title>
          <p>Tuple expressions form subexpressions into tuples. The tuple expression has the form (expression [, expression …]), where expression is a general expression. The simplest tuple expression is the star expression, which represents all fields.
          </p>
      </section>

   <!-- ++++++++++++++++++++++++++++++++++ --> 
    <section id="gexp">
          <title>General Expressions</title>
          <p>General expressions can be made up of UDFs and almost any operator. Since Pig does not consider boolean a base type, the result of a general expression cannot be a boolean. Field expressions are the simplest general expressions.
          </p>
      </section>
   </section>
   
   
    <!-- ================================================== --> 
   <section id="schemas">
   <title>Schemas</title>

   <p>Schemas enable you to assign names to fields and declare types for fields. Schemas are optional but we encourage you to use them whenever possible; type declarations result in better parse-time error checking and more efficient code execution.</p>  
   
   <p>Schemas for <a href="#schema-simple">simple types</a> and <a href="#schema-complex">complex types</a> can be used anywhere a schema definition is appropriate.</p>   
   
   <p>Schemas are defined with the <a href="#load">LOAD</a>, <a href="#stream">STREAM</a>, and <a href="#foreach">FOREACH</a> operators using the AS clause. If you define a schema using the LOAD operator, then it is the load function that enforces the schema
   (see <a href="#load">LOAD</a> and <a href="udf.html">User Defined Functions</a> for more information).</p>

   <p></p>
   <p><strong>Known Schema Handling</strong></p>
   <p>Note the following:</p>
   <ul>
      <li>You can define a schema that includes both the field name and field type.</li>
      <li>You can define a schema that includes the field name only; in this case, the field type defaults to bytearray.</li>
      <li>You can choose not to define a schema; in this case, the field is un-named and the field type defaults to bytearray.</li>
   </ul>
   <p>If you assign a name to a field, you can refer to that field using the name or by positional notation. If you don't assign a name to a field (the field is un-named) you can only refer to the field using positional notation.</p>
   <p>If you assign a type to a field, you can subsequently change the type using the cast operators. If you don't assign a type to a field, the field defaults to bytearray; you can change the default type using the cast operators.</p>
      <p></p>
   <p id="unknown-schema"><strong>Unknown Schema Handling</strong></p>
      <p>Note the following:</p>
   <ul>
      <li>When you JOIN/COGROUP/CROSS multiple relations, if any relation has an unknown schema (or no defined schema, also referred to as a null schema), the schema for the resulting relation is null. </li>
      <li>If you FLATTEN a bag with empty inner schema, the schema for the resulting relation is null.</li>
      <li>If you UNION two relations with incompatible schema, the schema for resulting relation is null.</li>
      <li>If the schema is null, Pig treats all fields as bytearray (in the backend, Pig will determine the real type for the fields dynamically) </li>
    </ul>      
    <p>See the examples below. If a field's data type is not specified, Pig will use bytearray to denote an unknown type. If the number of fields is not known, Pig will derive an unknown schema.</p>
    
 <source>
/* The field data types are not specified ... */
a = load '1.txt' as (a0, b0);
a: {a0: bytearray,b0: bytearray}

/* The number of fields is not known ... */
a = load '1.txt';
a: Schema for a unknown
</source>

   <p></p>
   <p><strong>How Pig Handles Schema</strong></p>
   
   <p>As shown above, with a few exceptions Pig can infer the schema of a relation up front. You can examine the schema of a particular relation using <a href="test.html#describe">DESCRIBE</a>. Pig enforces this computed schema during the actual execution by casting the input data to the expected data type. If the process is successful the results are returned to the user; otherwise, a warning is generated for each record that failed to convert.  Note that Pig does not know the actual types of the fields in the input data prior to the execution; rather, Pig determines the data types and performs the right conversions on the fly.</p>
  
<p>Having a deterministic schema is very powerful; however, sometimes it comes at the cost of performance. Consider the following example:</p>  
  
<source>
A = load 'input' as (x, y, z);
B = foreach A generate x+y;
</source>

 <p>If you do <a href="test.html#describe">DESCRIBE</a> on B, you will see a single column of type double. This is because Pig makes the safest choice and uses the largest numeric type when the schema is not known. In practice, the input data could contain integer values; however, Pig will cast the data to double and make sure that a double result is returned.</p>

 <p>If the schema of a relation can’t be inferred, Pig will just use the runtime data as is and propagate it through the pipeline.</p>


   <!-- ++++++++++++++++++++++++++++++++++ -->     
   <section id="schema-load">
   <title>Schemas with LOAD and STREAM </title>
   <p>With LOAD and STREAM operators, the schema following the AS keyword must be enclosed in parentheses.</p>
   <p>In this example the LOAD statement includes a schema definition for simple data types.</p>
<source>
A = LOAD 'data' AS (f1:int, f2:int);
</source>   
   </section>
 
    <!-- ++++++++++++++++++++++++++++++++++ -->   
   <section id="schemaforeach">
   <title>Schemas with FOREACH </title>
   <p>With FOREACH operators, the schema following the AS keyword must be enclosed in parentheses when the FLATTEN operator is used. Otherwise, the schema should not be enclosed in parentheses.</p>
   <p>In this example the FOREACH statement includes FLATTEN and a schema for simple data types.</p>
<source>
X = FOREACH C GENERATE FLATTEN(B) AS (f1:int, f2:int, f3:int), group;
</source>  
   <p>In this example the FOREACH statement includes a schema for a simple expression.</p>
<source>
X = FOREACH A GENERATE f1+f2 AS x1:int;
</source>   
   <p>In this example the FOREACH statement includes schemas for multiple fields.</p>
<source>
X = FOREACH A GENERATE f1 as user, f2 as age, f3 as gpa;
</source> 
   </section>
  
     <!-- ++++++++++++++++++++++++++++++++++ -->  
   <section  id="schema-simple">
   <title>Schemas for Simple Data Types</title>
   <p>Simple data types include int, long, float, double, chararray, bytearray, boolean, datetime, biginteger and bigdecimal.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr>
            <td>
               <p>(alias[:type]) [, (alias[:type]) …] )</p>
            </td>
         </tr>
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name assigned to the field.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>type</p>
            </td>
            <td>
               <p>(Optional) The simple data type assigned to the field.</p>
               <p>The alias and type are separated by a colon ( : ).</p>
               <p>If the type is omitted, the field defaults to type bytearray.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>( , )</p>
            </td>
            <td>
               <p>Multiple fields are enclosed in parentheses and separated by commas.</p>
            </td>
         </tr>
   </table></section>
   
   <section>
   <title>Examples</title>
   <p>In this example the schema defines multiple types.</p>
<source>
cat student;
John	18	4.0
Mary	19   	3.8
Bill	20   	3.9
Joe	18   	3.8

A = LOAD 'student' AS (name:chararray, age:int, gpa:float);

DESCRIBE A;
A: {name: chararray,age: int,gpa: float}

DUMP A;
(John,18,4.0F)
(Mary,19,3.8F)
(Bill,20,3.9F)
(Joe,18,3.8F)
</source>
   
   <p>In this example field "gpa" will default to bytearray because no type is declared. </p>
<source>
cat student;
John	18	4.0
Mary	19	3.8
Bill	20	3.9
Joe	18	3.8

A = LOAD 'student' AS (name:chararray, age:int, gpa);

DESCRIBE A;
A: {name: chararray,age: int,gpa: bytearray}

DUMP A;
(John,18,4.0)
(Mary,19,3.8)
(Bill,20,3.9)
(Joe,18,3.8)
</source>
   
   </section></section>
 
    <!-- ++++++++++++++++++++++++++++++++++ -->   
   <section id="schema-complex">
   <title>Schemas for Complex Data Types</title>
   <p>Complex data types include tuples, bags, and maps.</p></section>
   
      <!-- ++++++++++++++++++++++++++++++++++ --> 
   <section id="tuple-schema">
   <title>Tuple Schemas</title>
   <p>A tuple is an ordered set of fields.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr>
            <td>
               <p>alias[:tuple] (alias[:type]) [, (alias[:type]) …] )</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name assigned to the tuple.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>:tuple</p>
            </td>
            <td>
               <p>(Optional) The data type, tuple (case insensitive).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>( )</p>
            </td>
            <td>
               <p>The designation for a tuple, a set of parentheses.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>alias[:type]</p>
            </td>
            <td>
               <p>The constituents of the tuple, where the schema definition rules for the corresponding type applies to the constituents of the tuple:</p>
               <ul>
                  <li>
                     <p>alias – the name assigned to the field</p>
                  </li>
                  <li>
                     <p>type (optional) – the simple or complex data type assigned to the field</p>
                  </li>
               </ul>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Examples</title>
   <p>In this example the schema defines one tuple. The load statements are equivalent.</p>

 <source>
cat data;
(3,8,9)
(1,4,7)
(2,5,8)

A = LOAD 'data' AS (T: tuple (f1:int, f2:int, f3:int));
A = LOAD 'data' AS (T: (f1:int, f2:int, f3:int));

DESCRIBE A;
A: {T: (f1: int,f2: int,f3: int)}

DUMP A;
((3,8,9))
((1,4,7))
((2,5,8))
</source>
   
   <p>In this example the schema defines two tuples.</p>
<source>
cat data;
(3,8,9) (mary,19)
(1,4,7) (john,18)
(2,5,8) (joe,18)

A = LOAD 'data' AS (F:tuple(f1:int,f2:int,f3:int),T:tuple(t1:chararray,t2:int));

DESCRIBE A;
A: {F: (f1: int,f2: int,f3: int),T: (t1: chararray,t2: int)}

DUMP A;
((3,8,9),(mary,19))
((1,4,7),(john,18))
((2,5,8),(joe,18))
</source>
   </section></section>

   <!-- ++++++++++++++++++++++++++++++++++ -->    
   <section id="bag-schema">
   <title>Bag Schemas</title>
   <p>A bag is a collection of tuples.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr>
            <td>
               <p>alias[:bag] {tuple} </p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name assigned to the bag.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>:bag</p>
            </td>
            <td>
               <p>(Optional) The data type, bag (case insensitive).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>{ }</p>
            </td>
            <td>
               <p>The designation for a bag, a set of curly brackets.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple</p>
            </td>
            <td>
               <p>A tuple (see Tuple Schema).</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Examples</title>
   <p>In this example the schema defines a bag. The two load statements are equivalent.</p>
<source>
cat data;
{(3,8,9)}
{(1,4,7)}
{(2,5,8)}

A = LOAD 'data' AS (B: bag {T: tuple(t1:int, t2:int, t3:int)});
A = LOAD 'data' AS (B: {T: (t1:int, t2:int, t3:int)});

DESCRIBE A;
A: {B: {T: (t1: int,t2: int,t3: int)}}

DUMP A;
({(3,8,9)})
({(1,4,7)})
({(2,5,8)})
</source>
   </section></section>
   
      <!-- ++++++++++++++++++++++++++++++++++ --> 
   <section id="map-schema">
   <title>Map Schemas</title>
   <p>A map is a set of key value pairs.</p>
   
   <section>
   <title>Syntax (&lt;&gt; denotes optional)</title>
   <table>
      <tr>
            <td>
               <p>alias&lt;:map&gt; [ &lt;type&gt; ] </p>
            </td>
         </tr>
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
         <tr>
            <td><p>alias</p></td>
            <td><p>The name assigned to the map.</p></td>
         </tr>
         <tr>
            <td><p>:map</p></td>
            <td><p>(Optional) The data type, map (case insensitive).</p></td>
         </tr>
         <tr>
            <td><p>[ ]</p></td>
            <td><p>The designation for a map, a set of straight brackets [ ].</p></td>
         </tr>
         <tr>
            <td><p>type</p></td>
            <td><p>(Optional) The datatype (all types allowed, bytearray is the default).</p>
            <p>The type applies to the map value only; the map key is always type chararray (see <a href="#map">Map</a>).</p>
            <p>If a type is declared then ALL values in the map must be of this type.</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Examples</title>
   <p>In this example the schema defines an untyped map (the map values default to bytearray). The load statements are equivalent.</p>
<source>
cat data;
[open#apache]
[apache#hadoop]

A = LOAD 'data' AS (M:map []);
A = LOAD 'data' AS (M:[]);

DESCRIBE A;
A: {M: map[ ]}

DUMP A;
([open#apache])
([apache#hadoop])
</source>

<p>This example shows the use of typed maps.</p>
<source>
/* Map types are declared*/
a = load '1.txt' as(map[int]); --Map value is int
b = foreach a generate (map[(i:int)])a0; -- Map value is tuple
b = stream a through `cat` as (m:map[{(i:int,j:chararray)}]); -- Map value is bag

/* The MapLookup of a typed map will result in a datatype of the map value */
a = load '1.txt' as(map[int]);
b = foreach a generate $0#'key';

/* Schema for b */
b: {int}

</source>
 </section></section>
   
      <!-- ++++++++++++++++++++++++++++++++++ --> 
   <section id="schema-multi">
   <title>Schemas for Multiple Types</title>
   <p>You can define schemas for data that includes multiple types.</p>
   
   <section>
   <title>Example</title>
   <p>In this example the schema defines a tuple, bag, and map.</p>
<source>
A = LOAD 'mydata' AS (T1:tuple(f1:int, f2:int), B:bag{T2:tuple(t1:float,t2:float)}, M:map[] );

A = LOAD 'mydata' AS (T1:(f1:int, f2:int), B:{T2:(t1:float,t2:float)}, M:[] );
</source>
</section>
   <section id="previous-relation-shortcut">
     <title>Previous Relation Shortcut</title>
     <p>There is a shortcut form to reference the relation on the previous line of a Pig script or Grunt session:</p>
<source>
a = load 'thing' as (x:int);
b = foreach @ generate x;
c = foreach @ generate x;
d = foreach @ generate x;
</source>
   </section>
</section>
</section>
</section>
   
<!-- =================================================================== -->    
<!-- ARITHMETIC OPERATORS, ETC -->
<section id="artichmetic-ops">
	<title>Arithmetic Operators and More</title>

<section id="arithmetic">
<title>Arithmetic Operators</title>

<section>
<title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>addition </p>
            </td>
            <td>
               <p>+</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>subtraction </p>
            </td>
            <td>
               <p>-</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>multiplication  </p>
            </td>
            <td>
               <p>*</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>division  </p>
            </td>
            <td>
               <p>/</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>modulo  </p>
            </td>
            <td>
               <p>%</p>
            </td>
            <td>
               <p>Returns the remainder of a divided by b (a%b).</p>
               <p>Works with integral numbers (int, long). </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bincond </p>
            </td>
            <td>
               <p>? :</p>
            </td>
            <td>
               <p>(condition ? value_if_true : value_if_false) </p>
               <p>The bincond should be enclosed in parentheses. </p>
               <p>The schemas for the two conditional outputs of the bincond should match.</p>
               <p>Use expressions only (relational operators are not allowed).</p>
            </td>
          </tr>
         <tr>
            <td>
               <p>case</p>
            </td>
            <td>
               <p>CASE WHEN THEN ELSE END</p>
            </td>
            <td>
               <p>CASE expression [ WHEN value THEN value ]+ [ ELSE value ]? END</p>
               <p>CASE [ WHEN condition THEN value ]+ [ ELSE value ]? END</p>
               <p>Case operator is equivalent to nested bincond operators.</p> 
               <p>The schemas for all the outputs of the when/else branches should match.</p>
               <p>Use expressions only (relational operators are not allowed).</p>
            </td>
          </tr>
   </table>

   <section>
   <title>Examples </title>
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' AS (f1:int, f2:int, B:bag{T:tuple(t1:int,t2:int)});

DUMP A;
(10,1,{(2,3),(4,6)})
(10,3,{(2,3),(4,6)})
(10,6,{(2,3),(4,6),(5,7)})
</source>

  <p>In this example the modulo operator is used with fields f1 and f2.</p>
<source>
X = FOREACH A GENERATE f1, f2, f1%f2;

DUMP X;
(10,1,0)
(10,3,1)
(10,6,4)
</source>
   
   <p>In this example the bincond operator is used with fields f2 and B. The condition is "f2 equals 1"; if the condition is true, return 1; if the condition is false, return the count of the number of tuples in B.</p>
<source>
X = FOREACH A GENERATE f2, (f2==1?1:COUNT(B));

DUMP X;
(1,1L)
(3,2L)
(6,3L)
</source>

   <p>In this example the case operator is used with field f2. The expression is "f2 % 2"; if the expression is equal to 0, return 'even'; if the expression is equal to 1, return 'odd'.</p>
<source>
X = FOREACH A GENERATE f2, (
  CASE f2 % 2
    WHEN 0 THEN 'even'
    WHEN 1 THEN 'odd'
  END
);
DUMP X;
(1,odd)
(3,odd)
(6,even)
</source>

   <p>This can also be written as follows:</p>
<source>
X = FOREACH A GENERATE f2, (
  CASE
    WHEN f2 % 2 == 0 THEN 'even'
    WHEN f2 % 2 == 1 THEN 'odd'
  END
);
DUMP X;
(1,odd)
(3,odd)
(6,even)
</source>
   </section>
   
   <section id="types-table-add">
   <title> Types Table: addition (+) and subtraction (-) operators</title>
   <p>* bytearray cast as this data type</p>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>bytearray </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as int </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as long  </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as float  </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as double   </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>cast as double </p>
            </td>
         </tr>
   </table>
   </section>
   
   <section id="types-table-mult">
   <title>Types Table: multiplication (*) and division (/) operators</title>
   <p>* bytearray cast as this data type</p>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>bytearray </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>not yet </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as int </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as long </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as float </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>cast as double  </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>cast as double  </p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Types Table: modulo (%) operator</title>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>bytearray </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>cast as int </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>cast as long </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
   </table>
   </section></section></section>
   
<!-- =================================================================== --> 
   <section id="boolops">
   <title>Boolean Operators</title>
      
      <section>
      <title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>AND       </p>
            </td>
            <td>
               <p>and</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>OR  </p>
            </td>
            <td>
               <p>or</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>IN</p>
            </td>
            <td>
               <p>in</p>
            </td>
            <td>
               <p>IN operator is equivalent to nested OR operators.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>NOT</p>
            </td>
            <td>
               <p>not</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
   </table>
   <p>The result of a boolean expression (an expression that includes boolean and comparison operators) is always of type boolean (true or false).</p>
   
   <section>
   <title>Example</title>
<source>
X = FILTER A BY (f1==8) OR (NOT (f2+f3 > f1)) OR (f1 IN (9, 10, 11));
</source>
   
   </section></section></section>   
   
   <!-- =================================================================== -->
   <section id="cast">
   <title>Cast Operators</title>
   
   <section>
   <title>Description</title>
   <p>Pig Latin supports casts as shown in this table. </p>
   <table>
       <tr>
            <td>
               <p><strong>from /  to</strong></p>
            </td>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p>boolean</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>error </p>
            </td>
           <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>error </p>
            </td>
           <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>yes</p>
            </td>
            <td>
               <p>yes</p>
            </td>
            <td>
               <p>yes</p>
            </td>
            <td>
               <p>yes</p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
           <td>
               <p>yes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p>yes </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>yes</p>
            </td>
         </tr>
                  <tr>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error</p>
            </td>
            <td>
               <p>yes</p>
            </td>
            <td>
               <p>error</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
   </table>

   <section>
   <title>Syntax  </title>
   <table>
      <tr>
            <td>
               <p>{(data_type) |  (tuple(data_type))  | (bag{tuple(data_type)}) | (map[]) } field</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>(data_type)</p>
            </td>
            <td>
               <p>The data type you want to cast to, enclosed in parentheses. You can cast to any data type except bytearray (see the table above).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>field</p>
            </td>
            <td>
               <p>The field whose type you want to change. </p>
               <p>The field can be represented by positional notation or by name (alias). For example, if f1 is the first field and type int, you can cast to type long using (long)$0 or (long)f1.</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p>Cast operators enable you to cast or convert data from one type to another, as long as conversion is supported (see the table above). For example, suppose you have an integer field, myint, which you want to convert to a string. You can cast this field from int to chararray using (chararray)myint.</p>
   <p>Please note the following:</p>
   <ul>
      <li>
         <p>A field can be explicitly cast. Once cast, the field remains that type (it is not automatically cast back). In this example $0 is explicitly cast to int.</p>
<source>
B = FOREACH A GENERATE (int)$0 + 1;
</source>
      </li>
   </ul>
   <p></p>
   <ul>
      <li>
         <p>Where possible, Pig performs implicit casts. In this example $0 is cast to int (regardless of underlying data) and $1 is cast to double.</p>
<source>
B = FOREACH A GENERATE $0 + 1, $1 + 1.0;
</source>
      </li>
   </ul>
   <ul>
      <li>
         <p>When two bytearrays are used in arithmetic expressions or a bytearray expression is used with built-in aggregate functions (such as SUM), they are implicitly cast to double. If the underlying data is really int or long, you’ll get better performance by declaring the type or explicitly casting the data.</p>
      </li>
      <li>
         <p>Downcasts may cause loss of data. For example, casting from long to int may drop bits.</p>
      </li>
   </ul>
   </section>
    </section>
   
   <!-- +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -->
   <section>
   <title>Examples</title>
   <p>In this example an int is cast to type chararray (see relation X).</p>
<source>
A = LOAD 'data' AS (f1:int,f2:int,f3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)

B = GROUP A BY f1;

DUMP B;
(1,{(1,2,3)})
(4,{(4,2,1),(4,3,3)})
(7,{(7,2,5)})
(8,{(8,3,4),(8,4,3)})

DESCRIBE B;
B: {group: int,A: {f1: int,f2: int,f3: int}}

X = FOREACH B GENERATE group, (chararray)COUNT(A) AS total;

DUMP X;
(1,1)
(4,2)
(7,1)
(8,2)

DESCRIBE X;
X: {group: int,total: chararray}
</source>
   
   
<p>In this example a bytearray (fld in relation A) is cast to type tuple.</p>
<source>
cat data;
(1,2,3)
(4,2,1)
(8,3,4)

A = LOAD 'data' AS fld:bytearray;

DESCRIBE A;
A: {fld: bytearray}

DUMP A;
((1,2,3))
((4,2,1))
((8,3,4))

B = FOREACH A GENERATE (tuple(int,int,float))fld;

DESCRIBE B;
B: {(int,int,float)}

DUMP B;
((1,2,3))
((4,2,1))
((8,3,4))
</source>
   
   <p>In this example a bytearray (fld in relation A) is cast to type bag.</p>
<source>
cat data;
{(4829090493980522200L)}
{(4893298569862837493L)}
{(1297789302897398783L)}

A = LOAD 'data' AS fld:bytearray;

DESCRIBE A;
A: {fld: bytearray}

DUMP A;
({(4829090493980522200L)})
({(4893298569862837493L)})
({(1297789302897398783L)})

B = FOREACH A GENERATE (bag{tuple(long)})fld; 

DESCRIBE B;
B: {{(long)}}

DUMP B;
({(4829090493980522200L)})
({(4893298569862837493L)})
({(1297789302897398783L)})
</source>


   <p>In this example a bytearray (fld in relation A) is cast to type map.</p>
<source>
cat data;
[open#apache]
[apache#hadoop]
[hadoop#pig]
[pig#grunt]

A = LOAD 'data' AS fld:bytearray;

DESCRIBE A;
A: {fld: bytearray}

DUMP A;
([open#apache])
([apache#hadoop])
([hadoop#pig])
([pig#grunt])

B = FOREACH A GENERATE (map[])fld;

DESCRIBE B;
B: {map[ ]}

DUMP B;
([open#apache])
([apache#hadoop])
([hadoop#pig])
([pig#grunt])
</source>
   
</section>

<!-- +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -->
   <section id="cast-relations">
   <title>Casting Relations to Scalars</title>
<p>Pig allows you to cast the elements of a single-tuple relation into a scalar value. 
The tuple can be a single-field or multi-field tuple. 
If the relation contains more than one tuple, however, a runtime error is generated: "Scalar has more than one row in the output". 
</p>

<p>The cast relation can be used in any place where an expression of the type would make sense, including FOREACH, FILTER, and SPLIT. Note that if an explicit cast is not used, an implicit cast will be inserted according to Pig rules. Also, when the schema can't be inferred, bytearray is used.</p>  
 
<p>The primary use case for casting relations to scalars is the ability to use the values of global aggregates in follow up computations. </p> 
 
<p>In this example the percentage of clicks belonging to a particular user is computed. For the FOREACH statement, an explicit cast is used. If the SUM is not given a name, a position can be used as well (userid, clicks/(double)C.$0). </p>

<source>
A = load 'mydata' as (userid, clicks); 
B = group A all; 
C = foreach B generate SUM(A.clicks) as total; 
D = foreach A generate userid, clicks/(double)C.total; 
dump D;
</source>
   
<p>In this example a multi-field tuple is used. For the FILTER statement, Pig performs an implicit cast. For the FOREACH statement, 
an explicit cast is used.</p>
<source>
A = load 'mydata' as (userid, clicks); 
B = group A all; 
C = foreach B generate SUM(A.clicks) as total, COUNT(A) as cnt; 
D = FILTER A by clicks > C.total/3; 
E = foreach D generate userid, clicks/(double)C.total, C.cnt; 
dump E; 
</source>
</section>
</section>
   
   <!-- =================================================================== --> 
   <section id="comparison">
   <title>Comparison Operators</title>
      
    <section><title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>equal  </p>
            </td>
            <td>
               <p>==</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>not equal </p>
            </td>
            <td>
               <p>!=</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>less than  </p>
            </td>
            <td>
               <p>&lt;</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>greater than </p>
            </td>
            <td>
               <p>&gt;</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>less than or equal to  </p>
            </td>
            <td>
               <p>&lt;=</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>greater than or equal to</p>
            </td>
            <td>
               <p>&gt;=</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p>pattern matching  </p>
            </td>
            <td>
               <p>matches</p>
            </td>
            <td>
            <p>Takes an expression on the left and a string constant on the right.</p>
            <p><em>expression</em> matches <em>string-constant</em></p>
            <p>Use the Java <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/util/regex/Pattern.html">format</a> for regular expressions.</p>

            </td>
         </tr>
   </table>
   <p>Use the comparison operators with numeric and string data.</p>
    </section>
    
   <section>
   <title>Examples</title>

<p><strong>Numeric Example</strong></p>
<source>
X = FILTER A BY (f1 == 8);
</source>

<p><strong>String Example</strong></p>   
<source>
X = FILTER A BY (f2 == 'apache');
</source>

 <p><strong>Matches Example</strong></p>    
<source>
X = FILTER A BY (f1 matches '.*apache.*');
</source>
   </section>
   
   <section id="types-table-equal">
   <title>Types Table: equal (==) operator</title>
   <p></p>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>datetime </p>
            </td>
            <td>
               <p>biginteger </p>
            </td>
            <td>
               <p>bigdecimal </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
               <p>(see Note 1) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
               <p>(see Note 2)</p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as int) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as long) </p>
            </td>
             <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as float) </p>
            </td>
             <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as double) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean (bytearray cast as chararray) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>datetime </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean</p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>biginteger </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean</p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bigdecimal </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean</p>
            </td>
         </tr>
   </table>
   <p>Note 1: boolean (Tuple A is equal to tuple B if they have the same size s, and for all 0 &lt;= i &lt; s A[i] == B[i])</p>
   <p>Note 2: boolean (Map A is equal to map B if A and B have the same number of entries, and for every key k1 in A with a value of v1, there is a key k2 in B with a value of v2, such that k1 == k2 and v1 == v2)</p>
</section>

   <section id="types-table-not-equal">
   <title>Types Table: not equal (!=) operator</title>
     <p></p>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>datetime </p>
            </td>
            <td>
               <p>biginteger </p>
            </td>
            <td>
               <p>bigdecimal </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as int) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as long) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as float) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>boolean (bytearray cast as double) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean (bytearray cast as chararray) </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>datetime</p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>biginteger</p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
                  <tr>
            <td>
               <p>bigdecimal</p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p></p>
            </td>
            <td>
               <p>boolean </p>
            </td>
         </tr>
   </table>
   </section>
   
   <section id="types-table-matches">
   <title>Types Table: matches operator</title>
   <p>*Cast as chararray (the second argument must be chararray)</p>
   <table>
         <tr>
            <td>
               <p></p>
            </td>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>bytearray* </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>boolean </p>
            </td>
            <td>
               <p>boolean  </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p>boolean</p>
            </td>
            <td>
               <p>boolean </p>
            </td>
         </tr>
   </table>
   </section>
   </section>

   

  <!-- =================================================================== -->    
   <section id="type-construction">
   <title>Type Construction Operators</title>
   
   <section>
   <title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple constructor </p>
            </td>
            <td>
               <p> ( ) </p>
            </td>
            <td>
               <p>Use to construct a tuple from the specified elements. Equivalent to <a href="func.html#totuple">TOTUPLE</a>.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag constructor</p>
            </td>
            <td>
               <p> { }</p>
            </td>
            <td>
               <p>Use to construct a bag from the specified elements. Equivalent to <a href="func.html#tobag">TOBAG</a>.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map constructor</p>
            </td>
            <td>
               <p> [ ]</p>
            </td>
            <td>
               <p>Use to construct a map from the specified elements. Equivalent to <a href="func.html#tomap">TOMAP</a>.</p>
            </td>
         </tr>
   </table>

<p></p>   
<p>Note the following:</p>  
<ul>
<li>These operators can be used anywhere the expression of the corresponding type is acceptable, including FOREACH GENERATE, FILTER, etc.</li>
<li>A single element enclosed in parens ( ) like (5) is not considered to be a tuple but rather an arithmetic operator.</li>
<li>For bags, every element is put in the bag; if the element is not a tuple Pig will create a tuple for it:
<ul>
<li> Given this {$1, $2}  Pig creates this {($1), ($2)} a bag with two tuples
<p>... neither $1 nor $2 is a tuple, so Pig creates a tuple around each item</p> <p>&nbsp;</p></li>

<li> Given this {($1), $2} Pig creates this {($1), ($2)} a bag with two tuples
<p>... since ($1) is treated as $1 (one cannot create a single element tuple using this syntax), {($1), $2} becomes {$1, $2} and Pig creates a tuple around each item</p><p>&nbsp;</p></li>

<li> Given this {($1, $2)} Pig creates this {($1, $2)} a bag with a single tuple
<p>... Pig creates a tuple ($1, $2) and then puts this tuple into the bag</p><p>&nbsp;</p></li>

</ul> 
</li>
</ul>
</section>
   
<!-- ++++++++++++++++++++++++++++++++++ --> 
   <section>
   <title>Examples</title>
<p><strong>Tuple Construction</strong></p>
<source>
A = load 'students' as (name:chararray, age:int, gpa:float);
B = foreach A generate (name, age);
store B into 'results';

Input (students):
joe smith  20  3.5
amy chen   22  3.2
leo allen  18  2.1

Output (results):
(joe smith,20)
(amy chen,22)
(leo allen,18)
</source>   
   
<!-- ++++++++++++++++++++++++++++++++++ --> 
<p><strong>Bag Construction</strong></p>
<source>
A = load 'students' as (name:chararray, age:int, gpa:float);
B = foreach A generate {(name, age)}, {name, age};
store B into 'results';

Input (students):
joe smith  20  3.5
amy chen   22  3.2
leo allen  18  2.1

Output (results):
{(joe smith,20)}   {(joe smith),(20)}
{(amy chen,22)}    {(amy chen),(22)}
{(leo allen,18)}   {(leo allen),(18)}
</source>   

<!-- ++++++++++++++++++++++++++++++++++ -->    
<p><strong>Map Construction</strong></p>
<source>
A = load 'students' as (name:chararray, age:int, gpa:float);
B = foreach A generate [name, gpa];
store B into 'results';

Input (students):
joe smith  20  3.5
amy chen   22  3.2
leo allen  18  2.1

Output (results):
[joe smith#3.5]
[amy chen#3.2]
[leo allen#2.1]
</source>
</section>
</section>


  <!-- =================================================================== -->    
   <section id="deref">
   <title>Dereference Operators</title>
   
   <section>
   <title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple dereference      </p>
            </td>
            <td>
               <p>tuple.id or tuple.(id,…)</p>
            </td>
            <td>
               <p>Tuple dereferencing can be done by name (tuple.field_name) or position (tuple.$0). If a set of fields is dereferenced (tuple.(name1, name2) or tuple.($0, $1)), the expression represents a tuple composed of the specified fields. Note that if the dot operator is applied to a bytearray, the bytearray will be assumed to be a tuple.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bag dereference</p>
            </td>
            <td>
               <p>bag.id or bag.(id,…)</p>
            </td>
            <td>
               <p>Bag dereferencing can be done by name (bag.field_name) or position (bag.$0). If a set of fields are dereferenced (bag.(name1, name2) or bag.($0, $1)), the expression represents a bag composed of the specified fields.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map dereference</p>
            </td>
            <td>
               <p>map#'key'</p>
            </td>
            <td>
               <p>Map dereferencing must be done by key (field_name#key or $0#key). If the pound operator is applied to a bytearray, the bytearray is assumed to be a map. If the key does not exist, the empty string is returned.</p>
            </td>
         </tr>
   </table>
   </section>
   
<!-- ++++++++++++++++++++++++++++++++++++ -->    
<section>
<title>Examples</title> 
  
<p><strong>Tuple Example</strong></p>   
<p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' as (f1:int, f2:tuple(t1:int,t2:int,t3:int));

DUMP A;
(1,(1,2,3))
(2,(4,5,6))
(3,(7,8,9))
(4,(1,4,7))
(5,(2,5,8))
</source>
   
   <p>In this example dereferencing is used to retrieve two fields from tuple f2.</p>
<source>
X = FOREACH A GENERATE f2.t1,f2.t3;

DUMP X;
(1,3)
(4,6)
(7,9)
(1,7)
(2,8)
</source>

   
<!-- ++++++++++++++++++++++++++++++++++++ --> 
<p><strong>Bag Example</strong></p>   
   
<p>Suppose we have relation B, formed by grouping relation A (see the GROUP operator for information about the field names in relation B).</p>
<source>
A = LOAD 'data' AS (f1:int, f2:int,f3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)

B = GROUP A BY f1;

DUMP B;
(1,{(1,2,3)})
(4,{(4,2,1),(4,3,3)})
(7,{(7,2,5)})
(8,{(8,3,4),(8,4,3)})

ILLUSTRATE B;
<em>etc …</em>
----------------------------------------------------------
| b   | group: int | a: bag({f1: int,f2: int,f3: int}) |
----------------------------------------------------------
</source>
   
<p>In this example dereferencing is used with relation X to project the first field (f1) of each tuple in the bag (a).</p>
<source>
X = FOREACH B GENERATE a.f1;

DUMP X;
({(1)})
({(4),(4)})
({(7)})
({(8),(8)})
</source>
   

<!-- ++++++++++++++++++++++++++++++++++++ --> 
<p><strong>Tuple/Bag Example</strong></p>

<p>Suppose we have relation B, formed by grouping relation A  (see the GROUP operator for information about the field names in relation B).</p>

<source>
A = LOAD 'data' AS (f1:int, f2:int, f3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)

B = GROUP A BY (f1,f2);

DUMP B;
((1,2),{(1,2,3)})
((4,2),{(4,2,1)})
((4,3),{(4,3,3)})
((7,2),{(7,2,5)})
((8,3),{(8,3,4)})
((8,4),{(8,4,3)})

ILLUSTRATE B;
<em>etc …</em>
-------------------------------------------------------------------------------
| b     | group: tuple({f1: int,f2: int}) | a: bag({f1: int,f2: int,f3: int}) |
-------------------------------------------------------------------------------
|       | (8, 3)                                | {(8, 3, 4), (8, 3, 4)} |
-------------------------------------------------------------------------------
</source>   
   
   <p>In this example dereferencing is used to project a field (f1) from a tuple (group) and a field (f1) from a bag (a).</p>
<source>
X = FOREACH B GENERATE group.f1, a.f1;

DUMP X;
(1,{(1)})
(4,{(4)})
(4,{(4)})
(7,{(7)})
(8,{(8)})
(8,{(8)})
</source>

<!-- ++++++++++++++++++++++++++++++++++++ -->     
<p><strong>Map Example</strong></p>
 <p>Suppose we have relation A. </p>

<source>
A = LOAD 'data' AS (f1:int, f2:map[]);

DUMP A;
(1,[open#apache])
(2,[apache#hadoop])
(3,[hadoop#pig])
(4,[pig#grunt])
</source>

   <p>In this example dereferencing is used to look up the value of key 'open'.</p>

<source>
X = FOREACH A GENERATE f2#'open';

DUMP X;
(apache)
()
()
()
</source>
</section>
</section>
  
  <!-- =================================================================== -->    
<section id="disambiguate">
<title>Disambiguate Operator</title>

<p>After JOIN, COGROUP, CROSS, or FLATTEN operations, the field names have the original alias and the disambiguate
   operator ( :: ) prepended in the schema. The disambiguate operator is used to identify field names in case there
   is an ambiguity.</p>

<p>In this example, to disambiguate y,  use A::y or B::y.  In cases where there is no ambiguity, such as z, the :: is not necessary but is still supported.</p>

<source>
A = load 'data1' as (x, y);
B = load 'data2' as (x, y, z);
C = join A by x, B by x;
D = foreach C generate A::y, z; -- Cannot simply refer to y as it can refer to A::y or B::y
</source>
<p> In cases where the schema is stored as part of the StoreFunc like PigStorage, JsonStorage, AvroStorage or OrcStorage,
   users generally have to use an extra FOREACH before STORE to rename the field names and remove the disambiguate
   operator from the names. To automatically remove the disambiguate operator from the schema for the STORE operation,
   the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user
   to make sure that there is no conflict in the field names when using this setting.
</p>
</section>

    <!-- =================================================================== -->  
   <section  id="flatten">
   <title>Flatten Operator</title>
   <p>The FLATTEN operator looks like a UDF syntactically, but it is actually an operator that changes the structure of tuples 
   and bags in a way that a UDF cannot. Flatten un-nests tuples, bags and maps. The idea is the
      same, but the operation and result are different for each type of structure.</p>

   <p>For tuples, flatten substitutes the fields of a tuple in place of the tuple. For example, consider a relation that has a tuple 
   of the form (a, (b, c)). The expression GENERATE $0, flatten($1), will cause that tuple to become (a, b, c).</p>

   <p>For bags, the situation becomes more complicated. When we un-nest a bag, we create new tuples. If we have a 
   relation that is made up of tuples of the form ({(b,c),(d,e)}) and we apply GENERATE flatten($0), we end up with two 
   tuples (b,c) and (d,e). When we remove a level of nesting in a bag, sometimes we cause a cross product to happen. 
   For example, consider a relation that has a tuple of the form (a, {(b,c), (d,e)}), commonly produced by the GROUP operator. 
   If we apply the expression GENERATE $0, flatten($1) to this tuple, we will create new tuples: (a, b, c) and (a, d, e).</p>

   <p>For maps, flatten creates a tuple with two fields containing the key and value.
      If we have a map field named kvpair with input as (m[k1#v1, k2#v2]) and we apply GENERATE flatten(kvpair),
      it will generate two tuples (k1,v1) and (k2,v2) which can be accessed as kvpair::key and
      kvpair::value.<br/>When there are additional projections in the expression, a cross product will happen similar
      to bags. For example, if we apply the expression GENERATE $0, FLATTEN($1) to the input tuple (a, m[k1#1, k2#2, k3#3]),
      we will see (a,k1,1), (a,k2,2) and (a,k3,3) as the result.
   </p>

   <p>For other types, flatten becomes a no-op and simply returns the passed value. </p>

   <p>Also note that flattening an empty bag will result in that row being discarded; no output is generated. 
   (See also <a href="perf.html#nulls">Drop Nulls Before a Join</a>.) </p>
   <p>As for flatten with null values, see <a href="#nulls_flatten">Nulls and FLATTEN operator</a>.</p>
   
   <source>
grunt> cat empty.bag
{}      1
grunt> A = LOAD 'empty.bag' AS (b : bag{}, i : int);
grunt> B = FOREACH A GENERATE flatten(b), i;
grunt> DUMP B;
grunt>
</source>
   
   <p>For examples using the FLATTEN operator, see <a href="#flatten-example">FOREACH</a>.</p>
   </section>

  <!-- =================================================================== -->    
<section id="null_operators">
   <title>Null Operators</title>
     
   <section>
   <title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p id="is-null">is null </p>
            </td>
            <td>
               <p>is null</p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
         <tr>
            <td>
               <p id="is-not-null">is not null  </p>
            </td>
            <td>
               <p>is not null  </p>
            </td>
            <td>
               <p></p>
            </td>
         </tr>
   </table>
   <p></p>
   <p>For a detailed discussion of nulls see <a href="#nulls">Nulls and Pig Latin</a>.</p>
      </section>
      
   <section>
   <title>Examples</title>
   
   <p>In this example, values that are not null are obtained.</p>
<source>
X = FILTER A BY f1 is not null;
</source>
   </section>

   <section id="types-table-nulls">
   <title>Types Table</title>
   <p>The null operators can be applied to all data types (see <a  href="#nulls">Nulls and Pig Latin</a>). </p>
   </section>
   </section>
   
  <!-- =================================================================== -->    
   <section id="sign">
   <title>Sign Operators</title>
   
   <section>
   <title>Description</title>
   <table>
      <tr>
            <td>
               <p>Operator</p>
            </td>
            <td>
               <p>Symbol</p>
            </td>
            <td>
               <p> Notes</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>positive       </p>
            </td>
            <td>
               <p>+</p>
            </td>
            <td>
               <p> Has no effect.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>negative (negation)</p>
            </td>
            <td>
               <p> -</p>
            </td>
            <td>
               <p> Changes the sign of a positive or negative number.</p>
            </td>
         </tr>
   </table>
   </section>
   
   <section>
   <title>Examples</title>
<p>In this example, the negation operator is applied to the "x" values.</p>   
<source>
A = LOAD 'data' as (x, y, z);

B = FOREACH A GENERATE -x, y;
</source>
   
   </section>
   
   <section id="types-table-negative">
   <title>Types Table: negative ( - ) operator</title>
   <table>
      <tr>
            <td>
               <p>bag </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>tuple </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>map </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>int </p>
            </td>
            <td>
               <p>int </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>long </p>
            </td>
            <td>
               <p>long </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>float </p>
            </td>
            <td>
               <p>float </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>double </p>
            </td>
            <td>
               <p>double </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>chararray </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bytearray </p>
            </td>
            <td>
               <p>double (as double) </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>datetime </p>
            </td>
            <td>
               <p>error </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>biginteger </p>
            </td>
            <td>
               <p>biginteger </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>bigdecimal </p>
            </td>
            <td>
               <p>bigdecimal </p>
            </td>
         </tr>
   </table>
   </section>
  
</section>
</section>   

<!-- =================================================================== -->
<!-- RELATIONAL OPERATORS, ETC -->
<section>
<title>Relational Operators</title>

<!-- =================================================================== -->
   <section id="assert">
   <title>ASSERT</title>
   <p>Assert a condition on the data.</p>

   <section>
   <title>Syntax</title>
   <table>
       <tr>
            <td>
               <p>ASSERT alias BY expression [, message];</p>
            </td>
        </tr>
   </table>
   </section>

   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of the relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>BY</p>
            </td>
            <td>
               <p>Required keyword.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>expression</p>
            </td>
            <td>
               <p>A boolean expression.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>message</p>
            </td>
            <td>
               <p>Error message when assertion fails.</p>
            </td>
         </tr>
   </table>
   </section>

   <section>
     <title>Usage</title>
     <p>Use assert to ensure a condition is true on your data. Processing fails if any of the records violate the condition.</p>
   </section>

   <section>
   <title>Examples</title>
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' AS (a0:int,a1:int,a2:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)
</source>

<p>Now, you can assert that the a0 column in your data is greater than 0; processing fails otherwise.</p>
<source>
ASSERT A by a0 > 0, 'a0 should be greater than 0';
</source>

</section></section>

<!-- =================================================================== -->
 <section id="cogroup">
<title>COGROUP</title>
   <p>See the <a href="#group">GROUP</a> operator.</p>
</section>

<!-- =================================================================== -->
   <section id="cross">
   <title>CROSS</title>
   <p>Computes the cross product of two or more relations.</p>
   
   <section>
   <title>Syntax</title>
   <table>
       <tr>
            <td>
               <p>alias = CROSS alias, alias [, alias …] [PARTITION BY partitioner] [PARALLEL n];</p>
            </td>
        </tr> 
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation. </p>
            </td>
         </tr>
               <tr>
            <td>
               <p id="partition-by-cross">PARTITION BY partitioner</p>
            </td>
            <td>
             <p>Use this feature to specify the Hadoop Partitioner. The partitioner controls the partitioning of the keys of the intermediate map-outputs. </p>
             <ul>
             <li>
             <p>For more details, see <a href="http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html">http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html</a></p>
             </li>
             <li>
             <p>For usage, see <a href="#partitionby">Example: PARTITION BY</a></p>
             </li>
             </ul>
            </td>
         </tr>
         <tr>
            <td>
               <p>PARALLEL n</p>
            </td>
            <td>
               <p>Increase the parallelism of a job by specifying the number of reduce tasks, n. </p>
               <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the CROSS operator to compute the cross product (Cartesian product) of two or more relations.</p>
   <p>CROSS is an expensive operation and should be used sparingly. </p>
   </section>
   
   <section>
   <title>Example</title>
   <p>Suppose we have relations A and B.</p>
<source>
A = LOAD 'data1' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)

B = LOAD 'data2' AS (b1:int,b2:int);

DUMP B;
(2,4)
(8,9)
(1,3)
</source>
   
<p>In this example the cross product of relation A and B is computed.</p>
<source>
X = CROSS A, B;

DUMP X;
(1,2,3,2,4)
(1,2,3,8,9)
(1,2,3,1,3)
(4,2,1,2,4)
(4,2,1,8,9)
(4,2,1,1,3)
</source>
   
   </section></section>
   

<!-- =================================================================== -->
   <section id="cube">
      <title>CUBE</title>
      <p>Performs cube/rollup operations.</p>

      <section>
         <title>Cube operation</title>
         <p>Cube operation computes aggregates for all possible combinations of specified group by dimensions. The number of group by combinations generated by cube for n dimensions will be 2^n.</p>
      </section>

      <section>
         <title>Rollup operation</title>
         <p>Rollup operation computes multiple levels of aggregates based on hierarchical ordering of specified group by dimensions. Rollup is useful when there is hierarchical ordering on the dimensions. The number of group by combinations generated by rollup for n dimensions will be n+1.</p>
      </section>

      <section>
         <title>Syntax</title>
            <table>
               <tr>
                  <td>
                     <p>alias = CUBE alias BY { CUBE expression | ROLLUP expression }, [ CUBE expression | ROLLUP expression ] [PARALLEL n];</p>
                  </td>
               </tr>
            </table>
      </section>

      <section>
         <title>Terms</title>
         <table>
            <tr>
               <td>
                  <p>alias</p>
               </td>
               <td>
                  <p>The name of the relation.</p>
               </td>
            </tr>

            <tr>
               <td>
                  <p>CUBE</p>
                  </td>
                  <td>
                   <p>Keyword</p>
               </td>
            </tr>

            <tr>
               <td>
                  <p>BY</p>
                  </td>
                  <td>
                   <p>Keyword</p>
               </td>
            </tr>

            <tr>
               <td>
                  <p>expression</p>
                  </td>
                  <td>
                   <p>Projections (dimensions) of the relation. Supports field, star and project-range expressions.</p>
               </td>
            </tr>

            <tr>
               <td>
                  <p>ROLLUP</p>
                  </td>
                  <td>
                   <p>Keyword</p>
               </td>
            </tr>

            <tr>
               <td>
                  <p>PARALLEL n</p>
               </td>
               <td>
                  <p>Increase the parallelism of a job by specifying the number of reduce tasks, n.</p>
                  <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
               </td>
            </tr>
         </table>
      </section>

      <section>
         <title>Example</title>
      </section>

      <section>
         <title>Basic usage of CUBE operation</title>
         <source>
salesinp = LOAD '/pig/data/salesdata' USING PigStorage(',') AS
    (product:chararray, year:int, region:chararray, state:chararray, city:chararray, sales:long);
cubedinp = CUBE salesinp BY CUBE(product,year);
result = FOREACH cubedinp GENERATE FLATTEN(group), SUM(cube.sales) AS totalsales;</source>
         <p>For a sample input tuple (car, 2012, midwest, ohio, columbus, 4000), the above query with cube operation will output</p>
         <source>
(car,2012,4000)
(car,,4000)
(,2012,4000)
(,,4000)</source>
      </section>

      <section>
         <title>Output schema</title>
         <source>
grunt> describe cubedinp;
cubedinp: {group: (product: chararray,year: int),cube: {(product: chararray,year: int,region: chararray,
state: chararray,city: chararray,sales: long)}}</source>
         <p>Note the second column, the ‘cube’ field, which is a bag of all tuples that belong to ‘group’. Also note that the measure attribute ‘sales’, along with other unused dimensions in the load statement, is pushed down so that it can be referenced later while computing aggregates on the measure, like in this case SUM(cube.sales).</p>
      </section>

      <section>
         <title>Basic usage of ROLLUP operation</title>
         <source>
salesinp = LOAD '/pig/data/salesdata' USING PigStorage(',') AS
    (product:chararray, year:int, region:chararray, state:chararray, city:chararray, sales:long);
rolledup = CUBE salesinp BY ROLLUP(region,state,city);
result = FOREACH rolledup GENERATE FLATTEN(group), SUM(cube.sales) AS totalsales;</source>
         <p>For a sample input tuple (car, 2012, midwest, ohio, columbus, 4000), the above query with rollup operation will output</p>
         <source>
(midwest,ohio,columbus,4000)
(midwest,ohio,,4000)
(midwest,,,4000)
(,,,4000)</source>
      </section>

      <section>
         <title>Output schema</title>
         <source>
grunt> describe rolledup;
rolledup: {group: (region: chararray,state: chararray,city: chararray),cube: {(region: chararray,
state: chararray,city: chararray,product: chararray,year: int,sales: long)}}</source>
      </section>

      <section>
         <title>Basic usage of CUBE and ROLLUP operation combined</title>
         <p>If CUBE and ROLLUP operations are used together, the output groups will be the cross product of all groups generated by cube and rollup operation. If there are m dimensions in cube operations and n dimensions in rollup operation then overall number of combinations will be (2^m) * (n+1).</p>
         <source>
salesinp = LOAD '/pig/data/salesdata' USING PigStorage(',') AS
    (product:chararray, year:int, region:chararray, state:chararray, city:chararray, sales:long);
cubed_and_rolled = CUBE salesinp BY CUBE(product,year), ROLLUP(region, state, city);
result = FOREACH cubed_and_rolled GENERATE FLATTEN(group), SUM(cube.sales) AS totalsales;</source>
         <p>For a sample input tuple (car, 2012, midwest, ohio, columbus, 4000), the above query with cube and rollup operation will output</p>
         <source>
(car,2012,midwest,ohio,columbus,4000)
(car,2012,midwest,ohio,,4000)
(car,2012,midwest,,,4000)
(car,2012,,,,4000)
(car,,midwest,ohio,columbus,4000)
(car,,midwest,ohio,,4000)
(car,,midwest,,,4000)
(car,,,,,4000)
(,2012,midwest,ohio,columbus,4000)
(,2012,midwest,ohio,,4000)
(,2012,midwest,,,4000)
(,2012,,,,4000)
(,,midwest,ohio,columbus,4000)
(,,midwest,ohio,,4000)
(,,midwest,,,4000)
(,,,,,4000)</source>
      </section>

      <section>
         <title>Output schema</title>
         <source>
grunt> describe cubed_and_rolled;
cubed_and_rolled: {group: (product: chararray,year: int,region: chararray,
state: chararray,city: chararray),cube: {(product: chararray,year: int,region: chararray,
state: chararray,city: chararray,sales: long)}}</source>
      </section>

      <section>
         <title>Handling null values in dimensions</title>
         <p>Since null values are used to represent subtotals in cube and rollup operations, in order to differentiate the legitimate null values that already exist as dimension values, the CUBE operator converts any null values in dimensions to the "unknown" value before performing the cube or rollup operation. For example, for CUBE(product,location) with a sample tuple (car,) the output will be </p>
         <source>
(car,unknown)
(car,)
(,unknown)
(,)</source>
      </section>
   </section>

<!-- =================================================================== -->    
   <section id="define">
   <title>DEFINE</title>
   
   <p>See:</p>
   <ul>
   <li><a href="basic.html#define-udfs">DEFINE (UDFs, streaming)</a></li>
   <li><a href="cont.html#define-macros">DEFINE (macros)</a></li>
   </ul>
   </section>
 
 <!-- =================================================================== -->  
   <section id="distinct">
   <title>DISTINCT </title>
   <p>Removes duplicate tuples in a relation.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = DISTINCT alias [PARTITION BY partitioner] [PARALLEL n];        </p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
        <tr> 
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of the relation.</p>
            </td>
        </tr>
      
      <tr>      
         <td>
               <p id="partition-by-distinct">PARTITION BY partitioner</p>
            </td>
            <td>
             <p>Use this feature to specify the Hadoop Partitioner. The partitioner controls the partitioning of the keys of the intermediate map-outputs. </p>
             <ul>
             <li>
             <p>For more details, see <a href="http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html">http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html</a></p>
             </li>
             <li>
              <p>For usage, see <a href="#partitionby">Example: PARTITION BY</a>.</p>
             </li>
             </ul>
         </td>
     </tr> 
         
         <tr>
            <td>
               <p>PARALLEL n</p>
            </td>
            <td>
               <p>Increase the parallelism of a job by specifying the number of reduce tasks, n.</p>
               <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p>Use the DISTINCT operator to remove duplicate tuples in a relation. DISTINCT does not preserve the original order of the contents (to eliminate duplicates, Pig must first sort the data). You cannot use DISTINCT on a subset of fields; to do this, use FOREACH and a nested block to first select the fields and then apply DISTINCT  (see <a href="#nestedblock">Example: Nested Block</a>).</p>
   </section>
   
   <section>
   <title>Example</title>
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(8,3,4)
(1,2,3)        
(4,3,3)        
(4,3,3)        
(1,2,3) 
</source>
   
   <p>In this example all duplicate tuples are removed.</p>
<source>
X = DISTINCT A;

DUMP X;
(1,2,3)
(4,3,3)
(8,3,4)
</source>

 </section></section>
   
  
<!-- =================================================================== -->   
   <section id="filter">
   <title>FILTER </title>
   <p>Selects tuples from a relation based on some condition.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = FILTER alias  BY expression;</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of the relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>BY</p>
            </td>
            <td>
               <p>Required keyword.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>expression</p>
            </td>
            <td>
               <p>A boolean expression.</p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p>Use the FILTER operator to work with tuples or rows of data (if you want to work with columns of data, use the FOREACH...GENERATE operation).</p>
   <p>FILTER is commonly used to select the data that you want; or, conversely, to filter out (remove) the data you don’t want.</p>
   </section>
   
   <section>
   <title>Examples</title>
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)
</source>
   
   <p>In this example the condition states that if the third field equals 3, then include the tuple in relation X.</p>
<source>
X = FILTER A BY a3 == 3;

DUMP X;
(1,2,3)
(4,3,3)
(8,4,3)
</source>
   
<p>In this example the condition states that if the first field equals 8 or if the sum of fields a2 and a3 is not greater than the first field, then include the tuple in relation X.</p>
<source>
X = FILTER A BY (a1 == 8) OR (NOT (a2+a3 > a1));

DUMP X;
(4,2,1)
(8,3,4)
(7,2,5)
(8,4,3)
</source>

</section></section>
 
 <!-- =================================================================== -->  
   <section id="foreach">
   <title>FOREACH</title>
   <p>Generates data transformations based on columns of data.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias  = FOREACH { block | nested_block };</p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias </p>
            </td>
            <td>
               <p>The name of a relation (outer bag).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>block</p>
            </td>
            <td>
               <p>FOREACH…GENERATE block used with a relation (outer bag). Use this syntax:</p>
               <p></p>
               <p>alias = FOREACH alias GENERATE expression [AS schema] [expression [AS schema]….];</p>
               <p>See <a href="#schemas">Schemas</a></p>
               
            </td>
         </tr>
         <tr>
            <td>
               <p id="nested-block">nested_block</p>
            </td>
            <td>
               <p>Nested FOREACH...GENERATE block used with an inner bag. Use this syntax:</p>
               <p></p>
               <p>alias = FOREACH nested_alias {</p>
               <p>   alias = {nested_op | nested_exp}; [alias = {nested_op | nested_exp}; …]</p>
               <p>   GENERATE expression [AS schema] [expression [AS schema]….]</p>
               <p>};</p>
               <p></p>
               <p>Where:</p>
               <p>The nested block is enclosed in opening and closing brackets { … }. </p>
               <p>The GENERATE keyword must be the last statement within the nested block.</p>
               <p>See <a href="#schemas">Schemas</a></p>
               <p>Macros are NOT allowed inside a nested block.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>expression</p>
            </td>
            <td>
               <p>An expression.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>nested_alias</p>
            </td>
            <td>
               <p>The name of the inner bag.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>nested_op</p>
            </td>
            <td>
               <p>Allowed operations are CROSS, DISTINCT, FILTER, FOREACH, LIMIT, and ORDER BY. </p>
               <p>Note: FOREACH statements can be nested to two levels only. FOREACH statements that are nested to three or more levels will result in a grammar error.</p>
               <p>You can also perform projections within the nested block.</p>
               <p>For examples, see <a href="#nestedblock">Example: Nested Block</a>.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>nested_exp</p>
            </td>
            <td>
               <p>Any arbitrary, supported expression.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>AS</p>
            </td>
            <td>
               <p>Keyword</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>schema</p>
            </td>
            <td>
               <p>A schema using the AS keyword (see <a href="#schemas">Schemas</a>).</p>
               <ul>
                  <li>
                     <p>If the <a href="#flatten">FLATTEN</a> operator is used, enclose the schema in parentheses.</p>
                  </li>
                  <li>
                     <p>If the FLATTEN operator is not used, don't enclose the schema in parentheses.</p>
                  </li>
               </ul>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the FOREACH…GENERATE operation to work with columns of data (if you want to work with tuples or rows of data, use the FILTER operation).</p>
  
   <p>FOREACH...GENERATE works with relations (outer bags) as well as inner bags:</p>
   <ul>
      <li>
         <p>If A is a relation (outer bag), a FOREACH statement could look like this.</p>
<source>
X = FOREACH A GENERATE f1;
</source>
      </li>
      <li>
         <p>If A is an inner bag, a FOREACH statement could look like this.</p>
  <source>
X = FOREACH B {
        S = FILTER A BY 'xyz';
        GENERATE COUNT (S.$0);
}
</source>
      </li>
   </ul>
   </section>
   
   <section id="projection">
   <title>Example: Projection</title>
   <p>In this example the asterisk (*) is used to project all fields from relation A to relation X. Relation A and X are identical.</p>
<source>
X = FOREACH A GENERATE *;

DUMP X;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)
</source>
   
   <p>In this example two fields from relation A are projected to form relation X. </p>
<source>
X = FOREACH A GENERATE a1, a2;

DUMP X;
(1,2)
(4,2)
(8,3)
(4,3)
(7,2)
(8,4)
</source>
   
   </section>
   
   <section>
   <title>Example: Nested Projection</title>


   <p>In this example if one of the fields in the input relation is a tuple, bag or map, we can perform a projection on that field (using a dereference operator).</p>
<source>
X = FOREACH C GENERATE group, B.b2;

DUMP X;
(1,{(3)})
(4,{(6),(9)})
(8,{(9)})
</source>
   
   <p>In this example multiple nested columns are retained.</p>
<source>
X = FOREACH C GENERATE group, A.(a1, a2);

DUMP X;
(1,{(1,2)})
(4,{(4,2),(4,3)})
(8,{(8,3),(8,4)})
</source>
   </section>
   
   <section>
   <title>Example: Schema</title>
   <p>In this example two fields in relation A are summed to form relation X. A schema is defined for the projected field.</p>
<source>
X = FOREACH A GENERATE a1+a2 AS f1:int;

DESCRIBE X;
X: {f1: int}

DUMP X;
(3)
(6)
(11)
(7)
(9)
(12)

Y = FILTER X BY f1 > 10;

DUMP Y;
(11)
(12)
</source>
   
   </section>
   
   <section>
   <title>Example: Applying Functions</title>
   <p>In this example the built in function SUM() is used to sum a set of numbers in a bag.</p>
<source>
X = FOREACH C GENERATE group, SUM (A.a1);

DUMP X;
(1,1)
(4,8)
(8,16)
</source>
   
   </section>
   
   <section id="flatten-example">
   <title>Example: Flatten</title>
   <p>In this example the <a href="#flatten">FLATTEN</a> operator is used to eliminate nesting. </p>
<source>
X = FOREACH C GENERATE group, FLATTEN(A);

DUMP X;
(1,1,2,3)
(4,4,2,1)
(4,4,3,3)
(8,8,3,4)
(8,8,4,3)
</source>
   
   
   <p>Another FLATTEN example.</p>
<source>
X = FOREACH C GENERATE GROUP, FLATTEN(A.a3);

DUMP X;
(1,3)
(4,1)
(4,3)
(8,4)
(8,3)
</source>
   
   <p>Another FLATTEN example. Note that for the group '4' in C, there are two tuples in each bag. Thus, when both bags are flattened, the cross product of these tuples is returned; that is, tuples (4, 2, 6), (4, 3, 6), (4, 2, 9), and (4, 3, 9).</p>
<source>
X = FOREACH C GENERATE FLATTEN(A.(a1, a2)), FLATTEN(B.$1);

DUMP X;
(1,2,3)
(4,2,6)
(4,2,9)
(4,3,6)
(4,3,9)
(8,3,9)
(8,4,9)
</source>

   <p>Another FLATTEN example. Here, relations A and B both have a column x. When forming relation E,  you need to use the :: operator to identify which column x to use - either relation A column x (A::x) or relation B column x (B::x). This example uses relation A column x (A::x).</p>
<source>
A = LOAD 'data' AS (x, y);
B = LOAD 'data' AS (x, z);
C = COGROUP A BY x, B BY x;
D = FOREACH C GENERATE flatten(A), flatten(B);
E = GROUP D BY A::x;
……
</source>

   <p>A FLATTEN example on a map type. Here we load an integer and map (of integer values) into A. Then m gets
      flattened, and finally we are filtering the result to only include tuples where the value among the un-nested
      map entries was 5.</p>
<source>
A = LOAD 'data' AS (a:int, m:map[int]);
B = FOREACH A GENERATE a, FLATTEN(m);
C = FILTER B by m::value == 5;
……
</source>

   </section>
   
   <section id="nestedblock">
   <title>Example: Nested Block</title>
      <p>In this example a CROSS is performed within the nested block.</p>
<source>
user = load 'user' as (uid, age, gender, region);
session = load 'session' as (uid, region);
C = cogroup user by uid, session by uid;
D = foreach C {
    crossed = cross user, session;
    generate crossed;
}
dump D;  
</source>
<p>In this example FOREACH is nested to the second level.</p>
<source>
a = load '1.txt' as (a0, a1:chararray, a2:chararray); 
b = group a by a0; 
c = foreach b { 
    c0 = foreach a generate TOMAP(a1,a2); 
    generate c0; 
} 
dump c; 
</source>
<p>This example shows a CROSS and FOREACH nested to the second level.</p>
<source>
a = load '1.txt' as (a0, a1, a2); 
b = load '2.txt' as (b0, b1); 
c = cogroup a by a0, b by b0; 
d = foreach c { 
    d0 = cross a, b; 
    d1 = foreach d0 generate a1+b1; 
    generate d1; 
} 
dump d;
</source>
   <p>Suppose we have relations A and B. Note that relation B contains an inner bag.</p>
<source>
A = LOAD 'data' AS (url:chararray,outlink:chararray);

DUMP A;
(www.ccc.com,www.hjk.com)
(www.ddd.com,www.xyz.org)
(www.aaa.com,www.cvn.org)
(www.www.com,www.kpt.net)
(www.www.com,www.xyz.org)
(www.ddd.com,www.xyz.org)

B = GROUP A BY url;

DUMP B;
(www.aaa.com,{(www.aaa.com,www.cvn.org)})
(www.ccc.com,{(www.ccc.com,www.hjk.com)})
(www.ddd.com,{(www.ddd.com,www.xyz.org),(www.ddd.com,www.xyz.org)})
(www.www.com,{(www.www.com,www.kpt.net),(www.www.com,www.xyz.org)})
</source>
   
   <p>In this example we perform two of the operations allowed in a nested block, FILTER and DISTINCT. Note that the last statement in the nested block must be GENERATE. Also, note the use of projection (PA = FA.outlink;) to retrieve a field. DISTINCT can be applied to a subset of fields (as opposed to a relation) only within a nested block.</p>
<source>
X = FOREACH B {
        FA= FILTER A BY outlink == 'www.xyz.org';
        PA = FA.outlink;
        DA = DISTINCT PA;
        GENERATE group, COUNT(DA);
}

DUMP X;
(www.aaa.com,0)
(www.ccc.com,0)
(www.ddd.com,1)
(www.www.com,1)
</source>
   
</section></section>
   
 
 
            
            
<!-- =================================================================== -->
   <section id="group">
   <title>GROUP</title>
   <p>Groups the data in one or more relations.</p>
   <p>Note: The GROUP and COGROUP operators are identical. Both operators work with one or more relations. 
   For readability GROUP is used in statements involving one relation and COGROUP is used in statements involving two or more relations. 
   You can COGROUP up to but no more than 127 relations at a time.</p>   
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = GROUP alias { ALL | BY expression} [, alias ALL | BY expression …]  [USING 'collected' | 'merge'] [PARTITION BY partitioner] [PARALLEL n];</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias </p>
            </td>
            <td>
               <p>The name of a relation.</p>
               <p>You can COGROUP up to but no more than 127 relations at a time.</p>
            </td>
         </tr>
                  <tr>
            <td>
               <p>ALL</p>
            </td>
            <td>
               <p>Keyword. Use ALL if you want all tuples to go to a single group; for example, when doing aggregates across entire relations.</p>
               <p>B = GROUP A ALL;</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>BY</p>
            </td>
            <td>
               <p>Keyword. Use this clause to group the relation by field, tuple or expression.</p>
               <p>B = GROUP A BY f1;</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>expression</p>
            </td>
            <td>
               <p>A tuple expression. This is the group key or key field. If the result of the tuple expression is a single field, the key will be the value of the first field rather than a tuple with one field. To group using multiple keys, enclose the keys in parentheses:</p>
               <p>B = GROUP A BY (key1,key2);</p>
            </td>
         </tr>
         
         <tr>
            <td>
               <p>USING</p>
            </td>
            <td>
               <p>Keyword</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>'collected'</p>
            </td>
  
            <td>

            <p>Use the ‘collected’ clause with the GROUP operation (works with one relation only).</p>
            <p>The following conditions apply:</p>
               <ul>
               <li>
               <p>The loader must implement the {CollectableLoader} interface.</p>
               </li>
               <li>
               <p>Data must be sorted on the group key.</p>
               </li>
               </ul>
               <p></p>
               <p>If your data and loaders satisfy these conditions, use the ‘collected’ clause to perform an optimized version of GROUP; 
            the operation will execute on the map side and avoid running the reduce phase.</p>
               <p></p>
            </td>
         </tr>    
         
         <tr>
            <td>
               <p>'merge'</p>
            </td>
            <td>

               <p>Use the ‘merge’ clause with the COGROUP operation (works with two or more relations only).</p>
               <p>The following conditions apply:</p>
               <ul>
               <li>
               <p>No other operations can be done between the LOAD and COGROUP statements.</p>
               </li>
               <li>
               <p>Data must be sorted on the COGROUP key for all tables in ascending (ASC) order.</p>
               </li> 
                <li>
               <p>Nulls are considered smaller than everything. If data contains null keys, they should occur before anything else.</p>
               </li>
               <li>
               <p>Left-most loader must implement the {CollectableLoader} interface as well as {OrderedLoadFunc} interface.</p>
               </li>
               <li>
               <p>All other loaders must implement IndexableLoadFunc.</p>
               </li>
                <li>
               <p>Type information must be provided in the schema for all the loaders.</p>
               </li>              
               </ul>
               <p></p>
               <p>If your data and loaders satisfy these conditions, use the ‘merge’ clause to perform an optimized version of COGROUP; 
               the operation will execute on the map side and avoid running the reduce phase.</p>
            </td>
            
         </tr>     
         
     <tr>      
         <td >
               <p id="partition-by-group">PARTITION BY partitioner</p>
            </td>
            <td>
             <p>Use this feature to specify the Hadoop Partitioner. The partitioner controls the partitioning of the keys of the intermediate map-outputs. </p>
             <ul>
             <li>
             <p>For more details, see <a href="http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html">http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html</a></p>
             </li>
             <li>
             <p>For usage, see <a href="#partitionby">Example: PARTITION BY</a></p>
             </li>
             </ul>
         </td>
     </tr> 

         <tr>
            <td>
               <p>PARALLEL n</p>
            </td>
            <td>
               <p>Increase the parallelism of a job by specifying the number of reduce tasks, n.</p>
               <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>The GROUP operator groups together tuples that have the same group key (key field). The key field will be a tuple if the group key has more than one field, otherwise it will be the same type as that of the group key. The result of a GROUP operation is a relation that includes one tuple per group. This tuple contains two fields: </p>
   <ul>
      <li>
         <p>The first field is named "group" (do not confuse this with the GROUP operator) and is the same type as the group key.</p>
      </li>
      <li>
         <p>The second field takes the name of the original relation and is type bag.</p>
     </li>
     <li>
         <p>The names of both fields are generated by the system as shown in the example below.</p>
      </li>
   </ul>
   <p></p>
   
   <p>Note the following about the GROUP/COGROUP and JOIN operators:</p>
      <ul>
      <li>
         <p>The GROUP and JOIN operators perform similar functions. GROUP creates a nested set of output tuples while JOIN creates a flat set of output tuples.</p>
      </li>
      <li>
         <p>The GROUP/COGROUP and JOIN operators handle null values differently (see <a href="#nulls_group">Nulls and GROUP/COGROUP Operators</a>).</p>
     </li>
   </ul>
   
   </section>
   
   <section>
   <title>Example</title>
<p>Suppose we have relation A.</p>

<source>
A = load 'student' AS (name:chararray,age:int,gpa:float);

DESCRIBE A;
A: {name: chararray,age: int,gpa: float}

DUMP A;
(John,18,4.0F)
(Mary,19,3.8F)
(Bill,20,3.9F)
(Joe,18,3.8F)
</source>
   
   <p>Now, suppose we group relation A on field "age" to form relation B. We can use the DESCRIBE and ILLUSTRATE operators to examine the structure of relation B. Relation B has two fields. The first field is named "group" and is type int, the same as field "age" in relation A. The second field is named "A" after relation A and is type bag.</p>
<source>
B = GROUP A BY age;

DESCRIBE B;
B: {group: int, A: {name: chararray,age: int,gpa: float}}

ILLUSTRATE B;
<em>etc ... </em>
----------------------------------------------------------------------
| B     | group: int | A: bag({name: chararray,age: int,gpa: float}) |
----------------------------------------------------------------------
|       | 18         | {(John, 18, 4.0), (Joe, 18, 3.8)}             |
|       | 20         | {(Bill, 20, 3.9)}                             |
----------------------------------------------------------------------

DUMP B;
(18,{(John,18,4.0F),(Joe,18,3.8F)})
(19,{(Mary,19,3.8F)})
(20,{(Bill,20,3.9F)})
</source>
   

   <p>Continuing on, as shown in these FOREACH statements, we can refer to the fields in relation B by names "group" and "A" or by positional notation.</p>

<source>
C = FOREACH B GENERATE group, COUNT(A);

DUMP C;
(18,2L)
(19,1L)
(20,1L)

C = FOREACH B GENERATE $0, $1.name;

DUMP C;
(18,{(John),(Joe)})
(19,{(Mary)})
(20,{(Bill)})
</source>
</section>
   
   <section>
   <title>Example</title>
   
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' as (f1:chararray, f2:int, f3:int);

DUMP A;
(r1,1,2)
(r2,2,1)
(r3,2,8)
(r4,4,4)
</source>
   
<p>In this example the tuples are grouped using an expression, f2*f3.</p>
<source>
X = GROUP A BY f2*f3;

DUMP X;
(2,{(r1,1,2),(r2,2,1)})
(16,{(r3,2,8),(r4,4,4)})
</source>
</section>

   <section>
   <title>Example</title>
   <p>Suppose we have two relations, A and B.</p>
<source>
A = LOAD 'data1' AS (owner:chararray,pet:chararray);

DUMP A;
(Alice,turtle)
(Alice,goldfish)
(Alice,cat)
(Bob,dog)
(Bob,cat)

B = LOAD 'data2' AS (friend1:chararray,friend2:chararray);

DUMP B;
(Cindy,Alice)
(Mark,Alice)
(Paul,Bob)
(Paul,Jane)
</source>
   
   <p>In this example tuples are co-grouped using field “owner” from relation A and field “friend2” from relation B as the key fields. The DESCRIBE operator shows the schema for relation X, which has three fields, "group", "A" and "B" (see the GROUP operator for information about the field names).</p>
<source>
X = COGROUP A BY owner, B BY friend2;

DESCRIBE X;
X: {group: chararray,A: {owner: chararray,pet: chararray},B: {friend1: chararray,friend2: chararray}}
</source>
   
   <p>Relation X looks like this. A tuple is created for each unique key field. The tuple includes the key field and two bags. The first bag is the tuples from the first relation with the matching key field. The second bag is the tuples from the second relation with the matching key field. If no tuples match the key field, the bag is empty.</p>
<source>
(Alice,{(Alice,turtle),(Alice,goldfish),(Alice,cat)},{(Cindy,Alice),(Mark,Alice)})
(Bob,{(Bob,dog),(Bob,cat)},{(Paul,Bob)})
(Jane,{},{(Paul,Jane)})
</source>
   </section>
   
   <section>
   <title>Example</title>
<p>This example shows how to group using multiple keys.</p>   
<source>
 A = LOAD 'allresults' USING PigStorage() AS (tcid:int, tpid:int, date:chararray, result:chararray, tsid:int, tag:chararray);
 B = GROUP A BY (tcid, tpid); 
</source>
    </section>   
     
   <section id="partitionby">
   <title>Example: PARTITION BY</title>
<p>To use the Hadoop Partitioner add PARTITION BY clause to the appropriate operator: </p>
<source>
A = LOAD 'input_data'; 
B = GROUP A BY $0 PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner PARALLEL 2;
</source>
<p>Here is the code for SimpleCustomPartitioner:</p>
<source>
public class SimpleCustomPartitioner extends Partitioner &lt;PigNullableWritable, Writable&gt; { 
     //@Override 
    public int getPartition(PigNullableWritable key, Writable value, int numPartitions) { 
        if(key.getValueAsPigType() instanceof Integer) { 
            int ret = (((Integer)key.getValueAsPigType()).intValue() % numPartitions); 
            return ret; 
       } 
       else { 
            return (key.hashCode()) % numPartitions; 
        } 
    } 
}
</source>
   </section>
   </section>
   
   <!-- =================================================================== -->    
   <section id="import">
   <title>IMPORT</title>
   
   <p>See <a href="cont.html#import-macros">IMPORT (macros)</a></p>

   </section>
   
<!-- =========================================================================== -->     
   
   <section id="join-inner">
   <title>JOIN (inner) </title>
   <p>Performs an inner join of two or more relations based on common field values.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = JOIN alias BY {expression|'('expression [, expression …]')'} (, alias BY {expression|'('expression [, expression …]')'} …) [USING 'replicated' | 'bloom' | 'skewed' | 'merge' | 'merge-sparse'] [PARTITION BY partitioner] [PARALLEL n];  </p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>BY</p>
            </td>
            <td>
               <p>Keyword</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>expression</p>
            </td>
            <td>
               <p>A field expression.</p>
               <p>Example: X = JOIN A BY fieldA, B BY fieldB, C BY fieldC;</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>USING</p>
            </td>
            <td>
               <p>Keyword</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>'replicated'</p>
            </td>
            <td>
               <p>Use to perform replicated joins (see <a href="perf.html#replicated-joins">Replicated Joins</a>).</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>'bloom'</p>
            </td>
            <td>
               <p>Use to perform bloom joins (see <a href="perf.html#bloom-joins">Bloom Joins</a>).</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>'skewed'</p>
            </td>
            <td>
               <p>Use to perform skewed joins (see <a href="perf.html#skewed-joins">Skewed Joins</a>).</p>
            </td>
         </tr>
         
          <tr>
            <td>
               <p>'merge'</p>
            </td>
            <td>
               <p>Use to perform merge joins (see <a href="perf.html#merge-joins">Merge Joins</a>).</p>
            </td>
         </tr>
         
          <tr>
            <td>
               <p>'merge-sparse'</p>
            </td>
            <td>
               <p>Use to perform merge-sparse joins (see <a href="perf.html#merge-sparse-joins">Merge-Sparse Joins</a>).</p>
            </td>
         </tr>         
         
              <tr>      
         <td>
               <p id="partition-by-join-inner">PARTITION BY partitioner</p>
            </td>
            <td>
             <p>Use this feature to specify the Hadoop Partitioner. The partitioner controls the partitioning of the keys of the intermediate map-outputs. </p>
             <ul>
             <li>
             <p>For more details, see <a href="http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html">http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html</a></p>             </li>
             <li>
              <p>For usage, see <a href="#partitionby">Example: PARTITION BY</a></p>
             </li>
             </ul>
             <p></p>
             <p>This feature CANNOT be used with skewed joins.</p>
         </td>
     </tr> 

         <tr>
            <td>
               <p>PARALLEL n</p>
            </td>
            <td>
               <p>Increase the parallelism of a job by specifying the number of reduce tasks, n. </p>
               <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the JOIN operator to perform an inner equijoin of two or more relations based on common field values. 
   Inner joins ignore null keys, so it makes sense to filter them out before the join.</p>
   
   <p>Note the following about the GROUP/COGROUP and JOIN operators:</p>
      <ul>
      <li>
         <p>The GROUP and JOIN operators perform similar functions. GROUP creates a nested set of output tuples while JOIN creates a flat set of output tuples.</p>
      </li>
      <li>
         <p>The GROUP/COGROUP and JOIN operators handle null values differently (see <a href="#nulls_join">Nulls and JOIN Operator</a>).</p>
     </li>
   </ul>
   <p></p>
   <p id="self-joins"><strong>Self Joins</strong></p>
   <p>To perform self joins in Pig, load the same data multiple times, under different aliases, to avoid naming conflicts.</p>  
   <p>In this example the same data is loaded twice using aliases A and B.</p>
   <source>
grunt> A = load 'mydata';
grunt> B = load 'mydata';
grunt> C = join A by $0, B by $0;
grunt> explain C;
</source>
    </section>

   <section>
   <title>Example</title>
   <p>Suppose we have relations A and B.</p>
<source>
A = LOAD 'data1' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)

B = LOAD 'data2' AS (b1:int,b2:int);

DUMP B;
(2,4)
(8,9)
(1,3)
(2,7)
(2,9)
(4,6)
(4,9)
</source>
   
   <p>In this example relations A and B are joined by their first fields.</p>
<source>
X = JOIN A BY a1, B BY b1;

DUMP X;
(1,2,3,1,3)
(4,2,1,4,6)
(4,3,3,4,6)
(4,2,1,4,9)
(4,3,3,4,9)
(8,3,4,8,9)
(8,4,3,8,9)
</source>
   
 </section>
 </section>
   
<!-- =========================================================================== -->  

<section id="join-outer">
   <title>JOIN (outer) </title>
   <p>Performs an outer join of two relations based on common field values.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = JOIN left-alias BY left-alias-column [LEFT|RIGHT|FULL] [OUTER], right-alias BY right-alias-column 
               [USING 'replicated' | 'bloom' | 'skewed' | 'merge'] [PARTITION BY partitioner] [PARALLEL n];  </p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Terms</title>

   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation. Applies to alias, left-alias and right-alias.</p>
            </td>
        </tr>
        <tr>
            <td>
               <p>alias-column</p>
            </td>
            <td>
               <p>The name of the join column for the corresponding relation. Applies to left-alias-column and right-alias-column.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>BY</p>
            </td>
            <td>
               <p>Keyword</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>LEFT</p>
            </td>
            <td>
               <p>Left outer join.</p>
            </td>
         </tr>
         
         <tr>
            <td>
               <p>RIGHT</p>
            </td>
            <td>
               <p>Right outer join.</p>
            </td>
         </tr>
         
              <tr>
            <td>
               <p>FULL</p>
            </td>
            <td>
               <p>Full outer join.</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>OUTER</p>
            </td>
            <td>
               <p>(Optional) Keyword </p>
            </td>
         </tr>

         <tr>
            <td>
               <p>USING</p>
            </td>
            <td>
               <p>Keyword</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>'replicated'</p>
            </td>
            <td>
               <p>Use to perform replicated joins (see <a href="perf.html#replicated-joins">Replicated Joins</a>).</p>
               <p>Only left outer join is supported for replicated joins.</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>'bloom'</p>
            </td>
            <td>
               <p>Use to perform bloom joins (see <a href="perf.html#bloom-joins">Bloom Joins</a>).</p>
               <p>Full outer join is not supported for bloom joins.</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>'skewed'</p>
            </td>
            <td>
               <p>Use to perform skewed joins (see <a href="perf.html#skewed-joins">Skewed Joins</a>).</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>'merge'</p>
            </td>
            <td>
               <p>Use to perform merge joins (see <a href="perf.html#merge-joins">Merge Joins</a>).</p>
            </td>
         </tr>
         
         
      <tr>      
         <td>
               <p id="partition-by-join-outer">PARTITION BY partitioner</p>
            </td>
            <td>
             <p>Use this feature to specify the Hadoop Partitioner. The partitioner controls the partitioning of the keys of the intermediate map-outputs. </p>
             <ul>
             <li>
             <p>For more details, see <a href="http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html">http://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapred/Partitioner.html</a></p>
             </li>
             <li>
              <p>For usage, see <a href="#partitionby">Example: PARTITION BY</a></p>
             </li>
             </ul>
             <p></p>
             <p>This feature CANNOT be used with skewed joins.</p>
         </td>
     </tr> 

         <tr>
            <td>
               <p>PARALLEL n</p>
            </td>
            <td>
               <p>Increase the parallelism of a job by specifying the number of reduce tasks, n. </p>
               <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
            </td>
         </tr>
    
   </table>

</section>
   
 <section>
   <title>Usage</title>
   <p>Use the JOIN operator with the corresponding keywords to perform left, right, or full outer joins.  
   The keyword OUTER is optional for outer joins; the keywords LEFT, RIGHT and FULL will imply left outer, right outer and full outer joins respectively when OUTER is omitted. 
   The Pig Latin syntax closely adheres to the SQL standard.</p>
    <p>
   Please note the following:
    </p>
    <ul>
		<li>
			<p>Outer joins will only work provided the relations which need to produce nulls (in the case of non-matching keys) have schemas.</p>
		</li>
		<li>
			<p>Outer joins will only work for two-way joins; to perform a multi-way outer join, you will need to perform multiple two-way outer join statements.</p>
		</li>
    </ul>
</section>

   <section>
   <title>Examples</title>
<p>This example shows a left outer join.</p>
<source>
A = LOAD 'a.txt' AS (n:chararray, a:int); 
B = LOAD 'b.txt' AS (n:chararray, m:chararray);
C = JOIN A by $0 LEFT OUTER, B BY $0;
</source>

<p>This example shows a full outer join.</p>
<source>
A = LOAD 'a.txt' AS (n:chararray, a:int); 
B = LOAD 'b.txt' AS (n:chararray, m:chararray);
C = JOIN A BY $0 FULL, B BY $0;
</source>

<p>This example shows a replicated left outer join.</p>
<source>
A = LOAD 'large';
B = LOAD 'tiny';
C= JOIN A BY $0 LEFT, B BY $0 USING 'replicated';
</source>

<p>This example shows a bloom right outer join.</p>
<source>
A = LOAD 'large';
B = LOAD 'small';
C= JOIN A BY $0 RIGHT, B BY $0 USING 'bloom';
</source>

<p>This example shows a skewed full outer join.</p>
<source>
A = LOAD 'studenttab' as (name, age, gpa);
B = LOAD 'votertab' as (name, age, registration, contribution);
C = JOIN A BY name FULL, B BY name USING 'skewed';
</source>
</section>
</section>  
  
<!-- =========================================================================== -->

   <section id="limit">
   <title>LIMIT </title>
   <p>Limits the number of output tuples.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = LIMIT alias  n;</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>n</p>
            </td>
            <td>
               <p>The number of output tuples, either:</p>
               <ul>
					<li>a constant (for example, 3)</li>
					<li>a scalar used in an expression (for example, c.sum/100)</li>
				</ul>
				<p></p>
				<p>Note: The expression can consist of constants or scalars; it cannot contain any columns from the input relation.</p>
				<p>Note: Using a scalar instead of a constant in LIMIT automatically disables most optimizations (only push-before-foreach is performed).</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the LIMIT operator to limit the number of output tuples.</p> 
   
   <p>If the specified number of output tuples is equal to or exceeds the number of tuples in the relation, all tuples in the relation are returned.</p>
   <p>If the specified number of output tuples is less than the number of tuples in the relation, then n tuples are returned. There is no guarantee which n tuples will be returned, and the tuples that are returned can change from one run to the next. A particular set of tuples can be requested using the ORDER operator followed by LIMIT.</p>
   <p>Note: The LIMIT operator allows Pig to avoid processing all tuples in a relation. In most cases a query that uses LIMIT will run more efficiently than an identical query that does not use LIMIT. It is always a good idea to use limit if you can.</p>
   </section>
   
   <section>
   <title>Examples</title>
   <p>In this example the limit is expressed as a scalar.</p>
 <source>
a = load 'a.txt';
b = group a all;
c = foreach b generate COUNT(a) as sum;
d = order a by $0;
e = limit d c.sum/100;
</source>
<p></p>
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)
</source>
   
   <p>In this example output is limited to 3 tuples. Note that there is no guarantee which three tuples will be output.</p>
<source>
X = LIMIT A 3;

DUMP X;
(1,2,3)
(4,3,3)
(7,2,5)
</source>
   
   <p>In this example the ORDER operator is used to order the tuples and the LIMIT operator is used to output the first three tuples.</p>
<source>
B = ORDER A BY a1 DESC, a2 ASC;

DUMP B;
(8,3,4) 
(8,4,3) 
(7,2,5) 
(4,2,1)
(4,3,3)
(1,2,3)

X = LIMIT B 3;

DUMP X;
(8,3,4)
(8,4,3) 
(7,2,5) 
</source>
   </section></section>
   
   <!-- =========================================================================== -->
   
   <section id="load">
   <title>LOAD </title>
   <p>Loads data from the file system.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>LOAD 'data' [USING function] [AS schema];        </p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>'data'</p>
            </td>
            <td>
               <p>The name of the file or directory, in single quotes.</p>
               <p>If you specify a directory name, all the files in the directory are loaded. </p>
               <p>You can use Hadoop globbing to specify files at the file system or directory levels (see Hadoop
                  <a href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/fs/FileSystem.html#globStatus(org.apache.hadoop.fs.Path)">globStatus</a> for details on globbing syntax).</p>
                  <p id="load-glob"><strong>Note:</strong> Pig uses Hadoop globbing so the functionality is IDENTICAL. However, when you run from the command line using the Hadoop fs command (rather than the Pig LOAD operator), the Unix shell may do some of the substitutions; this could alter the outcome giving the impression that globbing works differently for Pig and Hadoop. For example:</p>
                <ul>
					<li>This works <br></br>hadoop fs -ls /mydata/20110423{00,01,02,03,04,05,06,07,08,09,{10..23}}00//<strong>part </strong></li>
					<li>This does not work <br></br>LOAD '/mydata/20110423{00,01,02,03,04,05,06,07,08,09,{10..23}}00//<strong>part </strong>'</li>
				</ul>
            </td>
         </tr>
         <tr>
            <td>
               <p>USING</p>
            </td>
            <td>
               <p>Keyword. </p>
               <p>If the USING clause is omitted, the default load function PigStorage is used. </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>function</p>
            </td>
            <td>
               <p>The load function. </p>
               <ul>
                  <li>
                  
                  
                     <p>You can use a built in function (see <a href="func.html#load-store-functions">Load/Store Functions</a>). PigStorage is the default load function and does not need to be specified (simply omit the USING clause).</p>
                  </li>
                  <li>
                     <p>You can write your own load function  
                     if your data is in a format that cannot be processed by the built in functions (see <a href="udf.html">User Defined Functions</a>).</p>
                  </li>
               </ul>
            </td>
         </tr>
         <tr>
            <td>
               <p>AS</p>
            </td>
            <td>
               <p>Keyword. </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>schema</p>
            </td>
            <td>
               <p>A schema using the AS keyword, enclosed in parentheses (see <a href="#schemas">Schemas</a>).</p>
               <p>The loader produces the data of the type specified by the schema. If the data does not conform to the schema, depending on the loader, either a null value or an error is generated.</p>
               <p>Note: For performance reasons the loader may not immediately convert the data to the specified format; however, you can still operate on the data assuming the specified type.</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the LOAD operator to load data from the file system. </p></section>
   
   <section>
   <title>Examples</title>
   <p>Suppose we have a data file called myfile.txt. The fields are tab-delimited. The records are newline-separated.</p>
<source>
1 2 3
4 2 1
8 3 4
</source>
   
   <p>In this example the default load function, PigStorage, loads data from myfile.txt to form relation A. The two LOAD statements are equivalent. Note that, because no schema is specified, the fields are not named and all fields default to type bytearray. </p>
<source>
A = LOAD 'myfile.txt';

A = LOAD 'myfile.txt' USING PigStorage('\t');

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
</source>
   
   <p>In this example a schema is specified using the AS keyword. The two LOAD statements are equivalent. You can use the DESCRIBE and ILLUSTRATE operators to view the schema. </p>
<source>
A = LOAD 'myfile.txt' AS (f1:int, f2:int, f3:int);

A = LOAD 'myfile.txt' USING PigStorage('\t') AS (f1:int, f2:int, f3:int);

DESCRIBE A;
a: {f1: int,f2: int,f3: int}

ILLUSTRATE A;
---------------------------------------------------------
| a     | f1: bytearray | f2: bytearray | f3: bytearray |
---------------------------------------------------------
|       | 4             | 2             | 1             |
---------------------------------------------------------

---------------------------------------
| a     | f1: int | f2: int | f3: int |
---------------------------------------
|       | 4       | 2       | 1       |
---------------------------------------
</source>
   <p>
      For examples of how to specify more complex schemas for use with the LOAD operator, see <a href="#schema-complex">Schemas for Complex Data Types</a> and <a href="#schema-multi">Schemas for Multiple Types</a>.
      </p></section></section>
      

<!-- =================================================================== -->
<section id="native">
   <title>NATIVE</title>
   <p>Executes native MapReduce/Tez jobs inside a Pig script.</p>
   
   <section>
   <title>Syntax</title>
      <table>
      <tr> 
            <td>
               <p>alias1 = NATIVE 'native.jar' STORE alias2 INTO
'inputLocation' USING storeFunc LOAD 'outputLocation' USING loadFunc AS schema [`params, ... `];</p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias1, alias2</p>
            </td>
            <td>
               <p>The names of relations.</p>
            </td>
     </tr>
     <tr>
            <td>
               <p>native.jar</p>
            </td>
            <td>
                <p>The jar file containing MapReduce or Tez program (enclosed in single quotes).</p>
               <p>You can specify any MapReduce/Tez jar file that can be run through the <code>hadoop jar native.jar params</code> command. </p>
               <p>The values for inputLocation and outputLocation can be passed in the params. </p>
            </td>
     </tr>

      <tr>
            <td>
               <p>STORE ... INTO ... USING</p>
            </td>
            <td>
               <p>See <a href="basic.html#store">STORE</a></p>
               <p>Store alias2 into the inputLocation using storeFunc, which is then used by the MapReduce/Tez job to read its data.</p>
                
            </td>
     </tr>

      <tr>
            <td>
               <p>LOAD ... USING ... AS </p>
            </td>
            <td>
               <p>See <a href="basic.html#load">LOAD</a></p>
               <p>After running native.jar's MapReduce/Tez job, load back the data from outputLocation into alias1 using loadFunc as schema.</p>
            </td>
     </tr>

      <tr>
            <td>
               <p>`params, ...`</p>
            </td>
            <td>
               <p>Extra parameters required for the MapReduce/Tez job (enclosed in backticks). </p>
            </td>
     </tr>
       
</table>
</section>

<section>
<title>Usage</title>
<p>Use the NATIVE operator to run native MapReduce/Tez jobs from inside a Pig script.</p>

<p>The input and output locations for the MapReduce/Tez program are conveyed to Pig using the STORE/LOAD clauses.
Pig, however, does not pass this information (nor require that this information be passed) to the MapReduce/Tez program.
If you want to pass the input and output locations to the MapReduce/Tez program you can use the params clause or you can hardcode the locations in the MapReduce/Tez program.</p>
</section>

<section>
<title>Example</title>
<p>This example demonstrates how to run the wordcount MapReduce program from Pig.
Note that the files specified as input and output locations in the NATIVE statement will NOT be deleted by Pig automatically. You will need to delete them manually. </p>
<source>
A = LOAD 'WordcountInput.txt';
B = NATIVE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
    AS (word:chararray, count: int) `org.myorg.WordCount inputDir outputDir`;
</source>
</section>

</section>
 
 <!-- =================================================================== -->     
      <section id="order-by">
      <title>ORDER BY</title>
   <p>Sorts a relation based on one or more fields.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = ORDER alias BY { * [ASC|DESC] | field_alias [ASC|DESC] [, field_alias [ASC|DESC] …] } [PARALLEL n];</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>*</p>
            </td>
            <td>
               <p>The designator for a tuple.</p>
            </td>
         </tr>
                  <tr>
            <td>
               <p>field_alias</p>
            </td>
            <td>
               <p>A field in the relation. The field must be a simple type.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>ASC</p>
            </td>
            <td>
               <p>Sort in ascending order.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>DESC</p>
            </td>
            <td>
               <p>Sort in descending order.</p>
            </td>
         </tr>

         <tr>
            <td>
               <p>PARALLEL n</p>
            </td>
            <td>
               <p>Increase the parallelism of a job by specifying the number of reduce tasks, n.</p>
               <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p><strong>Note:</strong> ORDER BY is NOT stable; if multiple records have the same ORDER BY key, the order in which these records are returned is not defined and is not guaranteed to be the same from one run to the next.</p>
   
   <p>In Pig, relations are unordered (see <a href="#relations">Relations, Bags, Tuples, Fields</a>):</p>
   <ul>
      <li>
         <p>If you order relation A to produce relation X (X = ORDER A BY * DESC;) relations A and X still contain the same data. </p>
      </li>
      <li>
         <p>If you retrieve relation X (DUMP X;) the data is guaranteed to be in the order you specified (descending).</p>
      </li>
      <li>
         <p>However, if you further process relation X (Y = FILTER X BY $0 &gt; 1;) there is no guarantee that the data will be processed in the order you originally specified (descending).</p>
      </li>
   </ul>
   <p></p>
      <p>Pig currently supports ordering on fields with simple types or by tuple designator (*). You cannot order on fields with complex types or by expressions. </p>
     <source>
A = LOAD 'mydata' AS (x: int, y: map[]);     
B = ORDER A BY x; -- this is allowed because x is a simple type
B = ORDER A BY y; -- this is not allowed because y is a complex type
B = ORDER A BY y#'id'; -- this is not allowed because y#'id' is an expression
</source> 
   </section>

   <section>
   <title>Examples</title>
   <p>Suppose we have relation A.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)
</source>
   
   <p>In this example relation A is sorted by the third field, a3, in descending order. Note that the order of the three tuples ending in 3 can vary.</p>
<source>
X = ORDER A BY a3 DESC;

DUMP X;
(7,2,5)
(8,3,4)
(1,2,3)
(4,3,3)
(8,4,3)
(4,2,1)
</source>
   
   </section></section>
    <!-- =================================================================== -->     
    <section id="rank">
        <title>RANK</title>
        <p>Returns each tuple with the rank within a relation.</p>
        
        <section>
            <title>Syntax</title>
            <table>
                <tr> 
                    <td>
                        <p>alias = RANK alias [ BY { * [ASC|DESC] | field_alias [ASC|DESC] [, field_alias [ASC|DESC] …] } [DENSE] ];</p>
                    </td>
                </tr> 
            </table>
        </section>
    
        
        <section>
            <title>Terms</title>
            <table>
                <tr>
                    <td>
                        <p>alias</p>
                    </td>
                    <td>
                        <p>The name of a relation.</p>
                    </td>
                </tr>
                <tr>
                    <td>
                        <p>*</p>
                    </td>
                    <td>
                        <p>The designator for a tuple.</p>
                    </td>
                </tr>
                <tr>
                    <td>
                        <p>field_alias</p>
                    </td>
                    <td>
                        <p>A field in the relation. The field must be a simple type.</p>
                    </td>
                </tr>
                <tr>
                    <td>
                        <p>ASC</p>
                    </td>
                    <td>
                        <p>Sort in ascending order.</p>
                    </td>
                </tr>
                <tr>
                    <td>
                        <p>DESC</p>
                    </td>
                    <td>
                        <p>Sort in descending order.</p>
                    </td>
                </tr>
                
                <tr>
                    <td>
                        <p>DENSE</p>
                    </td>
                    <td>
                        <p>No gap in the ranking values. </p>
                    </td>
                </tr> 
            </table>
        </section>
        
        <section>
            <title>Usage</title>
            <p>When specifying no field to sort on, the RANK operator simply prepends a sequential value to each tuple.</p>
            <p>Otherwise, the RANK operator uses each field (or set of fields) to sort the relation. The rank of a tuple is one plus the number of tuples that sort ahead of it, so ties leave gaps in the ranking values unless DENSE is specified. If two or more tuples tie on the sorting field values, they will receive the same rank.</p>
            <p><strong>NOTE:</strong> When using the option <strong>DENSE</strong>, ties do not cause gaps in ranking values.</p>

        </section>  
        
        <section>
            <title>Examples</title>
            <p>Suppose we have relation A.</p>
            <source>
A = load 'data' AS (f1:chararray,f2:int,f3:chararray);
   
DUMP A;
(David,1,N)
(Tete,2,N)
(Ranjit,3,M)
(Ranjit,3,P)
(David,4,Q)
(David,4,Q)
(Jillian,8,Q)
(JaePak,7,Q)
(Michael,8,T)
(Jillian,8,Q)
(Jose,10,V)
            </source>
            <p>In this example, the RANK operator does not change the order of the relation and simply prepends to each tuple a sequential value.</p>
            <source>
B = rank A;

dump B;
(1,David,1,N)
(2,Tete,2,N)
(3,Ranjit,3,M)
(4,Ranjit,3,P)
(5,David,4,Q)
(6,David,4,Q)
(7,Jillian,8,Q)
(8,JaePak,7,Q)
(9,Michael,8,T)
(10,Jillian,8,Q)
(11,Jose,10,V)
            </source>
            
            <p>In this example, the RANK operator works with the f1 and f2 fields, each with a different sort order. RANK sorts the relation on these fields and 
                prepends the rank value to each tuple. The rank of a tuple is one plus the number of tuples that sort ahead of it. If two or more tuples tie on the sorting field values, they will receive the same rank.</p>
            <source>
C = rank A by f1 DESC, f2 ASC;
                                
dump C;
(1,Tete,2,N)
(2,Ranjit,3,M)
(2,Ranjit,3,P)
(4,Michael,8,T)
(5,Jose,10,V)
(6,Jillian,8,Q)
(6,Jillian,8,Q)
(8,JaePak,7,Q)
(9,David,1,N)
(10,David,4,Q)
(10,David,4,Q)                
            </source>
            
            <p>Same example as previous, but DENSE. In this case there are no gaps in ranking values.</p>
            <source>
C = rank A by f1 DESC, f2 ASC DENSE;

dump C;
(1,Tete,2,N)
(2,Ranjit,3,M)
(2,Ranjit,3,P)
(3,Michael,8,T)
(4,Jose,10,V)
(5,Jillian,8,Q)
(5,Jillian,8,Q)
(6,JaePak,7,Q)
(7,David,1,N)
(8,David,4,Q)
(8,David,4,Q)
            </source>
            
        </section>
    </section>


<!-- =========================================================================== -->
   <section id="sample">
   <title>SAMPLE</title>
   <p>Selects a random sample of data based on the specified sample size.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>SAMPLE alias size;</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>size</p>
            </td>
            <td>
               <p>Sample size, either</p>
               <ul>
               <li>a constant, range 0 to 1 (for example, enter 0.1 for 10%)</li>
                <li>a scalar used in an expression</li>
               </ul>
               <p></p>
               <p>Note: The expression can consist of constants or scalars; it cannot contain any columns from the input relation.</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
     <title>Usage</title>
     <p>Use the SAMPLE operator to select a random data sample with the stated sample size. 
     SAMPLE is a probabilistic operator; there is no guarantee that the exact same number of tuples will be returned for a particular sample size
     each time the operator is used.</p>
   </section>
   
   <section>
   <title>Example</title>
   <p>In this example relation X will contain 1% of the data in relation A.</p>
<source>
A = LOAD 'data' AS (f1:int,f2:int,f3:int);

X = SAMPLE A 0.01;
</source>
<p>In this example, a scalar expression is used (it will sample approximately 1000 records from the input).</p>
<source>
a = LOAD 'a.txt';
b = GROUP a ALL;
c = FOREACH b GENERATE COUNT_STAR(a) AS num_rows;
d = SAMPLE a (double)1000/c.num_rows;
</source>
   </section></section>  
   
   <section>
   <title>SPLIT</title>
   <p>Partitions a relation into two or more relations.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression …] [, alias OTHERWISE];</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>INTO</p>
            </td>
            <td>
               <p>Required keyword.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>IF</p>
            </td>
            <td>
               <p>Required keyword.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>expression</p>
            </td>
            <td>
               <p>An expression.</p>
            </td>
         </tr> 
           <tr>
            <td>
               <p>OTHERWISE</p>
            </td>
            <td>
               <p>Optional keyword. Designates a default relation.</p>
            </td>
         </tr>
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the SPLIT operator to partition the contents of a relation into two or more relations based on some expression. Depending on the conditions stated in the expression:</p>
   <ul>
      <li>
         <p>A tuple may be assigned to more than one relation.</p>
      </li>
      <li>
         <p>A tuple may not be assigned to any relation.</p>
         <p></p>
         <p></p>
      </li>
   </ul></section>
   
   <section>
   <title>Example</title>
   <p>In this example relation A is split into three relations, X, Y, and Z.</p>
<source>
A = LOAD 'data' AS (f1:int,f2:int,f3:int);

DUMP A;                
(1,2,3)
(4,5,6)
(7,8,9)        

SPLIT A INTO X IF f1&lt;7, Y IF f2==5, Z IF (f3&lt;6 OR f3&gt;6);

DUMP X;
(1,2,3)
(4,5,6)

DUMP Y;
(4,5,6)

DUMP Z;
(1,2,3)
(7,8,9)
</source>
</section>


      <section>
   <title>Example</title>
   <p>In this example, the SPLIT and FILTER statements are essentially equivalent. 
   However, because SPLIT is implemented as "split the data stream and then apply filters" the 
  SPLIT statement is more expensive than the FILTER statement because Pig needs to filter and store two data streams.</p>
   <source>
SPLIT input_var INTO output_var IF (field1 is not null), ignored_var IF (field1 is null);  
-- where ignored_var is not used elsewhere
   
output_var = FILTER input_var BY (field1 is not null);
   </source>
      </section>
   </section>


<!-- =========================================================================== -->   

   <section id="store">
   <title>STORE </title>
   <p>Stores or saves results to the file system.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>STORE alias INTO 'directory' [USING function];</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>INTO</p>
            </td>
            <td>
               <p>Required keyword.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>'directory'</p>
            </td>
            <td>
               <p>The name of the storage directory, in quotes. If the directory already exists, the STORE operation will fail.</p>
               <p></p>
               <p>The output data files, named part-nnnnn, are written to this directory. </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>USING</p>
            </td>
            <td>
               <p>Keyword. Use this clause to name the store function.</p>
               <p>If the USING clause is omitted, the default store function PigStorage is used.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>function</p>
            </td>
            <td>
               <p>The store function.</p>
               <ul>
                  <li>
                  
                  
                     <p>You can use a built in function (see the <a href="func.html#load-store-functions">Load/Store Functions</a>). PigStorage is the default store function and does not need to be specified (simply omit the USING clause).</p>
                  </li>
                  <li>
                     <p>You can write your own store function  
                     if your data is in a format that cannot be processed by the built in functions (see <a href="udf.html">User Defined Functions</a>).</p>
                  </li>
               </ul>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the STORE operator to run (execute) Pig Latin statements and save (persist) results to the file system. Use STORE for production scripts and batch mode processing.</p>
   
   <p>Note: To debug scripts during development, you can use <a href="test.html#dump">DUMP</a> to check intermediate results.</p>
</section>
   
   <section>
   <title>Examples</title>
   <p>In this example data is stored using PigStorage and the asterisk character (*) as the field delimiter.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)

STORE A INTO 'myoutput' USING PigStorage ('*');

CAT myoutput;
1*2*3
4*2*1
8*3*4
4*3*3
7*2*5
8*4*3
</source>
   
   <p>In this example, the CONCAT function is used to format the data before it is stored.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)
(8,3,4)
(4,3,3)
(7,2,5)
(8,4,3)

B = FOREACH A GENERATE CONCAT('a:',(chararray)a1), CONCAT('b:',(chararray)a2), CONCAT('c:',(chararray)a3);

DUMP B;
(a:1,b:2,c:3)
(a:4,b:2,c:1)
(a:8,b:3,c:4)
(a:4,b:3,c:3)
(a:7,b:2,c:5)
(a:8,b:4,c:3)

STORE B INTO 'myoutput' using PigStorage(',');

CAT myoutput;
a:1,b:2,c:3
a:4,b:2,c:1
a:8,b:3,c:4
a:4,b:3,c:3
a:7,b:2,c:5
a:8,b:4,c:3
</source>
   
   </section></section>
   
   
   <!-- =========================================================================== -->
   
   <section id="stream">
   <title>STREAM</title>
   <p>Sends data to an external script or program.</p>
      
      <section>
      <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = STREAM alias [, alias …] THROUGH {`command` | cmd_alias } [AS schema] ;</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>THROUGH</p>
            </td>
            <td>
               <p>Keyword. </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>`command`</p>
            </td>
            <td>
               <p>A command, including the arguments, enclosed in backticks (where a command is anything that can be executed).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>cmd_alias</p>
            </td>
            <td>
               <p>The name of a command created using the DEFINE operator (see <a href="#define-udfs">DEFINE (UDFs, streaming)</a>  for additional  streaming examples).</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>AS</p>
            </td>
            <td>
               <p>Keyword.</p>
            </td>
         </tr>
         <tr>
            <td>
               <p>schema</p>
            </td>
            <td>
               <p>A schema using the AS keyword, enclosed in parentheses (see <a href="#schemas">Schemas</a>).</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the STREAM operator to send data through an external script or program. Multiple stream operators can appear in the same Pig script. The stream operators can be adjacent to each other or have other operations in between.</p>
   <p>When used with a command, a stream statement could look like this:</p>
<source>
A = LOAD 'data';

B = STREAM A THROUGH `stream.pl -n 5`;
</source>
   <p>When used with a cmd_alias, a stream statement could look like this, where mycmd is the defined alias.</p>
<source>
A = LOAD 'data';

DEFINE mycmd `stream.pl -n 5`;

B = STREAM A THROUGH mycmd;
</source>
   </section>
   
   <section>
   <title>About Data Guarantees</title>
   <p>Data guarantees are determined based on the position of the streaming operator in the Pig script. </p>
   <ul>
      <li>
         <p>Unordered data – No guarantee for the order in which the data is delivered to the streaming application. </p>
      </li>
      <li>
         <p>Grouped data – The data for the same grouped key is guaranteed to be provided to the streaming application contiguously.</p>
      </li>
      <li>
         <p>Grouped and ordered data – The data for the same grouped key is guaranteed to be provided to the streaming application contiguously. Additionally, the data within the group is guaranteed to be sorted by the provided secondary key.</p>
      </li>
   </ul>
   <p>In addition to position, data grouping and ordering can be determined by the data itself. However, you need to know the property of the data to be able to take advantage of its structure.</p>
   </section>
   
   <section>
   <title>Example: Data Guarantees</title>
   <p>In this example the data is unordered.</p>
<source>
A = LOAD 'data';

B = STREAM A THROUGH `stream.pl`;
</source>
   
   <p>In this example the data is grouped.</p>
<source>
A = LOAD 'data';

B = GROUP A BY $1;

C = FOREACH B GENERATE FLATTEN(A);

D = STREAM C THROUGH `stream.pl`;
</source>
   
   <p>In this example the data is grouped and ordered.</p>
<source>
A = LOAD 'data';

B = GROUP A BY $1;

C = FOREACH B {
      D = ORDER A BY ($3, $4);
      GENERATE D;
}

E = STREAM C THROUGH `stream.pl`;
</source>
   </section>
   
   <section>
   <title>Example: Schemas</title>
   <p>In this example a schema is specified as part of the STREAM statement.</p>
<source>
X = STREAM A THROUGH `stream.pl` as (f1:int, f2:int, f3:int);
</source>
   </section>
   </section>
   
   
   <!-- =========================================================================== -->
   
   <section id="union">
   <title>UNION</title>
   <p>Computes the union of two or more relations.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>alias = UNION [ONSCHEMA] alias, alias [, alias …] [PARALLEL n];</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
       
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name of a relation.</p>
            </td>
      </tr>
      
      <tr>
             <td>
               <p id="onschema">ONSCHEMA </p>  
            </td>
            <td>
               <p>Use the ONSCHEMA clause to base the union on named fields (rather than positional notation). 
               All inputs to the union must have a non-unknown (non-null) <a href="#schemas">schema</a>.</p>
            </td>
         </tr>
         
     <tr>
        <td>
           <p>PARALLEL n</p>
        </td>
        <td>
           <p>This is only applicable for Tez execution mode and will not work with MapReduce mode. Specifying PARALLEL will introduce an extra reduce step that will slightly degrade performance. The primary purpose in this case is to control the number of output files.</p>
           <p>For more information, see <a href="perf.html#parallel">Use the Parallel Features</a>.</p>
        </td>
     </tr>
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p>Use the UNION operator to merge the contents of two or more relations. The UNION operator:</p>
   <ul>
      <li>
         <p>Does not preserve the order of tuples. Both the input and output relations are interpreted as unordered bags of tuples.</p>
      </li>
      <li>
         <p>Does not ensure (as databases do) that all tuples adhere to the same schema or that they have the same number of fields. In a typical scenario, however, this should be the case; therefore, it is the user's responsibility to either (1) ensure that the tuples in the input relations have the same schema or (2) be able to process varying tuples in the output relation.</p>
      </li>
      <li>
         <p>Does not eliminate duplicate tuples.</p>
      </li>
   </ul>
   <p></p> 
   <p><strong>Schema Behavior</strong></p>
   <p>The behavior of schemas for UNION (positional notation / data types) and UNION ONSCHEMA (named fields / data types) is the same, except where noted.</p>

<p>Union on relations with two different sizes results in a null schema (union only): </p>
<source>
A: (a1:long, a2:long) 
B: (b1:long, b2:long, b3:long) 
A union B: null 
</source>
  
<p>Union columns with incompatible types results in a failure. (See <a href="#types-table-add">Types Table for addition and subtraction</a> for incompatible types.)</p>
<source>
A: (a1:long)
B: (a1:chararray)
A union B: ERROR: Cannot cast from long to bytearray
</source>

<p>Union columns of compatible type will produce an "escalate" type. 
The priority is:</p>
<ul>
<li>double &gt; float &gt; long &gt; int &gt; bytearray</li>
<li>tuple|bag|map|chararray &gt; bytearray</li>
</ul>
<source>
A: (a1:int, a2:bytearray, a3:int) 
B: (b1:float, b2:chararray, b3:bytearray) 
A union B: (a1:float, a2:chararray, a3:int) 
</source>

<p>Union of different inner types results in an empty complex type: </p>
<source>
A: (a1:(a11:long, a12:int), a2:{(a21:chararray, a22:int)}) 
B: (b1:(b11:int, b12:int), b2:{(b21:int, b22:int)}) 
A union B: (a1:(), a2:{()}) 
</source>  

<p>The alias of the first relation is always taken as the alias of the unioned relation field. </p>  
   
</section>

   <section>
   <title>Example</title>
   <p>In this example the union of relation A and B is computed.</p>
<source>
A = LOAD 'data' AS (a1:int,a2:int,a3:int);

DUMP A;
(1,2,3)
(4,2,1)

B = LOAD 'data' AS (b1:int,b2:int);

DUMP B;
(2,4)
(8,9)
(1,3)

X = UNION A, B;

DUMP X;
(1,2,3)
(4,2,1)
(2,4)
(8,9)
(1,3)
</source>
   </section>
   
   <section>
   <title>Example</title>
   <p>This example shows the use of ONSCHEMA.</p>
<source>
L1 = LOAD 'f1' AS (a : int, b : float);
DUMP L1;
(11,12.0)
(21,22.0)

L2 = LOAD 'f2' AS (a : long, c : chararray);
DUMP L2;
(11,a)
(12,b)
(13,c)

U = UNION ONSCHEMA L1, L2;
DESCRIBE U ;
U : {a : long, b : float, c : chararray}

DUMP U;
(11,12.0,)
(21,22.0,)
(11,,a)
(12,,b)
(13,,c)
</source>
</section>
</section>
</section>
   
   
   <!-- =========================================================================== -->
   <!-- =========================================================================== -->
   
    <!-- UDF STATEMENTS --> 
   <section id="udf-statements">
   <title>UDF Statements</title>
   
<!-- ======================================================== -->
   <section id="define-udfs">
   <title>DEFINE (UDFs, streaming)</title>
   <p>Assigns an alias to a UDF or streaming command.</p>
   
   <section>
   <title>Syntax: UDF and streaming</title>
   <table>
      <tr> 
            <td>
               <p>DEFINE alias {function | [`command` [input] [output] [ship] [cache] [stderr] ] };</p>
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>alias</p>
            </td>
            <td>
               <p>The name for a UDF function or the name for a streaming command (the cmd_alias for the <a href="#stream">STREAM</a> operator). </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>function</p>
            </td>
            <td>
            <p>For use with functions.</p>
               <p>The name of a UDF function. </p>
            </td>
         </tr>
         <tr>
            <td>
               <p>`command`</p>
            </td>
            <td>
            <p>For use with streaming.</p>
               <p>A command, including the arguments, enclosed in backticks (where a command is anything that can be executed).</p>
               <p>The clauses (input, output, ship, cache, stderr) are described below. Note the following:</p>
               <ul>
                  <li>All clauses are optional.</li>
				  <li>The clauses can be specified in any order (for example, stderr can appear before input)</li>
				  <li>Each clause can be specified at most once (for example, multiple inputs are not allowed)</li>
				</ul>
            </td>
         </tr>
         <tr>
            <td>
               <p>input</p>
            </td>
            <td>
                <p>For use with streaming.</p>
               <p>INPUT ( {stdin | 'path'} [USING serializer] [, {stdin | 'path'} [USING serializer] …] )</p>
               <p>Where:</p>
               <ul>
                  <li>
                     <p>INPUT – Keyword.</p>
                  </li>
                  <li>
                     <p>'path' – A file path, enclosed in single quotes.</p>
                  </li>
                  <li>
                     <p>USING – Keyword.</p>
                  </li>
                  <li>
                     <p>serializer – PigStreaming is the default serializer. </p>
                  </li>
               </ul>
            </td>
         </tr>
         <tr>
            <td>
               <p>output</p>
            </td>
            <td>
            <p>For use with streaming.</p>
               <p>OUTPUT ( {stdout | stderr | 'path'} [USING deserializer] [, {stdout | stderr | 'path'} [USING deserializer] …] )</p>
               <p>Where:</p>
               <ul>
                  <li>
                     <p>OUTPUT – Keyword.</p>
                  </li>
                  <li>
                     <p>'path' – A file path, enclosed in single quotes.</p>
                  </li>
                  <li>
                     <p>USING – Keyword.</p>
                  </li>
                  <li>
                     <p>deserializer – PigStreaming is the default deserializer. </p>
                  </li>
               </ul>
            </td>
         </tr>
         <tr>
            <td>
               <p id="ship">ship</p>
            </td>
            <td>
            <p>For use with streaming.</p>
               <p>SHIP('path' [, 'path' …])</p>
               <p>Where:</p>
               <ul>
                  <li>
                     <p>SHIP – Keyword.</p>
                  </li>
                  <li>
                     <p>'path' – A file path, enclosed in single quotes.</p>
                  </li>
               </ul>
            </td>
         </tr>
         <tr>
            <td>
               <p id="cache">cache</p>
            </td>
            <td>
            <p>For use with streaming.</p>
               <p>CACHE('dfs_path#dfs_file' [, 'dfs_path#dfs_file' …])</p>
               <p>Where:</p>
               <ul>
                  <li>
                     <p>CACHE – Keyword.</p>
                  </li>
                  <li>
                     <p>'dfs_path#dfs_file' – A file path/file name on the distributed file system, enclosed in single quotes. Example: '/mydir/mydata.txt#mydata.txt'</p>
                  </li>
               </ul>
            </td>
         </tr> 
         <tr>
            <td>
               <p>stderr</p>
            </td>
           <td>
            <p>For use with streaming.</p>
            <p>STDERR( '/dir') or STDERR( '/dir' LIMIT n)</p>
             <p>Where:</p>
             <ul><li>'/dir' is the log directory, enclosed in single quotes.</li></ul>
             <ul><li>(optional) LIMIT n is the error threshold where n is an integer value. If not specified, the default error threshold is unlimited.</li></ul>
            </td>
         </tr>
   </table></section>
   
   <section>
   <title>Usage</title>
   <p>Use the DEFINE statement to assign a name (alias) to a UDF function or to a streaming command.</p>
   <p>Use DEFINE to specify a UDF function when:</p>
   <ul>
      <li>
         <p>The function has a long package name that you don't want to include in a script, especially if you call the function several times in that script.</p>
      </li>
      <li>
         <p>The constructor for the function takes string parameters. If you need to use different constructor parameters for different calls to the function you will need to create multiple defines – one for each parameter set.</p>
      </li>
   </ul>
   <p>Use DEFINE to specify a streaming command when: </p>
   <ul>
   <li>
   <p>The streaming command specification is complex.</p>
   </li>
      <li>
   <p>The streaming command specification requires additional parameters (input, output, and so on).</p>
   </li>
   </ul>
   
   
   <section id="pig-streaming-input-output">
   <title>About Input and Output for Streaming</title>
   <p>Serialization is needed to convert data from tuples to a format that can be processed by the streaming application. Deserialization is needed to convert the output from the streaming application back into tuples. PigStreaming is the default serialization/deserialization function.</p>
   
<p>Streaming uses the same default format as PigStorage to serialize/deserialize the data. If you want to explicitly specify a format, you can do it as shown below (see more examples in the Examples: Input/Output section).</p>

<source>
DEFINE CMD `perl PigStreaming.pl - nameMap` input(stdin using PigStreaming(',')) output(stdout using PigStreaming(','));
A = LOAD 'file';
B = STREAM A THROUGH CMD;
</source>  

<p>If you need an alternative format, you will need to create a custom serializer/deserializer by implementing the following interfaces.</p>

<source>
interface PigToStream {

    /**
     * Given a tuple, produce an array of bytes to be passed to the streaming
     * executable.
     */
    public byte[] serialize(Tuple t) throws IOException;
}

interface StreamToPig {

    /**
     *  Given a byte array from a streaming executable, produce a tuple.
     */
    public Tuple deserialize(byte[] bytes) throws IOException;

    /**
     * This will be called on both the front end and the back
     * end during execution.
     *
     * @return the {@link LoadCaster} associated with this object.
     * @throws IOException if there is an exception during LoadCaster
     */
    public LoadCaster getLoadCaster() throws IOException;
}
</source>  
   
   </section>
   
   <section id="ship-about">
   <title>About Ship </title>
   <p>Use the ship option to send streaming binary and supporting files, if any, from the client node to the compute nodes. Pig does not automatically ship dependencies; it is your responsibility to explicitly specify all the dependencies and to make sure that the software the processing relies on (for instance, perl or python) is installed on the cluster. Supporting files are shipped to the task's current working directory and only relative paths should be specified. Any pre-installed binaries should be specified in the PATH. </p>
   <p>Only files, not directories, can be specified with the ship option. One way to work around this limitation is to tar all the dependencies into a tar file that accurately reflects the structure needed on the compute nodes, then have a wrapper for your script that un-tars the dependencies prior to execution.</p>
   <p>Note that the ship option has two components: the source specification, provided in the ship( ) clause, is the view of your machine; the command specification is the view of the actual cluster. The only guarantee is that the shipped files are available in the current working directory of the launched job and that your current working directory is also on the PATH environment variable. </p>
   <p>Shipping files to relative paths or absolute paths is not supported since you might not have permission to read/write/execute from arbitrary paths on the clusters.</p>
   
    <p>Note the following:</p>
	<ul>
		<li>
			<p>It is safe only to ship files to be executed from the current working directory on the task on the cluster.</p>
			<source>
OP = stream IP through `script`;
or
DEFINE CMD `script` ship('/a/b/script');
OP = stream IP through CMD;
</source>
		</li>
	    <li>
			<p>Shipping files to relative paths or absolute paths is undefined and mostly will fail since you may not have permissions to read/write/execute from arbitrary paths on the actual clusters. </p>
	    </li>
	</ul>   
   </section>
   
   
   <section id="cache-about">
   <title>About Cache</title>
   <p>The ship option works with binaries, jars, and small datasets. However, loading larger datasets at run time for every execution can severely impact performance. Instead, use the cache option to access large files already moved to and available on the compute nodes. Only files, not directories, can be specified with the cache option.</p>
   </section>
   
   <section id="autoship">
   <title>About Auto-Ship</title>
   <p>If the ship and cache options are not specified, Pig will attempt to auto-ship the binary in the following way:</p>
   <ul>
		<li>
            <p>If the first word on the streaming command is perl or python, Pig assumes that the binary is the first non-quoted string it encounters that does not start with dash.</p>
		</li>
		<li>
			<p>Otherwise, Pig will attempt to ship the first string from the command line as long as it does not come from <code>/bin, /usr/bin, /usr/local/bin</code>. Pig will determine this by scanning the path if an absolute path is provided or by executing  <code>which</code>. The paths can be made configurable using the <a href="cmds.html#set">set stream.skippath</a> option (you can use multiple set commands to specify more than one path to skip). </p>
		</li>
	</ul>
	<p>If you don't supply a DEFINE for a given streaming command, then auto-shipping is turned off.</p>
	<p>Note the following:</p>
	<ul>
		<li>
			<p>If Pig determines that it needs to auto-ship an absolute path it will not ship it at all since there is no way to ship files to the necessary location (lack of permissions and so on). </p>
			<source>
OP = stream IP through `/a/b/c/script`;
or 
OP = stream IP through `perl /a/b/c/script.pl`;
</source>
		</li>
	    <li>
			<p>Pig will not auto-ship files in the following system directories (this is determined by executing 'which &lt;file&gt;' command). </p>
			<source>
/bin /usr/bin /usr/local/bin /sbin /usr/sbin /usr/local/sbin
</source>
		</li>
	    <li>
			<p>To auto-ship, the file in question should be present in the PATH. So if the file is in the current working directory then the current working directory should be in the PATH. </p>
		</li>
	</ul>
   
   </section>
      </section>
   
 <section>
 <title>Examples: Input/Output</title>
 <p>In this example PigStreaming is the default serialization/deserialization function. The tuples from relation A are converted to tab-delimited lines that are passed to the script.</p>
<source>
X = STREAM A THROUGH `stream.pl`;
</source>
   
   <p>In this example PigStreaming is used as the serialization/deserialization function, but a comma is used as the delimiter.</p>
<source>
DEFINE Y `stream.pl` INPUT(stdin USING PigStreaming(',')) OUTPUT (stdout USING PigStreaming(','));

X = STREAM A THROUGH Y;
</source>
   
   <p>In this example user defined serialization/deserialization functions are used with the script.</p>
<source>
DEFINE Y `stream.pl` INPUT(stdin USING MySerializer) OUTPUT (stdout USING MyDeserializer);

X = STREAM A THROUGH Y;
</source>
   </section>
   
   <section>
   <title>Examples: Ship/Cache</title>
   <p>In this example ship is used to send the script to the cluster compute nodes.</p>
<source>
DEFINE Y `stream.pl` SHIP('/work/stream.pl');

X = STREAM A THROUGH Y;
</source>
   
   <p>In this example cache is used to specify a file located on the cluster compute nodes.</p>
<source>
DEFINE Y `stream.pl data.gz` SHIP('/work/stream.pl') CACHE('/input/data.gz#data.gz');

X = STREAM A THROUGH Y;
</source>
   </section>
   
 
     <section>
   <title>Example: DEFINE with STREAM</title>
<p>In this example a command is defined for use with the <a href="#stream">STREAM</a> operator.</p>
<source>
A = LOAD 'data';

DEFINE mycmd `stream_cmd -input file.dat`;

B = STREAM A through mycmd;
</source>
</section>   
   
   <section>
   <title>Examples: Logging</title>
   <p>In this example the streaming stderr is stored in the _logs/&lt;dir&gt; directory of the job's output directory. Because the job can have multiple streaming applications associated with it, you need to ensure that different directory names are used to avoid conflicts. Pig stores up to 100 tasks per streaming job.</p>
<source>
DEFINE Y `stream.pl` stderr('&lt;dir&gt;' limit 100);

X = STREAM A THROUGH Y;
</source>
</section>

   
<section>
<title>Examples: DEFINE a function</title>
<p>In this example a function is defined for use with the FOREACH …GENERATE operator.</p>
<source>
REGISTER /src/myfunc.jar

DEFINE myFunc myfunc.MyEvalfunc('foo');

A = LOAD 'students';

B = FOREACH A GENERATE myFunc($0);
</source>

</section>
  </section>   
   
   
   
   <!-- =========================================================================== -->
   <section id="register-jar">
   <title>REGISTER (a jar/script)</title>
   <p>Registers a JAR file so that the UDFs in the file can be used.</p>
   
   <section>
   <title>Syntax</title>
   <table>
      <tr> 
            <td>
               <p>REGISTER path;</p>
            </td>
         </tr> 
   </table></section>
   
   <section>
   <title>Terms</title>
   <table>
      <tr>
            <td>
               <p>path</p>
            </td>
            <td>
               <p>The path to the JAR file (the full location URI is required). Do not place the name in quotes.</p>
               
            </td>
         </tr> 
   </table>
   </section>
   
   <section>
   <title>Usage</title>
   <p><strong>Pig Scripts</strong></p>
   
   <p>Use the REGISTER statement inside a Pig script to specify a JAR file or a Python/JavaScript module. Pig supports JAR files and modules stored in local file systems as well as remote, distributed file systems such as HDFS and Amazon S3 (see <a href="start.html#pig-scripts">Pig Scripts</a>).</p>
   
   <p id="register-glob">Additionally, JAR files stored in local file systems can be specified as a glob pattern using “*”. Pig will search for matching jars in the local file system, either the relative path (relative to your working directory) or the absolute path. Pig will pick up all JARs that match the glob.</p>
   
   <p><strong>Command Line</strong></p>
   <p>You can register additional files (to use with your Pig script) via PIG_OPTS environment variable using the -Dpig.additional.jars.uris option. 
For more information see <a href="udf.html">User Defined Functions</a>.</p>
   </section>
   
   <section>
   <title>Examples</title>
<p>In this example REGISTER states that the JavaScript module, myfunc.js, is located in the /src directory.</p>
<source>
/src $ java -jar pig.jar -

REGISTER /src/myfunc.js;
A = LOAD 'students';
B = FOREACH A GENERATE myfunc.MyEvalFunc($0);
</source>
   
<p>In this example additional JAR files are registered via PIG_OPTS environment variable.</p>
<source>
export PIG_OPTS="-Dpig.additional.jars.uris=my.jar,your.jar"
</source>

<p>In this example a JAR file stored in HDFS and a local JAR file are registered.</p>
<source>
export PIG_OPTS="-Dpig.additional.jars.uris=hdfs://nn.mydomain.com:9020/myjars/my.jar,file:///home/root/pig/your.jar"
</source>

<p>Note that the legacy property pig.additional.jars, which uses a colon as the separator, is still supported. However, we recommend using pig.additional.jars.uris instead: because a colon is also part of the URL scheme, a full URI (including its scheme) cannot be used in a colon-separated list. We will deprecate pig.additional.jars in future releases.</p>

<p>This example shows how to specify a glob pattern using either a relative path or an absolute path.</p>
<source>
register /homes/user/pig/myfunc*.jar
register count*.jar
register jars/*.jar
</source>
   </section>
   </section>

      <!-- =========================================================================== -->
      <section id="register-artifact">

        <title>REGISTER (an artifact)</title>

        <p>
          Instead of figuring out the dependencies manually, downloading them, and registering each jar using the above
          <a href="#register-jar">register command</a>, you can specify the artifact's coordinates and let Pig automatically
          fetch, download, and register the required dependencies.
        </p>

        <!-- Command Syntax-->
        <section>
          <title>Syntax</title>
          <p>
            To download an artifact (and its dependencies), you need to specify the artifact's group, module and version following
            the syntax shown below. This command will download the specified jar and all its dependencies and load them into the
            classpath.
          </p>
          <table>
            <tr>
              <td>
                <p>REGISTER ivy://group:module:version?querystring</p>
              </td>
            </tr>
          </table>
        </section>

        <!-- Terms -->
        <section>
          <title>Terms</title>
          <table>
            <tr>
              <td>
                <p>group</p>
              </td>
              <td>
                <p>Which module group the module comes from. Translates directly to a Maven groupId or an Ivy Organization.</p>
              </td>
            </tr>
            <tr>
              <td>
                <p>module</p>
              </td>
              <td>
                <p>The name of the module to load. Translates directly to a Maven artifactId or an Ivy artifact.</p>
              </td>
            </tr>
            <tr>
              <td>
                <p>version</p>
              </td>
              <td>
                <p>The version of the module to use. You can specify a specific version or use "+" or "*" to use the latest version.</p>
              </td>
            </tr>
            <tr>
              <td>
                <p>querystring</p>
              </td>
              <td>
                <p>This will contain "&amp;" separated key-value pairs to help us exclude all or specific dependencies etc.</p>
              </td>
            </tr>
          </table>
        </section>

        <section>
          <title>Usage</title>

          <p>
            The Register artifact command is an extension to the above register command used to <a href="#register-jar">register a
            jar</a>. In addition to registering a jar from a local file system or from HDFS, you can now specify the coordinates of the
            artifact and Pig will download the artifact (and its dependencies if needed) from the configured repository.
          </p>

          <section>
            <title>Parameters Supported in the Query String</title>

            <ul>
              <li>
                <strong>Transitive</strong>
                <p>
                  Transitive specifies whether the dependencies should be fetched along with the jar being registered. By setting
                  transitive to false in the querystring, you tell Pig to register only the artifact without its dependencies. This
                  will download only the artifact specified and will not download the dependencies of the jar. The default value of
                  transitive is true.
                </p>
                <strong>Syntax</strong>
                <table>
                  <tr>
                    <td>
                      <p>REGISTER ivy://org:module:version?transitive=false</p>
                    </td>
                  </tr>
                </table>
              </li>
              <li>
                <strong>Exclude</strong>
                <p>
                  If you wish to exclude some dependencies while registering an artifact, you can specify them using the exclude
                  key. For example, if you want to use a specific version of a dependent jar that does not match the version
                  fetched automatically, you can exclude that dependency by specifying a comma-separated list of
                  dependencies, and then register the dependent jar separately.
                </p>
                <strong>Syntax</strong>
                <table>
                  <tr>
                    <td>
                      <p>REGISTER ivy://org:module:version?exclude=org:mod,org:mod,...</p>
                    </td>
                  </tr>
                </table>
              </li>
              <li>
                <strong>Classifier</strong>
                <p>
                  Some Maven dependencies need a classifier in order to be resolved. You can specify it using the classifier
                  key.
                </p>
                <strong>Syntax</strong>
                <table>
                  <tr>
                    <td>
                      <p>REGISTER ivy://org:module:version?classifier=value</p>
                    </td>
                  </tr>
                </table>
              </li>
            </ul>
          </section>

          <section>
            <title>Other properties</title>

            <ul>
              <li>
                <p>
                  An optional Pig property, pig.artifacts.download.location, can be used to configure the location where the
                  artifacts should be downloaded. By default, they will be downloaded to ~/.groovy/grapes.
                </p>
              </li>

              <li>
                <p>
                  This command can be used in place of the <a href="#register-jar">register jar</a> command wherever that command
                  is used, including in macros.<br></br>
                </p>
              </li>

              <li>
                <p>
                  Group/Organization and Version are optional fields. If you do not want to specify them, you can leave them blank.<br></br>
                </p>
              </li>

              <li>
                <p>
                  The repositories can be configured using an ivysettings file. Pig will search for an ivysettings.xml file
                  in the following locations, in order: PIG_CONF_DIR &gt; PIG_HOME &gt; Classpath<br></br>
                </p>
              </li>
            </ul>
          </section>
        </section>

        <!-- Examples-->
        <section>
          <title>Examples</title>

          <ul>
            <li>
              <p>Registering an Artifact and all its dependencies.</p>
              <source>
                -- Both are the same<br></br>
                REGISTER ivy://org.apache.avro:avro:1.5.1<br></br>
                REGISTER ivy://org.apache.avro:avro:1.5.1?transitive=true</source>
            </li>

            <li>
              <p>Registering an artifact without getting its dependencies.</p>
              <source>
               REGISTER ivy://org.apache.avro:avro:1.5.1?transitive=false</source>
            </li>

            <li>
              <p>Registering the latest artifact.</p>
              <source>
                -- Both of the following syntaxes work.<br></br>
                REGISTER ivy://org.apache.avro:avro:+<br></br>
                REGISTER ivy://org.apache.avro:avro:*</source>
            </li>

            <li>
              <p>Registering an artifact by excluding specific dependencies.</p>
              <source>
                REGISTER ivy://org.apache.pig:pig:0.10.0?exclude=commons-cli:commons-cli,commons-codec:commons-codec</source>
            </li>

            <li>
              <p>Specifying a classifier.</p>
              <source>
                REGISTER ivy://net.sf.json-lib:json-lib:2.4?classifier=jdk15</source>
            </li>

            <li>
              <p>Registering an artifact without a group or organization. Just skip them.</p>
              <source>
                REGISTER ivy://:module:</source>
            </li>
          </ul>
        </section>
      </section>

      <!-- =========================================================================== -->
    </section>
  </body>
</document>
