blob: 803333ac64a57f7797c4a583ec6e4796a85958d3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
--Load
-- Load statements written in several spellings: with and without a `using`
-- clause, with a parenthesized field list or a single bare field after `as`,
-- and with keywords in lower, upper and mixed case.
-- NOTE(review): the varied casing and spacing look deliberate (this reads like
-- parser input) -- confirm before normalizing any of it.
-- Fully qualified loader with one string argument and a two-field typed schema.
A = load 'x.file' using org.apache.pig.PigStorage ( ',' ) as ( a:int, b:long );
-- Single typed field after `as`, written without parentheses.
A = load 'x.file' as c:int;
-- Mixed-case keyword, no spaces around `=`, loader with an empty argument
-- list; fields b and c are declared without a type.
A=LoAD 'myfile.txt' using PigTextLoader() as ( a:int, b, c );
-- Upper-case type names with spaces on both sides of the colon.
A = load 'xxx' as ( a : INT, b : LONG );
-- Loader with two string arguments; the schema lists int, long, bytearray,
-- chararray, float and double fields.
A = LOAD 'myfile.txt' USING PigTextLoader( 'arg1', 'arg2' ) as ( a:INT, b : long, c : bytearray, d : chararray, e : float, f : double);
-- No schema; the path ends in `$date` (presumably a parameter placeholder --
-- confirm) and the loader argument is the escape sequence '\n'.
aa = load '/data/intermediate/pow/elcarobootstrap/account/full/weekly/data/$date' using org.apache.pig.PigStorage('\n');
-- Schema whose third field is a bag declared with empty braces.
A = load 'xxx' as (a:int, b:long, c:bag{});
--Filter
-- Filter statements over positional ($n) and named fields, in upper- and
-- lower-case spellings, several with a trailing `parallel` clause.
-- Two positional comparisons joined by OR, followed by `parallel 20`.
B = FILTER A by $0 == 100 OR $0 < 5 parallel 20;
-- The filter input is an inline, parenthesized load statement instead of an alias.
B = FILTER ( load 'x.file' as c:int ) by c == 40;
-- Uses the word-form comparison `eq` against string literals, including the
-- empty string.
bb = filter aa by $4 eq '' or $4 eq 'NULL' or $4 eq 'ss' parallel 400;
-- One condition combining NOT, OR, AND, a negative integer literal and `matches`.
B = filter A by NOT( ( a > 5 ) OR ( b < 100 AND a < -1 ) AND d matches 'abc' );
-- Mixes the word-form `neq` with the symbolic `==`. The input alias is
-- lower-case `a`, which no statement in this listing assigns.
inactiveAccounts = filter a by ($1 neq '') and ($1 == '2') parallel 400;
--Distinct
-- DISTINCT over alias B, once with a `parallel` clause and once with a
-- `partition by` clause naming a fully qualified class.
C = DISTINCT B parallel 10;
C = DISTINCT B partition by org.apache.pig.RandomPartitioner;
--Foreach
-- Foreach statements in block form ({ ... }) and in plain `generate` form.
-- Block form with no semicolon after the closing brace.
D = foreach bb { generate $0,$12,$7; }
-- Same statement with a semicolon after the closing brace.
D = foreach bb { generate $0,$12,$7; };
-- Plain form projecting the first positional field.
D = foreach C generate $0;
-- Input is an inline load. Inside the block, E is assigned field c, S is
-- bag a ordered by $0, and the output is $1 plus COUNT( S ).
D = foreach ( load 'x' as (a:bag{}, b:chararray, c:int) ) { E = c; S = order a by $0; generate $1, COUNT( S ); }
-- Block form whose only statement generates COUNT over inactiveAccounts.
countInactiveAcct = foreach grpInactiveAcct { generate COUNT( inactiveAccounts ); }
-- Output field given a new name and a type via `as b:int`.
E = foreach A generate a as b:int;
-- Applies flatten to field c.
I = foreach A generate flatten(c);
--sample
-- Sample statement over alias D with the literal 0.9 as its argument.
E = sample D 0.9;
--limit
-- Limit statement over alias E with the integer literal 100.
F = limit E 100;
--order by
-- Order statements covering three key forms: a positional field with no
-- direction, `*` with DESC, and a positional field with ASC.
G = ORDER F by $2;
G = order F by * DESC;
E = order B by $0 ASC;
--define
-- Two define forms: a name bound to a fully qualified class with two string
-- arguments, and a name bound to a backquoted command string.
define myudf org.apache.pig.TextLoader( 'test', 'data' );
-- CMD is the name the later `stream ... through CMD` statement refers to.
define CMD `ls -l`;
--group
-- Grouping statements: a two-input cogroup, `group ... all`, and
-- `GROUP ... ALL` with a `using` string.
-- Each cogroup input has its own key and a modifier (`inner` / `outer`).
D = cogroup A by $0 inner, B by $0 outer;
grpInactiveAcct = group inactiveAccounts all;
-- Upper-case spelling with the quoted string 'collected' after `using`.
B = GROUP A ALL using 'collected';
--cube
-- CUBE statements: one with a CUBE(...) clause over two named fields, one
-- with a ROLLUP(...) clause over `*`.
C = CUBE A BY CUBE(a, b);
CC = CUBE A BY ROLLUP(*);
--join
-- Join statements keyed by position and by name, plus a projection over the
-- joined alias.
-- Positional keys with the quoted string 'replicated' after `using`.
E = join A by $0, B by $0 using 'replicated';
-- Both inputs keyed on a field named u.
H = join A by u, B by u;
-- References each input's u field through the `alias::field` qualifier.
I = foreach H generate A::u, B::u;
--cross
-- Cross statement over two aliases; the keyword is spelled with a leading
-- capital only.
F = Cross A, B;
--store
-- Store statements writing three aliases to quoted paths; none has a
-- `using` clause.
store C into 'output.txt';
store countInactiveAcct into '/user/kaleidoscope/pow_stats/20080228/acct_stats/InactiveAcctCount';
store inactiveAccounts into '/user/kaleidoscope/pow_stats/20080228/acct/InactiveAcct';
--split
-- Split statement sending A into X and Y, each with its own `if` condition
-- on $0. No branch is given for $0 < 0.
Split A into X if $0 > 0, Y if $0 == 0;
--union
-- Union of A and B written with the `onschema` keyword.
H = union onschema A, B;
--stream
-- Stream statement passing A through CMD, the name bound by the
-- `define CMD` statement earlier in this file.
C = stream A through CMD;
--rank
-- Rank statements with increasing clause complexity: no key, one key, one
-- key with DESC, and two keys where only the first has a direction.
R = rank A;
R = rank A by a;
R = rank A by a DESC;
R = rank A by a DESC, b;