blob: 06d607f6a5be5ce02d5f58c956219e90f3f9373f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.scripting;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FsShell;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.impl.PigContext;
import org.apache.pig.tools.grunt.GruntParser;
/**
 * The class being used in scripts to interact with Pig.
 *
 * <p>The static methods operate on the {@link ScriptPigContext} returned by
 * {@code ScriptPigContext.get()} and fail with an {@link IOException} when no
 * such context has been set. Clauses added through {@link #define} and
 * {@link #registerUDF} are kept in static lists and prepended to every
 * pipeline compiled afterwards. An instance represents one compiled (but not
 * yet bound) Pig Latin pipeline.
 */
public class Pig {

    private static final Log LOG = LogFactory.getLog(Pig.class);

    /** DEFINE clauses collected by {@link #define}, in insertion order. */
    private static final List<String> defineCache = new ArrayList<String>();

    /** REGISTER clauses collected by {@link #registerUDF}, in insertion order. */
    private static final List<String> scriptUDFCache = new ArrayList<String>();

    /**
     * Run a filesystem command. Any output from this command is written to
     * stdout or stderr as appropriate.
     * @param cmd Filesystem command to run along with its arguments as one
     * string. A leading "-" is added to the command name when it is missing.
     * @return the exit code reported by the filesystem shell, or -1 when
     * <code>cmd</code> is null or contains only whitespace.
     * @throws IOException if the script context is not set or the command
     * fails with an exception.
     */
    public static int fs(String cmd) throws IOException {
        ScriptPigContext ctx = getScriptContext();
        FsShell shell = new FsShell(ConfigurationUtil.toConfiguration(ctx
                .getPigContext().getProperties()));
        if (cmd == null) {
            return -1;
        }
        // Trim first: splitting " ls /" on whitespace would otherwise yield an
        // empty first token and the shell would be handed a bare "-".
        String trimmed = cmd.trim();
        if (trimmed.isEmpty()) {
            return -1;
        }
        String[] cmdTokens = trimmed.split("\\s+");
        if (!cmdTokens[0].startsWith("-")) {
            cmdTokens[0] = "-" + cmdTokens[0];
        }
        try {
            return shell.run(cmdTokens);
        } catch (Exception e) {
            throw new IOException("Run filesystem command failed", e);
        }
    }

    /**
     * Run a sql command. Any output from this command is written to
     * stdout or stderr as appropriate.
     * @param cmd sql command to run along with its arguments as one
     * string. Currently only hcat is supported as a sql backend
     * @return the value returned by <code>GruntParser.runSQLCommand</code>.
     * @throws IOException if the script context is not set, if
     * <code>pig.sql.type</code> is not "hcat", or if <code>hcat.bin</code>
     * is undefined or does not point to an existing file.
     */
    public static int sql(String cmd) throws IOException {
        PigContext pigContext = getScriptContext().getPigContext();
        // Constant-first comparison: an unset property is rejected with the
        // IOException below instead of a NullPointerException.
        if (!"hcat".equals(pigContext.getProperties().getProperty("pig.sql.type"))) {
            throw new IOException("sql command only support hcat currently");
        }
        String hcatBin = pigContext.getProperties().getProperty("hcat.bin");
        if (hcatBin == null) {
            throw new IOException("hcat.bin is not defined. Define it to be your hcat script (Usually $HCAT_HOME/bin/hcat");
        }
        // Check the configured path itself (not the literal property name) and
        // fail only when it is missing.
        if (!new File(hcatBin).exists()) {
            throw new IOException(hcatBin + " does not exist. Please check your 'hcat.bin' setting in pig.properties.");
        }
        return GruntParser.runSQLCommand(hcatBin, cmd, false);
    }

    /**
     * Register a jar for use in Pig. Once this is done this jar will be
     * registered for <b>all subsequent</b> Pig pipelines in this script.
     * If you wish to register it for only a single Pig pipeline, use
     * register within that definition.
     * @param jarfile Path of jar to include.
     * @throws IOException if the indicated jarfile cannot be found.
     */
    public static void registerJar(String jarfile) throws IOException {
        LOG.info("Register jar: " + jarfile);
        ScriptPigContext ctx = getScriptContext();
        PigServer pigServer = new PigServer(ctx.getPigContext(), false);
        pigServer.registerJar(jarfile);
    }

    /**
     * Register scripting UDFs for use in Pig. Once this is done all UDFs
     * defined in the file will be available for <b>all subsequent</b>
     * Pig pipelines in this script. If you wish to register UDFs for
     * only a single Pig pipeline, use register within that definition.
     * @param udffile Path of the script UDF file
     * @param namespace namespace of the UDFs; an empty string is treated
     * the same as null.
     * @throws IOException if the script context is not set or the script
     * engine fails to register the functions.
     */
    public static void registerUDF(String udffile, String namespace)
            throws IOException {
        LOG.info("Register script UDF file: " + udffile);
        ScriptPigContext ctx = getScriptContext();
        ScriptEngine engine = ctx.getScriptEngine();
        // script file contains only functions, no need to separate
        // functions from control flow code
        if (namespace != null && namespace.isEmpty()) {
            namespace = null;
        }
        engine.registerFunctions(udffile, namespace, ctx.getPigContext());
        addRegisterScriptUDFClause(udffile, namespace);
    }

    /**
     * Define an alias for a UDF or a streaming command. This definition
     * will then be present for <b>all subsequent</b> Pig pipelines defined in this
     * script. If you wish to define it for only a single Pig pipeline, use
     * define within that definition.
     * @param alias name of the defined alias
     * @param definition string this alias is defined as
     * @throws IOException declared for interface compatibility.
     */
    public static void define(String alias, String definition)
            throws IOException {
        LOG.info("Add define clause: " + alias + " -- " + definition);
        addDefineClause(alias, definition);
    }

    /**
     * Set a variable for use in Pig Latin. This set
     * will then be present for <b>all subsequent</b> Pig pipelines defined in this
     * script. If you wish to set it for only a single Pig pipeline, use
     * set within that definition.
     * @param var variable to set
     * @param value to set it to
     * @throws IOException if the script context is not set.
     */
    public static void set(String var, String value) throws IOException {
        ScriptPigContext ctx = getScriptContext();
        // NOTE(review): the PigServer is only used to reach the PigContext;
        // it is kept because its constructor may have side effects that are
        // not visible from this file.
        PigServer pigServer = new PigServer(ctx.getPigContext(), false);
        pigServer.getPigContext().getProperties().setProperty(var, value);
    }

    /**
     * Define a Pig pipeline.
     * @param pl Pig Latin definition of the pipeline.
     * @return Pig object representing this pipeline.
     * @throws IOException if the Pig Latin does not compile.
     */
    public static Pig compile(String pl) throws IOException {
        return compile(null, pl);
    }

    /**
     * Define a named portion of a Pig pipeline. This allows it
     * to be imported into another pipeline.
     * @param name Name that will be used to define this pipeline.
     * The namespace is global.
     * @param pl Pig Latin definition of the pipeline.
     * @return Pig object representing this pipeline.
     * @throws IOException if the Pig Latin does not compile.
     */
    public static Pig compile(String name, String pl) throws IOException {
        ScriptPigContext ctx = getScriptContext();
        // Previously registered script UDFs and defines come first so the
        // pipeline text can refer to them.
        StringBuilder sb = new StringBuilder();
        sb.append(getRegisterScriptUDFClauses()).append(getDefineClauses());
        sb.append(pl).append("\n");
        return new Pig(sb.toString(), ctx, name);
    }

    /**
     * Define a Pig pipeline based on Pig Latin in a separate file.
     * @param filename File to read Pig Latin from. This must be a purely
     * Pig Latin file. It cannot contain host language constructs in it.
     * @return Pig object representing this pipeline.
     * @throws IOException if the Pig Latin does not compile or the file
     * cannot be found.
     */
    public static Pig compileFromFile(String filename)
            throws IOException {
        return compileFromFile(null, filename);
    }

    /**
     * Define a named Pig pipeline based on Pig Latin in a separate file.
     * This allows it to be imported into another pipeline.
     * @param name Name that will be used to define this pipeline.
     * The namespace is global.
     * @param filename File to read Pig Latin from. This must be a purely
     * Pig Latin file. It cannot contain host language constructs in it.
     * @return Pig object representing this pipeline.
     * @throws IOException if the Pig Latin does not compile or the file
     * cannot be found.
     */
    public static Pig compileFromFile(String name,
            String filename) throws IOException {
        return compile(name, getScriptFromFile(filename));
    }

    //-------------------------------------------------------------------------

    /**
     * Bind this to a set of variables. Values must be provided
     * for all Pig Latin parameters.
     * @param vars map of variables to bind. Keys should be parameters defined
     * in the Pig Latin. Values should be strings that provide values for those
     * parameters. They can be either constants or variables from the host
     * language. Host language variables must contain strings.
     * @return a {@link BoundScript} object
     * @throws IOException if there is not a key for each
     * Pig Latin parameter or if they contain unsupported types.
     */
    public BoundScript bind(Map<String, Object> vars) throws IOException {
        return new BoundScript(replaceParameters(script, vars), scriptContext, name);
    }

    /**
     * Bind this to multiple sets of variables. This will
     * cause the Pig Latin script to be executed in parallel over these sets of
     * variables.
     * @param vars list of maps of variables to bind. Keys should be parameters defined
     * in the Pig Latin. Values should be strings that provide values for those
     * variables. They can be either constants or variables from the host
     * language. Host language variables must be strings.
     * @return a {@link BoundScript} object
     * @throws IOException if there is not a key for each
     * Pig Latin parameter or if they contain unsupported types.
     */
    public BoundScript bind(List<Map<String, Object>> vars) throws IOException {
        List<String> boundScripts = new ArrayList<String>();
        for (Map<String, Object> var : vars) {
            boundScripts.add(replaceParameters(script, var));
        }
        return new BoundScript(boundScripts, scriptContext, name);
    }

    /**
     * Bind a Pig object to variables in the host language (optional
     * operation). This does an implicit mapping of variables in the host
     * language to parameters in Pig Latin. For example, if the user
     * provides a Pig Latin statement
     * <tt> p = Pig.compile("A = load '$input';");</tt>
     * and then calls this function it will look for a variable called
     * <tt>input</tt> in the host language. Scoping rules of the host
     * language will be followed in selecting which variable to bind. The
     * variable bound must contain a string value. This method is optional
     * because not all host languages may support searching for in scope
     * variables.
     * @return a {@link BoundScript} object
     * @throws IOException if host language variables are not found to resolve all
     * Pig Latin parameters or if they contain unsupported types.
     */
    public BoundScript bind() throws IOException {
        if (script.indexOf('$') == -1) { // no parameter substitution is needed
            return new BoundScript(script, scriptContext, name);
        }
        ScriptEngine engine = scriptContext.getScriptEngine();
        Map<String, Object> vars = engine.getParamsFromVariables();
        return bind(vars);
    }

    //-------------------------------------------------------------------------

    /** Pig Latin text of this pipeline, including prepended REGISTER/DEFINE clauses. */
    private final String script;

    /** Script context captured when this pipeline was compiled. */
    private final ScriptPigContext scriptContext;

    /** Optional pipeline name; null for unnamed pipelines. */
    private final String name;

    /**
     * Creates a pipeline object; scripts obtain instances through the
     * <code>compile</code> factory methods.
     * @param script Pig Latin text of the pipeline
     * @param scriptContext script context the pipeline belongs to
     * @param name name of the pipeline, may be null
     */
    protected Pig(String script, ScriptPigContext scriptContext, String name) {
        this.script = script;
        this.scriptContext = scriptContext;
        this.name = name;
    }

    /**
     * Replaces the {@code $<identifier>} with their actual values
     * @param qstr the pig script to rewrite
     * @param vars parameters and their values
     * @return the modified version
     * @throws IOException if the script context is not set, a parameter
     * value is null, or parameter substitution fails.
     */
    private String replaceParameters(String qstr, Map<String, Object> vars)
            throws IOException {
        List<String> params = new ArrayList<String>();
        for (Entry<String, Object> entry : vars.entrySet()) {
            Object value = entry.getValue();
            if (value == null) {
                // Report which parameter is unusable instead of failing with
                // a bare NullPointerException.
                throw new IOException("Cannot bind parameter '" + entry.getKey()
                        + "': value is null");
            }
            params.add(entry.getKey() + "="
                    + fixNonEscapedDollarSign(value.toString()));
        }
        PigContext context = getScriptContext().getPigContext();
        List<String> contextParams = context.getParams();
        if (contextParams != null) {
            params.addAll(contextParams);
        }
        BufferedReader reader = new BufferedReader(new StringReader(qstr));
        try {
            return context.doParamSubstitution(reader, params, context.getParamFiles());
        } finally {
            // reset params that were originally in PigContext, also when the
            // substitution fails
            context.setParams(contextParams);
        }
    }

    // Escape the $ so that we can use the parameter substitution
    // to perform bind operation. Parameter substitution will un-escape $
    private static String fixNonEscapedDollarSign(String s) {
        // limit -1 keeps trailing empty tokens, so a '$' at the end of the
        // value is still seen.
        String[] tkns = s.split("\\$", -1);
        if (tkns.length == 1) {
            return s; // no '$' in the value
        }
        StringBuilder sb = new StringBuilder();
        // Every token except the last one was followed by a '$' in the input.
        for (int i = 0; i < tkns.length - 1; i++) {
            String tkn = tkns[i];
            sb.append(tkn);
            // A token already ending in a backslash is taken as an escaped '$'.
            if (tkn.isEmpty() || tkn.charAt(tkn.length() - 1) != '\\') {
                sb.append("\\\\");
            }
            sb.append("$");
        }
        sb.append(tkns[tkns.length - 1]);
        return sb.toString();
    }

    //-------------------------------------------------------------------------

    /**
     * Reads a whole file into a string, terminating every line with "\n".
     * @param filename path of the file to read
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    private static String getScriptFromFile(String filename) throws IOException {
        LineNumberReader rd = new LineNumberReader(new FileReader(filename));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = rd.readLine()) != null) {
                sb.append(line).append("\n");
            }
            return sb.toString();
        } finally {
            rd.close();
        }
    }

    /** Remembers a DEFINE clause for all pipelines compiled afterwards. */
    private static void addDefineClause(String alias, String definition) {
        defineCache.add("DEFINE " + alias + " " + definition + ";\n");
    }

    /**
     * Remembers a REGISTER clause for a script UDF file for all pipelines
     * compiled afterwards.
     * @param path path of the script UDF file
     * @param namespace namespace of the UDFs; null or empty for none
     * @throws IOException if the script context is not set
     */
    private static void addRegisterScriptUDFClause(String path, String namespace)
            throws IOException {
        ScriptPigContext ctx = getScriptContext();
        ScriptEngine engine = ctx.getScriptEngine();
        StringBuilder clause = new StringBuilder();
        clause.append("REGISTER '").append(path).append("' USING ")
                .append(engine.getScriptingLang());
        if (namespace != null && !namespace.isEmpty()) {
            clause.append(" AS ").append(namespace);
        }
        clause.append(";\n");
        scriptUDFCache.add(clause.toString());
    }

    /** Returns all remembered DEFINE clauses concatenated in insertion order. */
    private static String getDefineClauses() {
        StringBuilder sb = new StringBuilder();
        for (String def : defineCache) {
            sb.append(def);
        }
        return sb.toString();
    }

    /** Returns all remembered REGISTER clauses concatenated in insertion order. */
    private static String getRegisterScriptUDFClauses() {
        StringBuilder sb = new StringBuilder();
        for (String udf : scriptUDFCache) {
            sb.append(udf);
        }
        return sb.toString();
    }

    /**
     * Returns the current script context.
     * @throws IOException if no script context has been set
     */
    private static ScriptPigContext getScriptContext() throws IOException {
        ScriptPigContext ctx = ScriptPigContext.get();
        if (ctx == null) {
            throw new IOException("Script context is not set");
        }
        return ctx;
    }
}