train_df, test_df = df.randomSplit([.80, .20], seed=42)
rf = RandomForestRegressor(numTrees=100)
model = rf.fit(train_df)
model.transform(test_df).show() {% endhighlight %} {% highlight python %} df = spark.read.csv("accounts.csv", header=True)
filtered_df = df.select("AccountBalance", "CountOfDependents").filter("AccountBalance > 0")
filtered_df.summary().show() {% endhighlight %} Run now $ SPARK_HOME/bin/spark-sql spark-sql> {% highlight sql %} SELECT name.first AS first_name, name.last AS last_name, age FROM json.`logs.json`
WHERE age > 21; {% endhighlight %} Run now $ SPARK_HOME/bin/spark-shell scala> {% highlight scala %} val df = spark.read.json("logs.json") df.where("age > 21") .select("name.first").show() {% endhighlight %} Run now $ SPARK_HOME/bin/spark-shell scala> {% highlight java %} Dataset df = spark.read().json("logs.json"); df.where("age > 21") .select("name.first").show(); {% endhighlight %} Run now $ SPARK_HOME/bin/sparkR > {% highlight r %} df <- read.json(path = "logs.json") df <- filter(df, df$age > 21) head(select(df, df$name.first)) {% endhighlight %}