# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Falcon Overview
Falcon is a feed processing and feed management system that aims to make
it easy for end consumers to onboard their feed processing and feed
management on Hadoop clusters.
Why?
* Dependencies across various data processing pipelines are not easy to
establish; gaps here typically lead to incorrect or partial processing,
or to expensive reprocessing. Defining the same feed multiple times can
also lead to inconsistencies and issues.
* Input data does not always arrive on time; processing must be able to
kick off without waiting for all data to arrive, and accommodate late
data separately.
* Feed management services such as feed retention, replication across
clusters, archival etc. are burdensome for individual pipeline owners
and are better offered as a service for all customers.
* It should be easy to onboard new workflows/pipelines
* Smoother integration with metastore/catalog
* Provide notifications to end customers based on availability of feed
groups (logical groups of related feeds that are likely to be used
together)
Usage
a. Set up a cluster definition
$FALCON_HOME/bin/falcon entity -submit -type cluster -file /cluster/definition.xml -url http://falcon-server:falcon-port
b. Set up feed definitions
$FALCON_HOME/bin/falcon entity -submit -type feed -file /feed1/definition.xml -url http://falcon-server:falcon-port
$FALCON_HOME/bin/falcon entity -submit -type feed -file /feed2/definition.xml -url http://falcon-server:falcon-port
c. Set up a process definition
$FALCON_HOME/bin/falcon entity -submit -type process -file /process/definition.xml -url http://falcon-server:falcon-port
d. Once submitted, an entity's definition, status and dependencies can be queried.
$FALCON_HOME/bin/falcon entity -type [cluster|feed|process] -name <<name>> [-definition|-status|-dependency] -url http://falcon-server:falcon-port
or all entities of a particular type can be listed with
$FALCON_HOME/bin/falcon entity -type [cluster|feed|process] -list
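For example, the status of the sample rmc-daily process defined later in
this document could be checked, once it is submitted, with (same
illustrative server endpoint as above):
$FALCON_HOME/bin/falcon entity -type process -name rmc-daily -status -url http://falcon-server:falcon-port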
e. Schedule the process
$FALCON_HOME/bin/falcon entity -type process -name <<name>> -schedule -url http://falcon-server:falcon-port
f. Once scheduled, entities can be suspended, resumed or deleted (delete also works post submit)
$FALCON_HOME/bin/falcon entity -type [cluster|feed|process] -name <<name>> [-suspend|-delete|-resume] -url http://falcon-server:falcon-port
g. Once scheduled, process instances can be managed through the falcon CLI
$FALCON_HOME/bin/falcon instance -processName <<name>> [-kill|-suspend|-resume|-re-run] -start "yyyy-MM-dd'T'HH:mm'Z'" -url http://falcon-server:falcon-port
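For example, a hypothetical re-run of one instance of the sample
rmc-daily process defined below (the timestamp is illustrative):
$FALCON_HOME/bin/falcon instance -processName rmc-daily -re-run -start "2012-04-04T06:00Z" -url http://falcon-server:falcon-port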
Example configurations
Cluster:
<?xml version="1.0"?>
<!--
  Production cluster configuration
-->
<cluster colo="ua2" description="" name="staging-red" xmlns="uri:falcon:cluster:0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <interfaces>
        <!-- readonly: endpoint for remote reads, e.g. as a replication source -->
        <interface type="readonly" endpoint="hftp://gsgw1001.red.ua2.inmobi.com:50070"
                   version="0.20.2-cdh3u0" />
        <!-- write: HDFS NameNode endpoint for writes -->
        <interface type="write" endpoint="hdfs://gsgw1001.red.ua2.inmobi.com:54310"
                   version="0.20.2-cdh3u0" />
        <!-- execute: JobTracker endpoint where jobs are submitted -->
        <interface type="execute" endpoint="gsgw1001.red.ua2.inmobi.com:54311" version="0.20.2-cdh3u0" />
        <!-- workflow: Oozie endpoint that schedules and runs the workflows -->
        <interface type="workflow" endpoint="http://gs1134.blue.ua2.inmobi.com:11000/oozie/"
                   version="3.1.4" />
        <!-- messaging: ActiveMQ broker used for feed/process notifications -->
        <interface type="messaging" endpoint="tcp://gs1134.blue.ua2.inmobi.com:61616?daemon=true"
                   version="5.1.6" />
    </interfaces>
    <locations>
        <location name="staging" path="/projects/falcon/staging" />
        <location name="temp" path="/tmp" />
        <location name="working" path="/projects/falcon/working" />
    </locations>
    <properties/>
</cluster>
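After submission, the definition stored by the server can be verified
with the query commands from step d, e.g.:
$FALCON_HOME/bin/falcon entity -type cluster -name staging-red -definition -url http://falcon-server:falcon-port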
Feed:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Hourly ad carrier summary. Generated by hourly processing of rr logs
-->
<feed description="RRHourlyAdCarrierSummary" name="RRHourlyAdCarrierSummary" xmlns="uri:falcon:feed:0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <partitions/>
    <groups>rmchourly</groups>
    <!-- one instance every hour -->
    <frequency>hours</frequency>
    <periodicity>1</periodicity>
    <!-- data arriving up to 6 hours late is still processed -->
    <late-arrival cut-off="hours(6)" />
    <clusters>
        <cluster name="staging-red" type="source">
            <validity start="2009-01-01T00:00Z" end="2099-12-31T00:00Z" timezone="UTC" />
            <!-- instances older than 24 months are deleted -->
            <retention limit="months(24)" action="delete" />
        </cluster>
    </clusters>
    <locations>
        <location type="data" path="/projects/bi/rmc/rr/${YEAR}-${MONTH}-${DAY}-${HOUR}.concat/HourlyAdCarrierSummary" />
        <location type="stats" path="/none" />
        <location type="meta" path="/none" />
    </locations>
    <ACL owner="rmcuser" group="users" permission="0755" />
    <schema location="/none" provider="none" />
    <properties/>
</feed>
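The ${YEAR}, ${MONTH}, ${DAY} and ${HOUR} variables in the data location
are resolved per feed instance; as an illustration, the 2012-04-03 06:00
UTC instance of this feed would be expected at:
/projects/bi/rmc/rr/2012-04-03-06.concat/HourlyAdCarrierSummary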
Process:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RMC Daily process, produces 34 new feeds
-->
<process name="rmc-daily">
    <cluster name="staging-red" />
    <frequency>days(1)</frequency>
    <validity start="2012-04-03T06:00Z" end="2022-12-30T00:00Z" timezone="UTC" />
    <inputs>
        <!-- consumes today's instance of the WapAd feed -->
        <input name="WapAd" feed="WapAd" start="today(0,0)" end="today(0,0)" />
    </inputs>
    <outputs>
        <!-- writes yesterday's instance of the summary feed -->
        <output name="TrafficDailyAdSiteSummary" feed="TrafficDailyAdSiteSummary" instance="yesterday(0,0)" />
    </outputs>
    <properties>
        <!-- passed to the workflow; resolves to the previous day's date -->
        <property name="lastday" value="${formatTime(yesterday(0,0), 'yyyy-MM-dd')}" />
    </properties>
    <workflow engine="oozie" path="/projects/bi/rmc/pipelines/workflow/rmcdaily" />
    <!-- on failure, retry up to 3 times with a 5-minute backoff -->
    <retry policy="backoff" delay="5" delayUnit="minutes" attempts="3" />
</process>
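Once the cluster, the input feed and this process have been submitted
(steps a-c above), the pipeline is started by scheduling the process, e.g.:
$FALCON_HOME/bin/falcon entity -type process -name rmc-daily -schedule -url http://falcon-server:falcon-port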