| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| Falcon Overview |
| |
| Falcon is a feed processing and feed management system aimed at making it |
| easier for end consumers to onboard their feed processing and feed |
| management on Hadoop clusters. |
| |
| Why? |
| |
| * Dependencies across various data processing pipelines are not easy to |
| establish. Gaps here typically lead to either incorrect/partial |
| processing or expensive reprocessing. Defining a single feed multiple |
| times can lead to inconsistencies / issues. |
| |
| * Input data may not always arrive on time, and it is required to kick off |
| the processing without waiting for all data to arrive and accommodate |
| late data separately. |
| |
| * Feed management services such as feed retention, replication across |
| clusters, archival, etc. are tasks that are burdensome on individual |
| pipeline owners and are better offered as a service for all customers. |
| |
| * It should be easy to onboard new workflows/pipelines |
| |
| * Smoother integration with metastore/catalog |
| |
| * Provide notifications to end customers based on the availability of feed |
| groups (logical groups of related feeds, which are likely to be used |
| together) |
| |
| Usage |
| |
| a. Setup cluster definition |
| $FALCON_HOME/bin/falcon entity -submit -type cluster -file /cluster/definition.xml -url http://falcon-server:falcon-port |
| |
| b. Setup feed definition |
| $FALCON_HOME/bin/falcon entity -submit -type feed -file /feed1/definition.xml -url http://falcon-server:falcon-port |
| $FALCON_HOME/bin/falcon entity -submit -type feed -file /feed2/definition.xml -url http://falcon-server:falcon-port |
| |
| c. Setup process definition |
| $FALCON_HOME/bin/falcon entity -submit -type process -file /process/definition.xml -url http://falcon-server:falcon-port |
| |
| d. Once submitted, entity definition, status and dependency can be queried. |
| $FALCON_HOME/bin/falcon entity -type [cluster|feed|process] -name <<name>> [-definition|-status|-dependency] -url http://falcon-server:falcon-port |
| |
| or entities for a particular type can be listed through |
| $FALCON_HOME/bin/falcon entity -type [cluster|feed|process] -list |
| |
| e. Schedule process |
| $FALCON_HOME/bin/falcon entity -type process -name process -schedule -url http://falcon-server:falcon-port |
| |
| f. Once scheduled, entities can be suspended, resumed or deleted (post submit) |
| $FALCON_HOME/bin/falcon entity -type [cluster|feed|process] -name <<name>> [-suspend|-delete|-resume] -url http://falcon-server:falcon-port |
| |
| g. Once scheduled, process instances can be managed through the falcon CLI |
| $FALCON_HOME/bin/falcon instance -processName <<name>> [-kill|-suspend|-resume|-re-run] -start "yyyy-MM-dd'T'HH:mm'Z'" -url http://falcon-server:falcon-port |
| |
| Example configurations |
| |
| Cluster: |
| |
| <?xml version="1.0"?> |
| <!-- |
| Production cluster configuration |
| --> |
| |
| <cluster colo="ua2" description="" name="staging-red" xmlns="uri:falcon:cluster:0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> |
| |
| <interfaces> |
| <interface type="readonly" endpoint="hftp://gsgw1001.red.ua2.inmobi.com:50070" |
| version="0.20.2-cdh3u0" /> |
| |
| <interface type="write" endpoint="hdfs://gsgw1001.red.ua2.inmobi.com:54310" |
| version="0.20.2-cdh3u0" /> |
| |
| <interface type="execute" endpoint="gsgw1001.red.ua2.inmobi.com:54311" version="0.20.2-cdh3u0" /> |
| |
| <interface type="workflow" endpoint="http://gs1134.blue.ua2.inmobi.com:11000/oozie/" |
| version="3.1.4" /> |
| |
| <interface type="messaging" endpoint="tcp://gs1134.blue.ua2.inmobi.com:61616?daemon=true" |
| version="5.1.6" /> |
| </interfaces> |
| |
| <locations> |
| <location name="staging" path="/projects/falcon/staging" /> |
| <location name="temp" path="/tmp" /> |
| <location name="working" path="/projects/falcon/working" /> |
| </locations> |
| |
| <properties/> |
| |
| </cluster> |
| |
| Feed: |
| |
| <?xml version="1.0" encoding="UTF-8"?> |
| <!-- |
| Hourly ad carrier summary. Generated by hourly processing of rr logs |
| --> |
| |
| <feed description="RRHourlyAdCarrierSummary" name="RRHourlyAdCarrierSummary" xmlns="uri:falcon:feed:0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> |
| <partitions/> |
| |
| <groups>rmchourly</groups> |
| |
| <frequency>hours</frequency> |
| <periodicity>1</periodicity> |
| |
| <late-arrival cut-off="hours(6)" /> |
| |
| <clusters> |
| <cluster name="staging-red" type="source"> |
| <validity start="2009-01-01T00:00Z" end="2099-12-31T00:00Z" timezone="UTC" /> |
| <retention limit="months(24)" action="delete" /> |
| </cluster> |
| </clusters> |
| |
| <locations> |
| <location type="data" path="/projects/bi/rmc/rr/${YEAR}-${MONTH}-${DAY}-${HOUR}.concat/HourlyAdCarrierSummary" /> |
| <location type="stats" path="/none" /> |
| <location type="meta" path="/none" /> |
| </locations> |
| |
| <ACL owner="rmcuser" group="users" permission="0755" /> |
| |
| <schema location="/none" provider="none" /> |
| |
| <properties/> |
| |
| </feed> |
| |
| |
| Process: |
| |
| <?xml version="1.0" encoding="UTF-8"?> |
| <!-- |
| RMC Daily process, produces 34 new feeds |
| --> |
| <process name="rmc-daily"> |
| <cluster name="staging-red" /> |
| |
| <frequency>days(1)</frequency> |
| |
| <validity start="2012-04-03T06:00Z" end="2022-12-30T00:00Z" timezone="UTC" /> |
| |
| <inputs> |
| <input name="WapAd" feed="WapAd" start="today(0,0)" end="today(0,0)" /> |
| </inputs> |
| |
| <outputs> |
| <output name="TrafficDailyAdSiteSummary" feed="TrafficDailyAdSiteSummary" instance="yesterday(0,0)" /> |
| </outputs> |
| |
| <properties> |
| <property name="lastday" value="${formatTime(yesterday(0,0), 'yyyy-MM-dd')}" /> |
| </properties> |
| |
| <workflow engine="oozie" path="/projects/bi/rmc/pipelines/workflow/rmcdaily" /> |
| |
| <retry policy="backoff" delay="5" delayUnit="minutes" attempts="3" /> |
| </process> |