<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" attributeFormDefault="unqualified" elementFormDefault="qualified"
targetNamespace="uri:falcon:feed:0.1" xmlns="uri:falcon:feed:0.1"
xmlns:jaxb="http://java.sun.com/xml/ns/jaxb" jaxb:version="2.1">
<xs:annotation>
    <xs:documentation>
        Licensed to the Apache Software Foundation (ASF) under one or more
        contributor license agreements. See the NOTICE file distributed with
        this work for additional information regarding copyright ownership.
        The ASF licenses this file to You under the Apache License, Version 2.0
        (the "License"); you may not use this file except in compliance with
        the License. You may obtain a copy of the License at
        http://www.apache.org/licenses/LICENSE-2.0
        Unless required by applicable law or agreed to in writing, software
        distributed under the License is distributed on an "AS IS" BASIS,
        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
        See the License for the specific language governing permissions and
        limitations under the License.
    </xs:documentation>
    <xs:appinfo>
        <!-- JAXB schema binding: names the Java package used for classes bound to this schema. -->
        <jaxb:schemaBindings>
            <jaxb:package name="org.apache.falcon.entity.v0.feed"/>
        </jaxb:schemaBindings>
    </xs:appinfo>
</xs:annotation>
<!-- Global element declaration "feed"; its content model is the complex type named "feed". -->
<xs:element name="feed" type="feed"/>
<xs:complexType name="feed">
    <xs:annotation>
        <xs:documentation>
            name: A feed should have a unique name and this name is referenced
            by processes as input or output feed.
            tags: a feed specifies an optional list of comma separated tags
            which is used for classification of data sets.
            groups: a feed specifies a list of comma separated groups,
            a group is a logical grouping of feeds and a group is said to be
            available if all the feeds belonging to a group are available.
            The frequency of all the feeds which belong to the same group
            must be the same.
            availabilityFlag: specifies the name of a file which, when
            present/created in a feed's data directory, marks the feed as
            available. ex: _SUCCESS. If this element is omitted then Falcon
            would consider the presence of the feed's data directory as feed
            availability.
            frequency: A feed has a frequency and a periodicity which specifies
            the frequency by which this feed is generated. ex: it can be generated
            every hour, every 5 minutes, daily, weekly etc.
            Valid frequency types for a feed are minutes, hours, days, months.
            sla: A feed can have SLA and each SLA has two properties - slaLow and slaHigh.
            Both slaLow and slaHigh are written using expressions like frequency.
            slaLow is intended to serve for alerting for feeds which are in danger of
            missing their availability SLAs. slaHigh is intended to serve for reporting
            the feeds which missed their SLAs. SLAs are relative to feed instance time.
        </xs:documentation>
    </xs:annotation>
    <!-- Child elements must appear in exactly this order (xs:sequence). -->
    <xs:sequence>
        <xs:element name="tags" type="KEY_VALUE_PAIR" minOccurs="0">
            <xs:annotation>
                <xs:documentation>
                    tags: a feed specifies an optional list of comma separated tags,
                    Key Value Pairs, separated by comma,
                    which is used for classification of data sets.
                    Example: consumer=consumer@xyz.com, owner=producer@xyz.com, department=forecasting
                </xs:documentation>
            </xs:annotation>
        </xs:element>
        <xs:element name="partitions" type="partitions" minOccurs="0"/>
        <xs:element name="groups" type="group-type" minOccurs="0"/>
        <xs:element name="availabilityFlag" type="xs:string" minOccurs="0"/>
        <xs:element name="frequency" type="frequency-type"/>
        <xs:element name="sla" type="sla" minOccurs="0"/>
        <!-- Optional; defaults to UTC. Bound to java.util.TimeZone through the JAXB javaType below. -->
        <xs:element name="timezone" minOccurs="0" default="UTC">
            <xs:simpleType>
                <xs:annotation>
                    <xs:appinfo>
                        <jaxb:javaType name="java.util.TimeZone" parseMethod="java.util.TimeZone.getTimeZone"
                                       printMethod="org.apache.falcon.entity.v0.SchemaHelper.getTimeZoneId"/>
                    </xs:appinfo>
                </xs:annotation>
                <xs:restriction base="xs:string"/>
            </xs:simpleType>
        </xs:element>
        <xs:element name="late-arrival" type="late-arrival" minOccurs="0"/>
        <xs:element name="clusters" type="clusters"/>
        <!-- Exactly one storage description is required: file system locations or a catalog table. -->
        <xs:choice minOccurs="1" maxOccurs="1">
            <xs:element name="locations" type="locations"/>
            <xs:element name="table" type="catalog-table"/>
        </xs:choice>
        <xs:element name="notification" type="notification" minOccurs="0">
            <xs:annotation>
                <xs:documentation>Notification will help to notify the users about the finished status of Falcon
                    Instance. Currently Email type notification is supported and users must specify the receiver's
                    email address.
                </xs:documentation>
            </xs:annotation>
        </xs:element>
        <xs:element name="ACL" type="ACL"/>
        <xs:element name="schema" type="schema"/>
        <xs:element name="properties" type="properties" minOccurs="0"/>
        <xs:element name="lifecycle" type="lifecycle" minOccurs="0"/>
    </xs:sequence>
    <xs:attribute name="name" type="IDENTIFIER" use="required"/>
    <xs:attribute name="description" type="xs:string"/>
</xs:complexType>
<xs:complexType name="cluster">
    <xs:annotation>
        <xs:documentation>
            Feed references a cluster by its name, before submitting a feed all the
            referenced clusters should be submitted to Falcon.
            type: specifies whether the referenced cluster should be treated as a
            source or target for a feed.
            Validity of a feed on cluster specifies duration for which this feed is
            valid on this cluster.
            Retention specifies how long the feed is retained on this cluster and the
            action to be taken on the feed after the expiry of retention period.
            The retention limit is specified by expression frequency(times), ex: if
            feed should be retained for at least 6 hours then retention's limit="hours(6)".
            The partition attribute contains partition tags. Number of partition tags has
            to be equal to number of partitions specified in feed schema.
            A partition tag can be a wildcard(*), a static string or
            an expression. At least one of the strings has to be an expression.
        </xs:documentation>
    </xs:annotation>
    <!-- Child elements must appear in exactly this order (xs:sequence). -->
    <xs:sequence>
        <xs:element name="validity" type="validity"/>
        <xs:element name="retention" type="retention"/>
        <xs:element name="sla" type="sla" minOccurs="0" maxOccurs="1"/>
        <xs:element name="import" type="import" minOccurs="0" maxOccurs="1"/>
        <xs:element name="export" type="export" minOccurs="0" maxOccurs="1"/>
        <!-- Optional per-cluster storage: at most one of locations or table. -->
        <xs:choice minOccurs="0" maxOccurs="1">
            <xs:element name="locations" type="locations" minOccurs="0"/>
            <xs:element name="table" type="catalog-table"/>
        </xs:choice>
        <xs:element name="lifecycle" type="lifecycle" minOccurs="0"/>
    </xs:sequence>
    <xs:attribute name="name" type="IDENTIFIER" use="required"/>
    <xs:attribute name="type" type="cluster-type" use="optional"/>
    <xs:attribute name="partition" type="xs:string" use="optional"/>
    <xs:attribute name="delay" type="frequency-type" use="optional"/>
</xs:complexType>
<xs:complexType name="partitions">
    <xs:annotation>
        <xs:documentation>
            Holds zero or more partition entries. Each entry is a logical
            partition of the feed; these are maintained in the Hcatalog registry.
        </xs:documentation>
    </xs:annotation>
    <xs:sequence>
        <xs:element name="partition" type="partition" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
</xs:complexType>
<xs:complexType name="schema">
    <xs:annotation>
        <xs:documentation>
            Describes the schema of a feed: the location of the schema file and
            the provider of the schema, such as protobuf, thrift etc.
            Both attributes are required.
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="location" type="xs:string" use="required"/>
    <xs:attribute name="provider" type="xs:string" use="required"/>
</xs:complexType>
<xs:complexType name="properties">
    <xs:annotation>
        <xs:documentation>
            Holds zero or more property entries, each a name-value pair.
        </xs:documentation>
    </xs:annotation>
    <xs:sequence>
        <xs:element name="property" type="property" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
</xs:complexType>
<xs:complexType name="validity">
    <xs:annotation>
        <xs:documentation>
            A validity has a start, the validity start date, and an end, the
            validity end date; both are required.
            ex: start="2011-11-01T00:00Z" in TZ format. timezone can be UTC, GMT.
            Processes referring to this feed would consider the validity period
            for validation.
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="start" type="date-time-type" use="required"/>
    <xs:attribute name="end" type="date-time-type" use="required"/>
</xs:complexType>
<xs:complexType name="sla">
    <xs:annotation>
        <xs:documentation>
            An sla carries two required properties, slaLow and slaHigh, both written
            using expressions like frequency.
            slaLow is intended to serve for alerting for feeds which are in danger of
            missing their availability SLAs.
            slaHigh is intended to serve for reporting the feeds which missed their SLAs.
            SLAs are relative to feed instance time.
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="slaLow" type="frequency-type" use="required"/>
    <xs:attribute name="slaHigh" type="frequency-type" use="required"/>
</xs:complexType>
<xs:complexType name="locations">
    <xs:annotation>
        <xs:documentation>
            Holds zero or more location entries on the file system.
        </xs:documentation>
    </xs:annotation>
    <!-- Repeating choice with a single alternative: any number of location children, including none. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
        <xs:element name="location" type="location"/>
    </xs:choice>
</xs:complexType>
<xs:complexType name="late-arrival">
    <xs:annotation>
        <xs:documentation>
            late-arrival specifies the cut-off period until which the feed is
            expected to arrive late. Processes referring to it as input feed
            should honor it by rerunning the instances in case the data arrives
            late within the cut-off period.
            The cut-off period is specified by expression frequency(times),
            ex: if the feed can arrive late up to 8 hours then
            late-arrival's cut-off="hours(8)"
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="cut-off" type="frequency-type" use="required"/>
</xs:complexType>
<xs:complexType name="property">
    <xs:annotation>
        <xs:documentation>
            A single key-value pair that is propagated to the workflow engine.
            Both name and value are required.
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="name" type="xs:string" use="required"/>
    <xs:attribute name="value" type="xs:string" use="required"/>
</xs:complexType>
<xs:complexType name="clusters">
    <xs:annotation>
        <xs:documentation>
            Holds the clusters of a feed; at least one cluster entry is required.
        </xs:documentation>
    </xs:annotation>
    <xs:sequence>
        <xs:element name="cluster" type="cluster" minOccurs="1" maxOccurs="unbounded"/>
    </xs:sequence>
</xs:complexType>
<!--
  Retention settings of a feed on a cluster: "limit" and "action" are required,
  "type" is optional and defaults to "instance".
-->
<xs:complexType name="retention">
    <xs:attribute name="type" type="retention-type" default="instance"/>
    <xs:attribute name="limit" type="frequency-type" use="required"/>
    <xs:attribute name="action" type="action-type" use="required"/>
</xs:complexType>
<!-- Allowed retention types; "instance" is the only value currently enabled. -->
<xs:simpleType name="retention-type">
    <xs:restriction base="xs:string">
        <xs:enumeration value="instance"/>
        <!-- <xs:enumeration value="age" /> -->
    </xs:restriction>
</xs:simpleType>
<xs:complexType name="location">
    <xs:annotation>
        <xs:documentation>
            A location pairs a type (like data, meta, stats) with the
            corresponding path.
            A feed should at least define the location for type data, which
            specifies the HDFS path pattern where the feed is generated
            periodically.
            ex: type="data" path="/projects/TrafficHourly/${YEAR}-${MONTH}-${DAY}/traffic"
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="type" type="location-type" use="required"/>
    <xs:attribute name="path" type="non-empty-string" use="required"/>
</xs:complexType>
<!-- A single partition entry, identified only by its required name. -->
<xs:complexType name="partition">
    <xs:attribute name="name" type="IDENTIFIER" use="required"/>
</xs:complexType>
<xs:complexType name="notification">
    <xs:annotation>
        <xs:documentation>
            Notification specifies the "type" of notification to be used to send notification.
            Currently email based notification type is supported and user can specify the comma
            separated email address with "to" property.
            e.g: type="email" to="falcon@localhost,hive@localhost"
            "level" property in notification will help to set the frequency of email notification
            in case of Falcon instance failure. In case of feed entity level="attempt" is only
            supported as there is no retry element.
        </xs:documentation>
    </xs:annotation>
    <!-- Required; "email" is the only value the schema accepts. -->
    <xs:attribute name="type" use="required">
        <xs:simpleType>
            <xs:restriction base="xs:string">
                <xs:enumeration value="email"/>
            </xs:restriction>
        </xs:simpleType>
    </xs:attribute>
    <!-- Optional; the schema accepts "attempt" or "instance". -->
    <xs:attribute name="level" use="optional">
        <xs:simpleType>
            <xs:restriction base="xs:string">
                <xs:enumeration value="attempt"/>
                <xs:enumeration value="instance"/>
            </xs:restriction>
        </xs:simpleType>
    </xs:attribute>
    <xs:attribute name="to" type="xs:string" use="required"/>
</xs:complexType>
<xs:complexType name="ACL">
    <xs:annotation>
        <xs:documentation>
            The access control list of this feed.
        </xs:documentation>
    </xs:annotation>
    <!-- None of the attributes is marked required; permission defaults to "*". -->
    <xs:attribute name="owner" type="xs:string"/>
    <xs:attribute name="group" type="xs:string"/>
    <xs:attribute name="permission" type="xs:string" default="*"/>
</xs:complexType>
<xs:simpleType name="action-type">
    <xs:restriction base="xs:string">
        <xs:annotation>
            <xs:documentation>
                The action type names what should be done with a feed once its
                retention period expires on a cluster. The valid actions are
                archive, delete, chown and chmod.
            </xs:documentation>
        </xs:annotation>
        <xs:enumeration value="archive"/>
        <xs:enumeration value="delete"/>
        <xs:enumeration value="chown"/>
        <xs:enumeration value="chmod"/>
    </xs:restriction>
</xs:simpleType>
<xs:complexType name="lifecycle">
    <xs:annotation>
        <xs:documentation>
            The lifecycle of a feed is the set of stages it goes through. Typical stages
            of a feed are import, replication, archival, retention and export; all these
            stages together are called the lifecycle of a feed.
        </xs:documentation>
    </xs:annotation>
    <!-- Only the retention stage is declared here, and it is optional. -->
    <xs:all>
        <xs:element name="retention-stage" type="retention-stage" minOccurs="0"/>
    </xs:all>
</xs:complexType>
<xs:simpleType name="cluster-type">
    <xs:annotation>
        <xs:documentation>
            A cluster on a feed is defined either as source or as target.
            A feed should have at least one source cluster defined.
            The target clusters are used for replication of the feed.
        </xs:documentation>
    </xs:annotation>
    <xs:restriction base="xs:string">
        <xs:enumeration value="source"/>
        <xs:enumeration value="target"/>
    </xs:restriction>
</xs:simpleType>
<!-- Allowed values for the "type" attribute of a location: data, stats, meta or tmp. -->
<xs:simpleType name="location-type">
    <xs:restriction base="xs:string">
        <xs:enumeration value="data"/>
        <xs:enumeration value="stats"/>
        <xs:enumeration value="meta"/>
        <xs:enumeration value="tmp"/>
    </xs:restriction>
</xs:simpleType>
<!--
  Name format: starts with an ASCII letter, followed by ASCII letters, digits or hyphens.
  NOTE(review): the {1,39} quantifier wraps a group that already ends in an unbounded
  "*", so it does not cap the overall length. If a length limit was intended, confirm
  and tighten it separately; the pattern is left unchanged here so existing names stay valid.
-->
<xs:simpleType name="IDENTIFIER">
    <xs:restriction base="xs:string">
        <xs:pattern value="(([a-zA-Z]([\-a-zA-Z0-9])*){1,39})"/>
    </xs:restriction>
</xs:simpleType>
<!--
  Frequency expression of the form unit(count): unit is minutes, hours, days or months,
  count is a positive integer without a leading zero, e.g. hours(6).
  JAXB binds the value to org.apache.falcon.entity.v0.Frequency via fromString / toString.
-->
<xs:simpleType name="frequency-type">
    <xs:annotation>
        <xs:appinfo>
            <jaxb:javaType name="org.apache.falcon.entity.v0.Frequency"
                           parseMethod="org.apache.falcon.entity.v0.Frequency.fromString"
                           printMethod="org.apache.falcon.entity.v0.Frequency.toString"/>
        </xs:appinfo>
    </xs:annotation>
    <xs:restriction base="xs:string">
        <xs:pattern value="(minutes|hours|days|months)\([1-9]\d*\)"/>
    </xs:restriction>
</xs:simpleType>
<!--
  Date-time string: a year starting with 19 or 20, month 01 to 12, day 01 to 31, then "T",
  HH:mm and a literal "Z", e.g. 2011-11-01T00:00Z. The pattern accepts "-", " ", "/" or "."
  as the separator between the date parts.
  JAXB binds the value to java.util.Date via SchemaHelper.parseDateUTC / formatDateUTC.
-->
<xs:simpleType name="date-time-type">
    <xs:annotation>
        <xs:appinfo>
            <jaxb:javaType name="java.util.Date" parseMethod="org.apache.falcon.entity.v0.SchemaHelper.parseDateUTC"
                           printMethod="org.apache.falcon.entity.v0.SchemaHelper.formatDateUTC"/>
        </xs:appinfo>
    </xs:annotation>
    <xs:restriction base="xs:string">
        <xs:pattern
                value="((19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])T([0-1][0-9]|2[0-3]):([0-5][0-9]))Z"/>
    </xs:restriction>
</xs:simpleType>
<!-- One or more group names separated by commas, with no spaces, e.g. online,bi -->
<xs:simpleType name="group-type">
    <xs:restriction base="xs:string">
        <xs:pattern value="(\w+(,\w+)*)"/>
    </xs:restriction>
</xs:simpleType>
<!--
  Comma separated key=value pairs, optionally with spaces after a comma,
  e.g. owner=producer@xyz.com, department=forecasting.
  Every part of the pattern is optional, so an empty string is accepted as well.
-->
<xs:simpleType name="KEY_VALUE_PAIR">
    <xs:restriction base="xs:string">
        <xs:pattern value="([\w_]+=[^,]+)?([,]?[ ]*[\w_]+=[^,]+)*"/>
    </xs:restriction>
</xs:simpleType>
<xs:complexType name="catalog-table">
    <xs:annotation>
        <xs:documentation>
            A catalog table is given by the uri of a Hive table along with the partition spec.
            uri="catalog:$database:$table#(partition-key=partition-value);+"
            Example: catalog:logs-db:clicks#ds=${YEAR}-${MONTH}-${DAY}
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="uri" type="xs:string" use="required"/>
</xs:complexType>
<!-- A string that must contain at least one character. -->
<xs:simpleType name="non-empty-string">
    <xs:restriction base="xs:string">
        <xs:minLength value="1"/>
    </xs:restriction>
</xs:simpleType>
<!-- Import definition: a required "source" datasource followed by optional "arguments". -->
<xs:complexType name="import">
    <xs:sequence>
        <xs:element name="source" type="datasource"/>
        <xs:element name="arguments" type="arguments" minOccurs="0"/>
    </xs:sequence>
</xs:complexType>
<!-- Export definition: a required "target" datasource followed by optional "arguments". -->
<xs:complexType name="export">
    <xs:sequence>
        <xs:element name="target" type="datasource"/>
        <xs:element name="arguments" type="arguments" minOccurs="0"/>
    </xs:sequence>
</xs:complexType>
<xs:complexType name="datasource">
    <xs:annotation>
        <xs:documentation>
            Specifies the source entity name from which data can be imported or exported.
            This can be Database or other data source types in the future. The connection
            and authentication details of the data source are defined in the Datasource
            entity.
            Table name specifies the table to import or export depending on the action type.
            Extract type specifies an extraction method (full or incremental).
            DeltaColumn specifies the column name on source database table
            to identify the new data since the last extraction.
            Merge type specifies how the data will be organized on Hadoop.
            The supported types are snapshot (as in a particular time) or append
            (as in timeseries partitions).
            Load type specifies if new rows are inserted (load type=allowinsert) into
            the database table or updated (load type=updateonly). If updateonly load type
            is specified, then update columns need to be passed via the arguments.
            Fields can be specified as includes or excludes of fields. If exclusion list
            is specified, all columns except the ones specified will be imported or exported.
            If inclusion list is specified, only the specified columns are exported or imported.
        </xs:documentation>
    </xs:annotation>
    <xs:sequence>
        <!-- Exactly one of extract or load must be present. -->
        <xs:choice minOccurs="1" maxOccurs="1">
            <xs:element name="extract" type="extract"/>
            <xs:element name="load" type="load"/>
        </xs:choice>
        <xs:element name="fields" type="fields-type" minOccurs="0" maxOccurs="1"/>
    </xs:sequence>
    <xs:attribute name="name" type="non-empty-string" use="required"/>
    <xs:attribute name="tableName" type="non-empty-string" use="required"/>
</xs:complexType>
<!--
  Extraction settings: an optional deltacolumn followed by a required mergepolicy,
  plus the required extraction method in the "type" attribute.
-->
<xs:complexType name="extract">
    <xs:sequence>
        <xs:element name="deltacolumn" type="xs:string" minOccurs="0" maxOccurs="1"/>
        <xs:element name="mergepolicy" type="merge-type" minOccurs="1" maxOccurs="1"/>
    </xs:sequence>
    <xs:attribute name="type" type="extract-method" use="required"/>
</xs:complexType>
<!-- Load settings: only the required load method in the "type" attribute. -->
<xs:complexType name="load">
    <xs:attribute name="type" type="load-method" use="required"/>
</xs:complexType>
<!-- Allowed extraction methods: full or incremental. -->
<xs:simpleType name="extract-method">
    <xs:restriction base="xs:string">
        <xs:enumeration value="full"/>
        <xs:enumeration value="incremental"/>
    </xs:restriction>
</xs:simpleType>
<!-- Allowed load methods: updateonly or allowinsert. -->
<xs:simpleType name="load-method">
    <xs:restriction base="xs:string">
        <xs:enumeration value="updateonly"/>
        <xs:enumeration value="allowinsert"/>
    </xs:restriction>
</xs:simpleType>
<!-- Allowed merge policies: snapshot or append. -->
<xs:simpleType name="merge-type">
    <xs:restriction base="xs:string">
        <xs:enumeration value="snapshot"/>
        <xs:enumeration value="append"/>
    </xs:restriction>
</xs:simpleType>
<xs:complexType name="fields-type">
    <xs:annotation>
        <xs:documentation>
            Holds either an include list or an exclude list of fields, never both.
            If include field list is specified, only the specified fields will be imported.
            If exclude field list is specified, all fields except the ones specified will be
            imported from datasource to HDFS.
        </xs:documentation>
    </xs:annotation>
    <xs:choice minOccurs="1" maxOccurs="1">
        <xs:element name="includes" type="field-include-exclude"/>
        <xs:element name="excludes" type="field-include-exclude"/>
    </xs:choice>
</xs:complexType>
<!-- A list of one or more field names, shared by the includes and excludes elements. -->
<xs:complexType name="field-include-exclude">
    <xs:sequence>
        <xs:element name="field" type="xs:string" minOccurs="1" maxOccurs="unbounded"/>
    </xs:sequence>
</xs:complexType>
<xs:complexType name="arguments">
    <xs:annotation>
        <xs:documentation>
            Holds zero or more argument entries: name-value pairs of extra arguments
            to be passed to the concrete implementation.
        </xs:documentation>
    </xs:annotation>
    <xs:sequence>
        <xs:element name="argument" type="argument" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>
</xs:complexType>
<xs:complexType name="argument">
    <xs:annotation>
        <xs:documentation>
            A single key-value pair that is used while invoking ingestion engines.
            Both name and value are required.
        </xs:documentation>
    </xs:annotation>
    <xs:attribute name="name" type="xs:string" use="required"/>
    <xs:attribute name="value" type="xs:string" use="required"/>
</xs:complexType>
<xs:complexType name="retention-stage">
    <xs:annotation>
        <xs:documentation>
            Retention stage is the new way to define retention for a feed using the feed
            lifecycle feature. Retention has a configurable policy which does the validation
            and the real execution through workflow engine. This method of specifying
            retention gives you more control like using different queue name, priority
            and execution-order for retention than other lifecycle stages of feed like
            replication.
        </xs:documentation>
    </xs:annotation>
    <!-- xs:all: every child is optional, may appear at most once, and order is free. -->
    <xs:all>
        <xs:element name="policy" type="non-empty-string" minOccurs="0" maxOccurs="1"/>
        <xs:element name="frequency" type="frequency-type" minOccurs="0" maxOccurs="1"/>
        <xs:element name="queue" type="xs:string" minOccurs="0" maxOccurs="1"/>
        <xs:element name="priority" type="xs:string" minOccurs="0" maxOccurs="1"/>
        <xs:element name="properties" type="properties" minOccurs="0" maxOccurs="1"/>
    </xs:all>
</xs:complexType>
</xs:schema>