| <!DOCTYPE html> |
| <!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| |
| |
| <title>XML Parser - Apache Apex Malhar Documentation</title> |
| |
| |
| <link rel="shortcut icon" href="../../favicon.ico"> |
| |
| |
| |
| <link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'> |
| |
| <link rel="stylesheet" href="../../css/theme.css" type="text/css" /> |
| <link rel="stylesheet" href="../../css/theme_extra.css" type="text/css" /> |
| <link rel="stylesheet" href="../../css/highlight.css"> |
| |
| |
| <script> |
| // Current page data |
| var mkdocs_page_name = "XML Parser"; |
| var mkdocs_page_input_path = "operators/xmlParserOperator.md"; |
| var mkdocs_page_url = "/operators/xmlParserOperator/"; |
| </script> |
| |
| <script src="../../js/jquery-2.1.1.min.js"></script> |
| <script src="../../js/modernizr-2.8.3.min.js"></script> |
| <script type="text/javascript" src="../../js/highlight.pack.js"></script> |
| <script src="../../js/theme.js"></script> |
| |
| |
| </head> |
| |
| <body class="wy-body-for-nav" role="document"> |
| |
| <div class="wy-grid-for-nav"> |
| |
| |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav"> |
| <div class="wy-side-nav-search"> |
| <a href="../.." class="icon icon-home"> Apache Apex Malhar Documentation</a> |
| <div role="search"> |
| <form id ="rtd-search-form" class="wy-form" action="../../search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" /> |
| </form> |
| </div> |
| </div> |
| |
| <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> |
| <ul class="current"> |
| |
| <li> |
| <li class="toctree-l1 "> |
| <a class="" href="../..">Apache Apex Malhar</a> |
| |
| </li> |
| <li> |
| |
| <li> |
| <ul class="subnav"> |
| <li><span>APIs</span></li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../../apis/calcite/">SQL</a> |
| |
| </li> |
| |
| |
| </ul> |
| <li> |
| |
| <li> |
| <ul class="subnav"> |
| <li><span>Operators</span></li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../block_reader/">Block Reader</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../csvformatter/">CSV Formatter</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../csvParserOperator/">CSV Parser</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../deduper/">Deduper</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../enricher/">Enricher</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../fsInputOperator/">File Input</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../file_output/">File Output</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../file_splitter/">File Splitter</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../filter/">Filter</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../fixedWidthParserOperator/">Fixed Width Parser</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../ftpInputOperator/">FTP Input Operator</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../AbstractJdbcTransactionableOutputOperator/">Jdbc Output Operator</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../jdbcPollInputOperator/">JDBC Poller Input</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../jmsInputOperator/">JMS Input</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../jsonFormatter/">JSON Formatter</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../jsonParser/">JSON Parser</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../kafkaInputOperator/">Kafka Input</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../regexparser/">Regex Parser</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../s3outputmodule/">S3 Output Module</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../transform/">Transformer</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 "> |
| <a class="" href="../windowedOperator/">Windowed Operator</a> |
| |
| </li> |
| |
| |
| |
| <li class="toctree-l1 current"> |
| <a class="current" href="./">XML Parser</a> |
| |
| <ul> |
| |
| <li class="toctree-l3"><a href="#xml-parser">Xml Parser</a></li> |
| |
| <li><a class="toctree-l4" href="#operator-objective">Operator Objective</a></li> |
| |
| <li><a class="toctree-l4" href="#class-diagram">Class Diagram</a></li> |
| |
| <li><a class="toctree-l4" href="#operator-information">Operator Information</a></li> |
| |
| <li><a class="toctree-l4" href="#properties-attributes-and-ports">Properties, Attributes and Ports</a></li> |
| |
| <li><a class="toctree-l4" href="#partitioning">Partitioning</a></li> |
| |
| <li><a class="toctree-l4" href="#example">Example</a></li> |
| |
| |
| </ul> |
| |
| </li> |
| |
| |
| </ul> |
| <li> |
| |
| </ul> |
| </div> |
| |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> |
| |
| |
| <nav class="wy-nav-top" role="navigation" aria-label="top navigation"> |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="../..">Apache Apex Malhar Documentation</a> |
| </nav> |
| |
| |
| <div class="wy-nav-content"> |
| <div class="rst-content"> |
| <div role="navigation" aria-label="breadcrumbs navigation"> |
| <ul class="wy-breadcrumbs"> |
| <li><a href="../..">Docs</a> »</li> |
| |
| |
| |
| <li>Operators »</li> |
| |
| |
| |
| <li>XML Parser</li> |
| <li class="wy-breadcrumbs-aside"> |
| |
| </li> |
| </ul> |
| <hr/> |
| </div> |
| <div role="main"> |
| <div class="section"> |
| |
| <h1 id="xml-parser">Xml Parser</h1> |
| <h2 id="operator-objective">Operator Objective</h2> |
| <p>The XmlParser operator parses XML records and constructs POJOs ("Plain Old Java Objects") from them. The operator also emits each record as a DOM Document if the relevant output port is connected. Users can also provide an XSD (XML Schema Definition) to validate incoming XML records. Valid records are emitted as POJOs / DOM Documents, while invalid ones are emitted on the error port with an error message if the error port is connected.</p> |
| <p>XmlParser is <strong>idempotent</strong>, <strong>fault-tolerant</strong> and <strong>statically/dynamically partitionable</strong>.</p> |
| <h2 id="class-diagram">Class Diagram</h2> |
| <p><img alt="XmlParser class diagram" src="../images/xmlParser/XmlParser.png" /></p> |
| <h2 id="operator-information">Operator Information</h2> |
| <ol> |
| <li>Operator location: <strong><em>malhar-library</em></strong></li> |
| <li>Available since: <strong><em>3.2.0</em></strong></li> |
| <li>Operator state: <strong><em>Evolving</em></strong></li> |
| <li>Java Package: <a href="https://github.com/apache/apex-malhar/blob/master/library/src/main/java/com/datatorrent/lib/parser/XmlParser.java">com.datatorrent.lib.parser.XmlParser</a></li> |
| </ol> |
| <h2 id="properties-attributes-and-ports">Properties, Attributes and Ports</h2> |
| <h3 id="properties-of-xml-parser"><a name="props"></a>Properties of Xml Parser</h3> |
| <table> |
| <thead> |
| <tr> |
| <th><strong>Property</strong></th> |
| <th><strong>Description</strong></th> |
| <th><strong>Type</strong></th> |
| <th><strong>Mandatory</strong></th> |
| <th><strong>Default Value</strong></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td><em>schemaXSDFile</em></td> |
| <td>XSD (XML Schema Definition) describing the XML data. Incoming records can be validated using the schemaXSDFile. Records that do not meet the requirements specified in schemaXSDFile are emitted on the error port. This is an optional property. If the XSD is not provided, incoming tuples are simply converted to POJOs or DOM Documents without any validation.</td> |
| <td>String</td> |
| <td>No</td> |
| <td>N/A</td> |
| </tr> |
| </tbody> |
| </table> |
| <h3 id="platform-attributes-that-influence-operator-behavior">Platform Attributes that influence operator behavior</h3> |
| <table> |
| <thead> |
| <tr> |
| <th><strong>Attribute</strong></th> |
| <th><strong>Description</strong></th> |
| <th><strong>Type</strong></th> |
| <th><strong>Mandatory</strong></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td><em>out.TUPLE_CLASS</em></td> |
| <td>TUPLE_CLASS attribute on the output port, which tells the operator the class of the POJO to be emitted. The names of the fields of the class must match the names in the incoming XML record. The operator ignores unknown properties, i.e. fields present in the XML record but not in TUPLE_CLASS or vice versa.</td> |
| <td>Class or FQCN</td> |
| <td>Yes</td> |
| </tr> |
| </tbody> |
| </table> |
| <h3 id="ports">Ports</h3> |
| <table> |
| <thead> |
| <tr> |
| <th><strong>Port</strong></th> |
| <th><strong>Description</strong></th> |
| <th><strong>Type</strong></th> |
| <th><strong>Mandatory</strong></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td><em>in</em></td> |
| <td>Tuples that need to be parsed are received on this port.</td> |
| <td>byte[]</td> |
| <td>Yes</td> |
| </tr> |
| <tr> |
| <td><em>out</em></td> |
| <td>Valid tuples are emitted as POJOs on this port. Tuples are converted to POJOs only if the port is connected.</td> |
| <td>Object (POJO)</td> |
| <td>No</td> |
| </tr> |
| <tr> |
| <td><em>parsedOutput</em></td> |
| <td>Valid tuples are emitted as DOM Documents on this port. Tuples are converted to DOM Documents only if the port is connected.</td> |
| <td>DOM Document</td> |
| <td>No</td> |
| </tr> |
| <tr> |
| <td><em>err</em></td> |
| <td>Invalid tuples are emitted with an error message. Invalid tuples are discarded if the port is not connected.</td> |
| <td>KeyValPair&lt;String, String&gt;</td> |
| <td>No</td> |
| </tr> |
| </tbody> |
| </table> |
| <h2 id="partitioning">Partitioning</h2> |
| <p>XML Parser is both statically and dynamically partitionable.</p> |
| <h3 id="static-partitioning">Static Partitioning</h3> |
| <p>This can be achieved in 2 ways</p> |
| <ol> |
| <li>Specifying the partitioner and number of partitions in the 'populateDAG()' method.</li> |
| </ol> |
| <pre><code class="java">XmlParser xmlParser = dag.addOperator("xmlParser", XmlParser.class); |
| StatelessPartitioner&lt;XmlParser&gt; partitioner1 = new StatelessPartitioner&lt;XmlParser&gt;(2); |
| dag.setAttribute(xmlParser, Context.OperatorContext.PARTITIONER, partitioner1 ); |
| </code></pre> |
| |
| <ol start="2"> |
| <li>Specifying the partitioner and number of partitions in properties file.</li> |
| </ol> |
| <pre><code class="xml"> &lt;property&gt; |
| &lt;name&gt;dt.operator.{OperatorName}.attr.PARTITIONER&lt;/name&gt; |
| &lt;value&gt;com.datatorrent.common.partitioner.StatelessPartitioner:2&lt;/value&gt; |
| &lt;/property&gt; |
| </code></pre> |
| |
| <p>where {OperatorName} is the name of the XmlParser operator. |
| The above lines statically partition XmlParser into 2 partitions. Change the number after the colon in the value to change the number of static partitions.</p> |
| <h3 id="dynamic-partitioning">Dynamic Partitioning</h3> |
| <p>XmlParser can be dynamically partitioned using an out-of-the-box partitioner:</p> |
| <h4 id="throughput-based">Throughput based</h4> |
| <p>The following code can be added to the 'populateDAG' method of the application to dynamically partition XmlParser:</p> |
| <pre><code class="java">XmlParser xmlParser = dag.addOperator("xmlParser", XmlParser.class); |
| StatelessThroughputBasedPartitioner&lt;XmlParser&gt; partitioner = new StatelessThroughputBasedPartitioner&lt;&gt;(); |
| partitioner.setCooldownMillis(conf.getLong("dt.cooldown", 10000)); |
| partitioner.setMaximumEvents(conf.getLong("dt.maxThroughput", 30000)); |
| partitioner.setMinimumEvents(conf.getLong("dt.minThroughput", 10000)); |
| dag.setAttribute(xmlParser, OperatorContext.STATS_LISTENERS, Arrays.asList(new StatsListener[]{partitioner})); |
| dag.setAttribute(xmlParser, OperatorContext.PARTITIONER, partitioner); |
| </code></pre> |
| |
| <p>The above code dynamically partitions XmlParser when the throughput changes. |
| If the overall throughput of XmlParser rises above 30000 or falls below 10000, the platform will repartition XmlParser |
| so that the throughput of a single partition stays between 10000 and 30000. |
| A 'dt.cooldown' of 10000 milliseconds is used as the threshold time over which the throughput change is observed.</p> |
| <h2 id="example">Example</h2> |
| <p>An example application that uses the XML Parser can be found at: <a href="https://github.com/DataTorrent/examples/tree/master/tutorials/parser">https://github.com/DataTorrent/examples/tree/master/tutorials/parser</a></p> |
| |
| </div> |
| </div> |
| <footer> |
| |
| <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> |
| |
| |
| <a href="../windowedOperator/" class="btn btn-neutral" title="Windowed Operator"><span class="icon icon-circle-arrow-left"></span> Previous</a> |
| |
| </div> |
| |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <!-- Copyright etc --> |
| |
| </div> |
| |
| Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| </footer> |
| |
| </div> |
| </div> |
| |
| </section> |
| |
| </div> |
| |
| <div class="rst-versions" role="note" style="cursor: pointer"> |
| <span class="rst-current-version" data-toggle="rst-current-version"> |
| |
| |
| <span><a href="../windowedOperator/" style="color: #fcfcfc;">« Previous</a></span> |
| |
| |
| </span> |
| </div> |
| |
| </body> |
| </html> |