<?xml version="1.0" encoding="UTF-8"?>
<dataConfig>
  <!-- Fetch the import data over HTTP(S) from the URL given on the entity. -->
  <dataSource type="URLDataSource"/>

  <document>
    <!--
      Import the Stack Overflow Atom feed for the "solr" tag.
      forEach matches both the feed root and each entry, so that a
      feed-level value can be captured alongside the per-entry fields.
    -->
    <entity name="stackoverflow"
            processor="XPathEntityProcessor"
            url="https://stackoverflow.com/feeds/tag/solr"
            forEach="/feed|/feed/entry"
            transformer="HTMLStripTransformer,RegexTransformer">

      <!-- Feed-level timestamp: read once at /feed and, being a common field,
           applied to every document. -->
      <field column="lastchecked_dt"
             xpath="/feed/updated"
             commonField="true"/>

      <!-- The entry id is a URL; strip everything up to and including the
           last "/" so that only the final numeric part remains. -->
      <field column="id"
             xpath="/feed/entry/id"
             regex=".*/"
             replaceWith=""/>

      <!-- Values copied straight from each entry. -->
      <field column="title"    xpath="/feed/entry/title"/>
      <field column="author"   xpath="/feed/entry/author/name"/>
      <field column="category" xpath="/feed/entry/category/@term"/>
      <field column="link"     xpath="/feed/entry/link[@rel='alternate']/@href"/>

      <!-- Turn the HTML summary into plain text: stripHTML removes the markup
           and the regex collapses each run of spaces/newlines into one space.
           There is also an UpdateRequestProcessor to trim remaining spaces. -->
      <field column="summary"
             xpath="/feed/entry/summary"
             stripHTML="true"
             regex="( |\n)+"
             replaceWith=" "/>

      <!-- Namespaces are ignored when matching XPath, so no prefix is
           written in this path. -->
      <field column="rank" xpath="/feed/entry/rank"/>

      <!-- Per-entry timestamps. -->
      <field column="published_dt" xpath="/feed/entry/published"/>
      <field column="updated_dt"   xpath="/feed/entry/updated"/>
    </entity>
  </document>
</dataConfig>