<dataConfig>
  <!--
    Data Import Handler configuration: fetches the Atom feed named in the
    entity's url attribute and maps XPath locations in that feed to
    document columns.
  -->
  <dataSource type="URLDataSource"/>
  <document>

    <!--
      forEach lists two record roots separated by '|': /feed, so that
      feed level values can be read, and /feed/entry for the individual
      entries. The transformer attribute enables the stripHTML and
      regex/replaceWith attributes used on the fields below.
    -->
    <entity name="stackoverflow"
            url="https://stackoverflow.com/feeds/tag/solr"
            processor="XPathEntityProcessor"
            forEach="/feed|/feed/entry"
            transformer="HTMLStripTransformer,RegexTransformer">

      <!-- Pick this value up once at the feed level; commonField="true"
           applies it to all documents. -->
      <field column="lastchecked_dt" xpath="/feed/updated" commonField="true"/>

      <!-- Keep only the final numeric part of the id URL: the greedy regex
           matches everything up to and including the last '/' and replaces
           it with the empty string. -->
      <field column="id" xpath="/feed/entry/id" regex=".*/" replaceWith=""/>

      <!-- Values copied as they appear in the feed. -->
      <field column="title" xpath="/feed/entry/title"/>
      <field column="author" xpath="/feed/entry/author/name"/>
      <field column="category" xpath="/feed/entry/category/@term"/>
      <field column="link" xpath="/feed/entry/link[@rel='alternate']/@href"/>

      <!--
        Use transformers to convert HTML into plain text: stripHTML removes
        the markup, then the regex collapses each run of spaces and newlines
        into a single space.
        There is also an UpdateRequestProcessor (configured outside this
        file) to trim remaining spaces.
      -->
      <field column="summary"
             xpath="/feed/entry/summary"
             stripHTML="true"
             regex="( |\n)+"
             replaceWith=" "/>

      <!-- Namespaces are ignored when matching XPath, so the path is written
           without a prefix. NOTE(review): presumably the rank element is
           namespaced in the feed; confirm against the feed. -->
      <field column="rank" xpath="/feed/entry/rank"/>

      <!-- Per entry timestamps, taken from the feed unchanged. -->
      <field column="published_dt" xpath="/feed/entry/published"/>
      <field column="updated_dt" xpath="/feed/entry/updated"/>
    </entity>

  </document>
</dataConfig>