// blob: 0699cfaf70a38b5e4eefe796d4c2484285381159 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.examples.wikisearch.ingest;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
 * Extracts one article from an XML stream using a StAX parser.
 *
 * <p>
 * The extractor collects the character content of the {@code title}, {@code id}, {@code timestamp},
 * {@code comment} and {@code text} elements and produces an {@link Article} as soon as the closing
 * {@code text} tag is reached.
 */
public class ArticleExtractor {

  /**
   * Parses the content of a {@code timestamp} element after {@code "+0000"} has been appended to it.
   *
   * <p>
   * {@link SimpleDateFormat} is not thread-safe. {@link #extract(Reader)} synchronizes on this
   * instance while parsing; any other code using this field concurrently should do the same.
   */
  public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z");

  private static final NumberNormalizer nn = new NumberNormalizer();
  private static final LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer();

  // Names of the elements the parser reacts to (no namespace).
  private static final QName TITLE_NAME = QName.valueOf("title");
  private static final QName TEXT_NAME = QName.valueOf("text");
  private static final QName REVISION_NAME = QName.valueOf("revision");
  private static final QName TIMESTAMP_NAME = QName.valueOf("timestamp");
  private static final QName COMMENT_NAME = QName.valueOf("comment");
  private static final QName ID_NAME = QName.valueOf("id");

  /**
   * One extracted article. Serialized through the {@link Writable} contract in the order: id (int),
   * title, timestamp (long), comments, text.
   */
  public static class Article implements Writable {
    int id;
    String title;
    long timestamp;
    String comments;
    String text;

    /** Creates an empty article; the fields are populated by {@link #readFields(DataInput)}. */
    public Article() {}

    private Article(int id, String title, long timestamp, String comments, String text) {
      super();
      this.id = id;
      this.title = title;
      this.timestamp = timestamp;
      this.comments = comments;
      this.text = text;
    }

    /** @return the article id */
    public int getId() {
      return id;
    }

    /** @return the article title */
    public String getTitle() {
      return title;
    }

    /** @return the content of the article's comment element */
    public String getComments() {
      return comments;
    }

    /** @return the article body */
    public String getText() {
      return text;
    }

    /** @return the article timestamp in milliseconds since the epoch */
    public long getTimestamp() {
      return timestamp;
    }

    /**
     * Returns the raw metadata fields keyed by field name ({@code ID}, {@code TITLE},
     * {@code TIMESTAMP}, {@code COMMENTS}). The article text is not included.
     *
     * @return a new, mutable map of field name to value
     */
    public Map<String,Object> getFieldValues() {
      Map<String,Object> fields = new HashMap<String,Object>();
      fields.put("ID", this.id);
      fields.put("TITLE", this.title);
      fields.put("TIMESTAMP", this.timestamp);
      fields.put("COMMENTS", this.comments);
      return fields;
    }

    /**
     * Returns the same fields as {@link #getFieldValues()} after normalization: {@code ID} and
     * {@code TIMESTAMP} go through the number normalizer, {@code TITLE} and {@code COMMENTS} through
     * the lower-case/no-diacritics normalizer.
     *
     * @return a new, mutable map of field name to normalized value
     */
    public Map<String,String> getNormalizedFieldValues() {
      Map<String,String> fields = new HashMap<String,String>();
      fields.put("ID", nn.normalizeFieldValue("ID", this.id));
      fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title));
      fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
      fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments));
      return fields;
    }

    /**
     * Reads the fields in the order written by {@link #write(DataOutput)}.
     *
     * @param in
     *          source of the serialized article
     * @throws IOException
     *           if reading from {@code in} fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
      id = in.readInt();
      // A single Text instance is reused as the decoding buffer for all three strings.
      Text buffer = new Text();
      buffer.readFields(in);
      title = buffer.toString();
      timestamp = in.readLong();
      buffer.readFields(in);
      comments = buffer.toString();
      buffer.readFields(in);
      text = buffer.toString();
    }

    /**
     * Writes id, title, timestamp, comments and text, in that order.
     *
     * <p>
     * NOTE(review): the string fields are handed to {@code Text} as-is; confirm callers never
     * serialize an article whose title, comments or text is still {@code null}.
     *
     * @param out
     *          destination of the serialized article
     * @throws IOException
     *           if writing to {@code out} fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
      out.writeInt(id);
      (new Text(title)).write(out);
      out.writeLong(timestamp);
      (new Text(comments)).write(out);
      (new Text(text)).write(out);
    }
  }

  public ArticleExtractor() {}

  private static final XMLInputFactory xmlif = XMLInputFactory.newInstance();

  static {
    xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
    // Never resolve external entities: the parsed documents come from outside this process.
    xmlif.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
  }

  /**
   * Parses the XML supplied by {@code reader} and returns the first article found in it.
   *
   * <p>
   * The StAX reader created for the call is always closed before returning; the supplied
   * {@code reader} itself is left open for the caller to manage.
   *
   * @param reader
   *          character source of the XML to parse
   * @return the extracted article, or {@code null} if the input ends before a closing {@code text}
   *         tag is seen or if the collected id or timestamp cannot be parsed
   * @throws RuntimeException
   *           wrapping the {@link XMLStreamException} if the XML cannot be read
   */
  public Article extract(Reader reader) {
    XMLStreamReader xmlr;
    try {
      xmlr = xmlif.createXMLStreamReader(reader);
    } catch (XMLStreamException e) {
      throw new RuntimeException(e);
    }
    try {
      return readArticle(xmlr);
    } finally {
      closeQuietly(xmlr);
    }
  }

  /** Walks the event stream, collecting element content until the closing {@code text} tag. */
  private static Article readArticle(XMLStreamReader xmlr) {
    Map<QName,StringBuilder> tags = new HashMap<QName,StringBuilder>();
    for (QName tag : new QName[] {TITLE_NAME, TEXT_NAME, TIMESTAMP_NAME, COMMENT_NAME, ID_NAME}) {
      tags.put(tag, new StringBuilder());
    }

    // Buffer of the tracked element currently being read, or null when outside of one.
    StringBuilder current = null;
    boolean inRevision = false;

    while (true) {
      try {
        if (!xmlr.hasNext()) {
          break;
        }
        xmlr.next();
      } catch (XMLStreamException e) {
        throw new RuntimeException(e);
      }

      QName currentName = xmlr.hasName() ? xmlr.getName() : null;

      if (xmlr.isStartElement() && tags.containsKey(currentName)) {
        // An id nested inside a revision must not overwrite the id collected outside of it.
        if (!inRevision || !ID_NAME.equals(currentName)) {
          current = tags.get(currentName);
          current.setLength(0);
        }
      } else if (xmlr.isStartElement() && REVISION_NAME.equals(currentName)) {
        inRevision = true;
      } else if (xmlr.isEndElement() && REVISION_NAME.equals(currentName)) {
        inRevision = false;
      } else if (xmlr.isEndElement() && current != null) {
        if (TEXT_NAME.equals(currentName)) {
          // The closing text tag completes the article; nothing after it is read.
          return buildArticle(tags);
        }
        current = null;
      } else if (current != null && xmlr.hasText()) {
        current.append(xmlr.getText());
      }
    }
    return null;
  }

  /** Converts the collected element content into an Article, or null if id/timestamp are invalid. */
  private static Article buildArticle(Map<QName,StringBuilder> tags) {
    int id;
    try {
      id = Integer.parseInt(tags.get(ID_NAME).toString().trim());
    } catch (NumberFormatException e) {
      // Treated like an unparseable timestamp: the article is reported as not extractable.
      return null;
    }

    long timestamp;
    try {
      String stamp = tags.get(TIMESTAMP_NAME).toString() + "+0000";
      // SimpleDateFormat keeps mutable parse state, so access to the shared instance is serialized.
      synchronized (dateFormat) {
        timestamp = dateFormat.parse(stamp).getTime();
      }
    } catch (ParseException e) {
      return null;
    }

    String title = tags.get(TITLE_NAME).toString();
    String comment = tags.get(COMMENT_NAME).toString();
    String text = tags.get(TEXT_NAME).toString();
    return new Article(id, title, timestamp, comment, text);
  }

  /** Releases the parser's resources without letting a close failure mask the parse outcome. */
  private static void closeQuietly(XMLStreamReader xmlr) {
    try {
      xmlr.close();
    } catch (XMLStreamException ignored) {
      // Parsing has already finished or failed; there is nothing useful to do with this error.
    }
  }
}