blob: bbe62219517ba2d5e4447cf5d6c94ec2a0b42045 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.nutch.metadata.Metadata;
/** A {@link NutchDocument} is the unit of indexing. */
public class NutchDocument implements Writable,
Iterable<Entry<String, NutchField>>, Cloneable {
public static final byte VERSION = 2;
private HashMap<String, NutchField> fields;
private Metadata documentMeta;
private float weight;
public NutchDocument() {
fields = new HashMap<>();
documentMeta = new Metadata();
weight = 1.0f;
}
public void add(String name, Object value) {
NutchField field = fields.get(name);
if (field == null) {
field = new NutchField(value);
fields.put(name, field);
} else {
field.add(value);
}
}
public Object getFieldValue(String name) {
NutchField field = fields.get(name);
if (field == null) {
return null;
}
if (field.getValues().size() == 0) {
return null;
}
return field.getValues().get(0);
}
public NutchField getField(String name) {
return fields.get(name);
}
public NutchField removeField(String name) {
return fields.remove(name);
}
public Collection<String> getFieldNames() {
return fields.keySet();
}
/** Iterate over all fields. */
public Iterator<Entry<String, NutchField>> iterator() {
return fields.entrySet().iterator();
}
public float getWeight() {
return weight;
}
public void setWeight(float weight) {
this.weight = weight;
}
public Metadata getDocumentMeta() {
return documentMeta;
}
public void readFields(DataInput in) throws IOException {
fields.clear();
byte version = in.readByte();
if (version != VERSION) {
throw new VersionMismatchException(VERSION, version);
}
int size = WritableUtils.readVInt(in);
for (int i = 0; i < size; i++) {
String name = Text.readString(in);
NutchField field = new NutchField();
field.readFields(in);
fields.put(name, field);
}
weight = in.readFloat();
documentMeta.readFields(in);
}
public void write(DataOutput out) throws IOException {
out.writeByte(VERSION);
WritableUtils.writeVInt(out, fields.size());
for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
Text.writeString(out, entry.getKey());
NutchField field = entry.getValue();
field.write(out);
}
out.writeFloat(weight);
documentMeta.write(out);
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("doc {\n");
for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
sb.append("\t");
sb.append(entry.getKey());
sb.append(":\t");
sb.append(entry.getValue());
sb.append("\n");
}
sb.append("}\n");
return sb.toString();
}
@Override
public NutchDocument clone() throws CloneNotSupportedException {
NutchDocument clonedDocument = (NutchDocument) super.clone();
clonedDocument.fields = new HashMap<>();
for (Entry<String, NutchField> field : this.fields.entrySet()) {
clonedDocument.fields.put(field.getKey(), field.getValue().clone());
}
return clonedDocument;
}
}