blob: d643159f9707b656b3d78cf6c82b0f920ae7b2bb [file] [log] [blame]
package joshua.decoder.ff;
import joshua.decoder.Decoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* An implementation of a sparse feature vector, using for representing both weights and feature
* values.
*
* @author Matt Post <post@cs.jhu.edu>
*/
public class FeatureVector {
private HashMap<String, Float> features;
public FeatureVector() {
features = new HashMap<String, Float>();
}
public FeatureVector(String feature, float value) {
features = new HashMap<String, Float>();
features.put(feature, value);
}
/**
* This version of the constructor takes an uninitialized feature with potentially intermingled
* labeled and unlabeled feature values, of the format:
*
* [feature1=]value [feature2=]value
*
* It produces a Feature Vector where all unlabeled features have been labeled by appending the
* unlabeled feature index (starting at 0) to the defaultPrefix value.
*
* **IMPORTANT** The feature values are inverted, for historical reasons, which leads to a lot
* of confusion. They have to be inverted here and when the score is actually computed. They
* are inverted here (which is used to build the feature vector representation of a rule's dense
* features) and in {@link BilingualRule::estimateRuleCost()}, where the rule's precomputable
* (weighted) score is cached.
*
* @param featureString, the string of labeled and unlabeled features (probably straight from the
* grammar text file)
* @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_")
*/
public FeatureVector(String featureString, String prefix) {
/*
* Read through the features on this rule, adding them to the feature vector. Unlabeled features
* are converted to a canonical form.
*
* Note that it's bad form to mix unlabeled features and the named feature index they are mapped
* to, but we are being liberal in what we accept.
*
* IMPORTANT: Note that, for historical reasons, the sign is reversed on all scores.
* This is the source of *no end* of confusion and should be done away with.
*/
features = new HashMap<String, Float>();
int denseFeatureIndex = 0;
if (!featureString.trim().equals("")) {
for (String token : featureString.split("\\s+")) {
if (token.indexOf('=') == -1) {
features.put(String.format("%s%d", prefix, denseFeatureIndex), -Float.parseFloat(token));
denseFeatureIndex++;
} else {
int splitPoint = token.indexOf('=');
features.put(token.substring(0, splitPoint),
Float.parseFloat(token.substring(splitPoint + 1)));
}
}
}
}
public Set<String> keySet() {
return features.keySet();
}
public int size() {
return features.size();
}
public FeatureVector clone() {
FeatureVector newOne = new FeatureVector();
for (String key : this.features.keySet())
newOne.put(key, this.features.get(key));
return newOne;
}
/**
* Subtracts the weights in the other feature vector from this one. Note that this is not set
* subtraction; keys found in the other FeatureVector but not in this one will be initialized with
* a value of 0.0f before subtraction.
*/
public void subtract(FeatureVector other) {
for (String key : other.keySet()) {
float oldValue = (features.containsKey(key)) ? features.get(key) : 0.0f;
features.put(key, oldValue - other.get(key));
}
}
/**
* Adds the weights in the other feature vector to this one. This is set union, with values shared
* between the two being summed.
*/
public void add(FeatureVector other) {
for (String key : other.keySet()) {
if (!features.containsKey(key))
features.put(key, other.get(key));
else
features.put(key, features.get(key) + other.get(key));
}
}
public boolean containsKey(final String feature) {
return features.containsKey(feature);
}
/**
* This method returns the weight of a feature if it exists and otherwise throws a runtime error.
* It is the duty of the programmer to check using the method containsKey if a feature with a
* certain name exists. Previously this method would return 0 if the key did not exists, but this
* lead to bugs in other parts of the code because Feature Names are often specified in capitals
* but then lowercased, but in using the get method the lowercase form is not used consistently.
* It is therefore good defensive programming to just throw an error when someone tries to get a
* feature that does not exist - this will automatically eliminate such hard to debug errors. This
* is what is now implemented.
*
* @param feature
* @return
*/
public float get(String feature) {
if (features.containsKey(feature))
return features.get(feature);
return 0.0f;
}
public void put(String feature, float value) {
features.put(feature, value);
}
public Map<String, Float> getMap() {
return features;
}
/**
* Computes the inner product between this feature vector and another one.
*/
public float innerProduct(FeatureVector other) {
float cost = 0.0f;
for (String key : features.keySet())
if (other.containsKey(key))
cost += features.get(key) * other.get(key);
return cost;
}
public void times(float value) {
for (String key : features.keySet())
features.put(key, features.get(key) * value);
}
/***
* Moses distinguishes sparse features as those containing an underscore, so we have to fake it
* to be compatible with their tuners.
*/
public String mosesString() {
String outputString = "";
HashSet<String> printed_keys = new HashSet<String>();
// First print all the dense feature names in order
for (String key: Decoder.dense_feature_names) {
float value = features.containsKey(key) ? features.get(key) : 0.0f;
outputString += String.format("%s=%.3f ", key.replaceAll("_", "-"), value);
printed_keys.add(key);
}
// Now print the sparse features
ArrayList<String> keys = new ArrayList<String>(features.keySet());
Collections.sort(keys);
for (String key: keys) {
if (! printed_keys.contains(key)) {
float value = features.get(key);
if (key.equals("OOVPenalty"))
// force moses to see it as sparse
key = "OOV_Penalty";
outputString += String.format("%s=%.3f ", key, value);
}
}
return outputString.trim();
}
/***
* Outputs a list of feature names. All dense features are printed. Feature names are printed
* in the order they were read in.
*/
@Override
public String toString() {
String outputString = "";
HashSet<String> printed_keys = new HashSet<String>();
// First print all the dense feature names in order
for (String key: Decoder.dense_feature_names) {
float value = features.containsKey(key) ? features.get(key) : 0.0f;
outputString += String.format("%s=%.3f ", key, value);
printed_keys.add(key);
}
// Now print the rest of the features
ArrayList<String> keys = new ArrayList<String>(features.keySet());
Collections.sort(keys);
for (String key: keys)
if (! printed_keys.contains(key))
outputString += String.format("%s=%.3f ", key, features.get(key));
return outputString.trim();
}
public static boolean isDense(String feature) {
return feature.startsWith("tm_") || feature.startsWith("lm_") || feature.equals("WordPenalty")
|| feature.equals("Distortion") || feature.equals("PhrasePenalty");
}
}