src/joshua/decoder/ff/FeatureVector.java - joshua - Git at Google

 package joshua.decoder.ff;

 import joshua.decoder.Decoder;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;

 /**
  * An implementation of a sparse feature vector, using for representing both weights and feature
  * values.
  *
  * @author Matt Post <post@cs.jhu.edu>
  */

 public class FeatureVector {
   private HashMap<String, Float> features;

   public FeatureVector() {
     features = new HashMap<String, Float>();
   }

   public FeatureVector(String feature, float value) {
     features = new HashMap<String, Float>();
     features.put(feature, value);
   }

   /**
    * This version of the constructor takes an uninitialized feature with potentially intermingled
    * labeled and unlabeled feature values, of the format:
    *
    * [feature1=]value [feature2=]value
    *
    * It produces a Feature Vector where all unlabeled features have been labeled by appending the
    * unlabeled feature index (starting at 0) to the defaultPrefix value.
    *
    * **IMPORTANT** The feature values are inverted, for historical reasons, which leads to a lot
    * of confusion. They have to be inverted here and when the score is actually computed. They
    * are inverted here (which is used to build the feature vector representation of a rule's dense
    * features) and in {@link BilingualRule::estimateRuleCost()}, where the rule's precomputable
    * (weighted) score is cached.
    *
    * @param featureString, the string of labeled and unlabeled features (probably straight from the
    *          grammar text file)
    * @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_")
    */
   public FeatureVector(String featureString, String prefix) {

     /*
      * Read through the features on this rule, adding them to the feature vector. Unlabeled features
      * are converted to a canonical form.
      *
      * Note that it's bad form to mix unlabeled features and the named feature index they are mapped
      * to, but we are being liberal in what we accept.
      *
      * IMPORTANT: Note that, for historical reasons, the sign is reversed on all scores.
      * This is the source of *no end* of confusion and should be done away with.
      */
     features = new HashMap<String, Float>();
     int denseFeatureIndex = 0;

     if (!featureString.trim().equals("")) {
       for (String token : featureString.split("\\s+")) {
         if (token.indexOf('=') == -1) {
           features.put(String.format("%s%d", prefix, denseFeatureIndex), -Float.parseFloat(token));
           denseFeatureIndex++;
         } else {
           int splitPoint = token.indexOf('=');
           features.put(token.substring(0, splitPoint),
               Float.parseFloat(token.substring(splitPoint + 1)));
         }
       }
     }
   }

   public Set<String> keySet() {
     return features.keySet();
   }

   public int size() {
     return features.size();
   }

   public FeatureVector clone() {
     FeatureVector newOne = new FeatureVector();
     for (String key : this.features.keySet())
       newOne.put(key, this.features.get(key));
     return newOne;
   }

   /**
    * Subtracts the weights in the other feature vector from this one. Note that this is not set
    * subtraction; keys found in the other FeatureVector but not in this one will be initialized with
    * a value of 0.0f before subtraction.
    */
   public void subtract(FeatureVector other) {
     for (String key : other.keySet()) {
       float oldValue = (features.containsKey(key)) ? features.get(key) : 0.0f;
       features.put(key, oldValue - other.get(key));
     }
   }

   /**
    * Adds the weights in the other feature vector to this one. This is set union, with values shared
    * between the two being summed.
    */
   public void add(FeatureVector other) {
     for (String key : other.keySet()) {
       if (!features.containsKey(key))
         features.put(key, other.get(key));
       else
         features.put(key, features.get(key) + other.get(key));
     }
   }

   public boolean containsKey(final String feature) {
     return features.containsKey(feature);
   }

   /**
    * This method returns the weight of a feature if it exists and otherwise throws a runtime error.
    * It is the duty of the programmer to check using the method containsKey if a feature with a
    * certain name exists. Previously this method would return 0 if the key did not exists, but this
    * lead to bugs in other parts of the code because Feature Names are often specified in capitals
    * but then lowercased, but in using the get method the lowercase form is not used consistently.
    * It is therefore good defensive programming to just throw an error when someone tries to get a
    * feature that does not exist - this will automatically eliminate such hard to debug errors. This
    * is what is now implemented.
    *
    * @param feature
    * @return
    */
   public float get(String feature) {
     if (features.containsKey(feature))
       return features.get(feature);

     return 0.0f;
   }

   public void put(String feature, float value) {
     features.put(feature, value);
   }

   public Map<String, Float> getMap() {
     return features;
   }

   /**
    * Computes the inner product between this feature vector and another one.
    */
   public float innerProduct(FeatureVector other) {
     float cost = 0.0f;
     for (String key : features.keySet())
       if (other.containsKey(key))
         cost += features.get(key) * other.get(key);

     return cost;
   }

   public void times(float value) {
     for (String key : features.keySet())
       features.put(key, features.get(key) * value);
   }

   /***
    * Moses distinguishes sparse features as those containing an underscore, so we have to fake it
    * to be compatible with their tuners.
    */
   public String mosesString() {
     String outputString = "";

     HashSet<String> printed_keys = new HashSet<String>();

     // First print all the dense feature names in order
     for (String key: Decoder.dense_feature_names) {
       float value = features.containsKey(key) ? features.get(key) : 0.0f;
       outputString += String.format("%s=%.3f ", key.replaceAll("_", "-"), value);
       printed_keys.add(key);
     }

     // Now print the sparse features
     ArrayList<String> keys = new ArrayList<String>(features.keySet());
     Collections.sort(keys);
     for (String key: keys) {
       if (! printed_keys.contains(key)) {
         float value = features.get(key);
         if (key.equals("OOVPenalty"))
           // force moses to see it as sparse
           key = "OOV_Penalty";
         outputString += String.format("%s=%.3f ", key, value);
       }
     }
     return outputString.trim();
   }

   /***
    * Outputs a list of feature names. All dense features are printed. Feature names are printed
    * in the order they were read in.
    */
   @Override
   public String toString() {
     String outputString = "";

     HashSet<String> printed_keys = new HashSet<String>();

     // First print all the dense feature names in order
     for (String key: Decoder.dense_feature_names) {
       float value = features.containsKey(key) ? features.get(key) : 0.0f;
       outputString += String.format("%s=%.3f ", key, value);
       printed_keys.add(key);
     }

     // Now print the rest of the features
     ArrayList<String> keys = new ArrayList<String>(features.keySet());
     Collections.sort(keys);
     for (String key: keys)
       if (! printed_keys.contains(key))
         outputString += String.format("%s=%.3f ", key, features.get(key));

     return outputString.trim();
   }

   public static boolean isDense(String feature) {
     return feature.startsWith("tm_") || feature.startsWith("lm_") || feature.equals("WordPenalty")
         || feature.equals("Distortion") || feature.equals("PhrasePenalty");
   }
 }
	package joshua.decoder.ff;

	import joshua.decoder.Decoder;

	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Map;
	import java.util.Set;

	/**
	* An implementation of a sparse feature vector, using for representing both weights and feature
	* values.
	*
	* @author Matt Post <post@cs.jhu.edu>
	*/

	public class FeatureVector {
	private HashMap<String, Float> features;

	public FeatureVector() {
	features = new HashMap<String, Float>();
	}

	public FeatureVector(String feature, float value) {
	features = new HashMap<String, Float>();
	features.put(feature, value);
	}

	/**
	* This version of the constructor takes an uninitialized feature with potentially intermingled
	* labeled and unlabeled feature values, of the format:
	*
	* [feature1=]value [feature2=]value
	*
	* It produces a Feature Vector where all unlabeled features have been labeled by appending the
	* unlabeled feature index (starting at 0) to the defaultPrefix value.
	*
	* IMPORTANT The feature values are inverted, for historical reasons, which leads to a lot
	* of confusion. They have to be inverted here and when the score is actually computed. They
	* are inverted here (which is used to build the feature vector representation of a rule's dense
	* features) and in {@link BilingualRule::estimateRuleCost()}, where the rule's precomputable
	* (weighted) score is cached.
	*
	* @param featureString, the string of labeled and unlabeled features (probably straight from the
	* grammar text file)
	* @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_")
	*/
	public FeatureVector(String featureString, String prefix) {

	/*
	* Read through the features on this rule, adding them to the feature vector. Unlabeled features
	* are converted to a canonical form.
	*
	* Note that it's bad form to mix unlabeled features and the named feature index they are mapped
	* to, but we are being liberal in what we accept.
	*
	* IMPORTANT: Note that, for historical reasons, the sign is reversed on all scores.
	* This is the source of no end of confusion and should be done away with.
	*/
	features = new HashMap<String, Float>();
	int denseFeatureIndex = 0;

	if (!featureString.trim().equals("")) {
	for (String token : featureString.split("\\s+")) {
	if (token.indexOf('=') == -1) {
	features.put(String.format("%s%d", prefix, denseFeatureIndex), -Float.parseFloat(token));
	denseFeatureIndex++;
	} else {
	int splitPoint = token.indexOf('=');
	features.put(token.substring(0, splitPoint),
	Float.parseFloat(token.substring(splitPoint + 1)));
	}
	}
	}
	}

	public Set<String> keySet() {
	return features.keySet();
	}

	public int size() {
	return features.size();
	}

	public FeatureVector clone() {
	FeatureVector newOne = new FeatureVector();
	for (String key : this.features.keySet())
	newOne.put(key, this.features.get(key));
	return newOne;
	}

	/**
	* Subtracts the weights in the other feature vector from this one. Note that this is not set
	* subtraction; keys found in the other FeatureVector but not in this one will be initialized with
	* a value of 0.0f before subtraction.
	*/
	public void subtract(FeatureVector other) {
	for (String key : other.keySet()) {
	float oldValue = (features.containsKey(key)) ? features.get(key) : 0.0f;
	features.put(key, oldValue - other.get(key));
	}
	}

	/**
	* Adds the weights in the other feature vector to this one. This is set union, with values shared
	* between the two being summed.
	*/
	public void add(FeatureVector other) {
	for (String key : other.keySet()) {
	if (!features.containsKey(key))
	features.put(key, other.get(key));
	else
	features.put(key, features.get(key) + other.get(key));
	}
	}

	public boolean containsKey(final String feature) {
	return features.containsKey(feature);
	}

	/**
	* This method returns the weight of a feature if it exists and otherwise throws a runtime error.
	* It is the duty of the programmer to check using the method containsKey if a feature with a
	* certain name exists. Previously this method would return 0 if the key did not exists, but this
	* lead to bugs in other parts of the code because Feature Names are often specified in capitals
	* but then lowercased, but in using the get method the lowercase form is not used consistently.
	* It is therefore good defensive programming to just throw an error when someone tries to get a
	* feature that does not exist - this will automatically eliminate such hard to debug errors. This
	* is what is now implemented.
	*
	* @param feature
	* @return
	*/
	public float get(String feature) {
	if (features.containsKey(feature))
	return features.get(feature);

	return 0.0f;
	}

	public void put(String feature, float value) {
	features.put(feature, value);
	}

	public Map<String, Float> getMap() {
	return features;
	}

	/**
	* Computes the inner product between this feature vector and another one.
	*/
	public float innerProduct(FeatureVector other) {
	float cost = 0.0f;
	for (String key : features.keySet())
	if (other.containsKey(key))
	cost += features.get(key) * other.get(key);

	return cost;
	}

	public void times(float value) {
	for (String key : features.keySet())
	features.put(key, features.get(key) * value);
	}

	/***
	* Moses distinguishes sparse features as those containing an underscore, so we have to fake it
	* to be compatible with their tuners.
	*/
	public String mosesString() {
	String outputString = "";

	HashSet<String> printed_keys = new HashSet<String>();

	// First print all the dense feature names in order
	for (String key: Decoder.dense_feature_names) {
	float value = features.containsKey(key) ? features.get(key) : 0.0f;
	outputString += String.format("%s=%.3f ", key.replaceAll("_", "-"), value);
	printed_keys.add(key);
	}

	// Now print the sparse features
	ArrayList<String> keys = new ArrayList<String>(features.keySet());
	Collections.sort(keys);
	for (String key: keys) {
	if (! printed_keys.contains(key)) {
	float value = features.get(key);
	if (key.equals("OOVPenalty"))
	// force moses to see it as sparse
	key = "OOV_Penalty";
	outputString += String.format("%s=%.3f ", key, value);
	}
	}
	return outputString.trim();
	}

	/***
	* Outputs a list of feature names. All dense features are printed. Feature names are printed
	* in the order they were read in.
	*/
	@Override
	public String toString() {
	String outputString = "";

	HashSet<String> printed_keys = new HashSet<String>();

	// First print all the dense feature names in order
	for (String key: Decoder.dense_feature_names) {
	float value = features.containsKey(key) ? features.get(key) : 0.0f;
	outputString += String.format("%s=%.3f ", key, value);
	printed_keys.add(key);
	}

	// Now print the rest of the features
	ArrayList<String> keys = new ArrayList<String>(features.keySet());
	Collections.sort(keys);
	for (String key: keys)
	if (! printed_keys.contains(key))
	outputString += String.format("%s=%.3f ", key, features.get(key));

	return outputString.trim();
	}

	public static boolean isDense(String feature) {
	return feature.startsWith("tm_") \|\| feature.startsWith("lm_") \|\| feature.equals("WordPenalty")
	\|\| feature.equals("Distortion") \|\| feature.equals("PhrasePenalty");
	}
	}