src/joshua/decoder/ff/FeatureVector.java - joshua - Git at Google

 package joshua.decoder.ff;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;

 /**
  * An implementation of a sparse feature vector, using for representing both weights and feature
  * values.
  *
  * This class is used to hold both the decoder weights and the feature values accumulated across
  * each edge. When features are read in upon decoder startup, they all start out as sparse features
  * and are stored in the hash table. After the feature functions have been loaded, the decoder
  * queries each of them for their sparse features via {@link registerDenseFeatures}. Those features
  * returned by each decoder are then *removed* from the sparse feature hash and placed in the dense
  * feature array. Therefore, when a feature registers a dense feature, it should take care to
  * query either {@link getDense()} or {@link getSparse} when asking for the feature values later on.
  *
  * @author Matt Post <post@cs.jhu.edu>
  */

 public class FeatureVector {
   /*
    * A list of the dense feature names. Increased via calls to registerDenseFeatures()
    */
   public static ArrayList<String> DENSE_FEATURE_NAMES = new ArrayList<String>();

   /*
    * The values of each of the dense features, defaulting to 0.
    */
   private ArrayList<Float> denseFeatures = null;

   /*
    * Value of sparse features.
    */
   private HashMap<String, Float> sparseFeatures;

   public FeatureVector() {
     sparseFeatures = new HashMap<String, Float>();
     denseFeatures = new ArrayList<Float>(DENSE_FEATURE_NAMES.size());
     for (int i = 0; i < denseFeatures.size(); i++)
       denseFeatures.set(i, 0.0f);
   }

   /**
    * This version of the constructor takes an uninitialized feature with potentially intermingled
    * labeled and unlabeled feature values, of the format:
    *
    * [feature1=]value [feature2=]value
    *
    * It produces a Feature Vector where all unlabeled features have been labeled by appending the
    * unlabeled feature index (starting at 0) to the defaultPrefix value.
    *
    * **IMPORTANT** The feature values are inverted, for historical reasons, which leads to a lot
    * of confusion. They have to be inverted here and when the score is actually computed. They
    * are inverted here (which is used to build the feature vector representation of a rule's dense
    * features) and in {@link BilingualRule::estimateRuleCost()}, where the rule's precomputable
    * (weighted) score is cached.
    *
    * @param featureString, the string of labeled and unlabeled features (probably straight from the
    *          grammar text file)
    * @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_")
    */
   public FeatureVector(String featureString, String prefix) {

 //    System.err.println(String.format("FEATURES_OF(%s, %s)", featureString, prefix));

     /*
      * Read through the features on this rule, adding them to the feature vector. Unlabeled features
      * are converted to a canonical form.
      *
      * Note that it's bad form to mix unlabeled features and the named feature index they are mapped
      * to, but we are being liberal in what we accept.
      *
      * IMPORTANT: Note that, for historical reasons, the sign is reversed on all *dense* scores.
      * This is the source of *no end* of confusion and should be done away with.
      */
     sparseFeatures = new HashMap<String, Float>();
     denseFeatures = new ArrayList<Float>(DENSE_FEATURE_NAMES.size());
     for (int i = 0; i < denseFeatures.size(); i++)
       denseFeatures.set(i, 0.0f);

     int denseFeatureIndex = 0;

     if (!featureString.trim().equals("")) {
       for (String token : featureString.split("\\s+")) {
         if (token.indexOf('=') == -1) {
           /*
            * If we encounter an unlabeled feature, it is the next dense feature
            */
           while (denseFeatures.size() <= denseFeatureIndex)
             denseFeatures.add(0.0f);
           denseFeatures.set(denseFeatureIndex, -Float.parseFloat(token));
           denseFeatureIndex++;
         } else {
           /*
            * Labeled features are of two types: if they start with the prefix, they are actually
            * dense feature in disguise; otherwise, they are proper sparse features.
            */
           int splitPoint = token.indexOf('=');
           if (token.startsWith(prefix)) {
 //            System.err.println(String.format("  PREFIX=%s '%s'.substring(%d,%d) = %s", prefix, token, prefix.length(), splitPoint,
 //                token.substring(prefix.length(), splitPoint)));
             int index = Integer.parseInt(token.substring(prefix.length(), splitPoint));
             while (denseFeatures.size() <= index)
               denseFeatures.add(0.0f);
             denseFeatures.set(index, 1.0f * Float.parseFloat(token.substring(splitPoint + 1)));
           } else {
             sparseFeatures.put(token.substring(0, splitPoint),
                 Float.parseFloat(token.substring(splitPoint + 1)));
           }
         }
       }
     }
   }

   /**
    * Register one or more dense features with the global weight vector. This assumes them global
    * IDs, and then returns the index of the first feature (from which the calling feature function
    * can infer them all). This *must* be called by every feature function wishing to register
    * dense features!
    *
    * @param names
    * @return
    */
   public void registerDenseFeatures(ArrayList<FeatureFunction> featureFunctions) {
     for (FeatureFunction feature: featureFunctions) {
       ArrayList<String> names = feature.reportDenseFeatures(denseFeatures.size());
       for (String name: names) {
         DENSE_FEATURE_NAMES.add(name);
         denseFeatures.add(getSparse(name));
         sparseFeatures.remove(name);
       }
     }
   }

   public ArrayList<Float> getDenseFeatures() {
     return denseFeatures;
   }

   public HashMap<String,Float> getSparseFeatures() {
     return sparseFeatures;
   }

   public Set<String> keySet() {
     return sparseFeatures.keySet();
   }

   public int size() {
     return sparseFeatures.size() + denseFeatures.size();
   }

   public FeatureVector clone() {
     FeatureVector newOne = new FeatureVector();
     for (String key : this.sparseFeatures.keySet())
       newOne.set(key, this.sparseFeatures.get(key));
     for (int i = 0; i < denseFeatures.size(); i++)
       newOne.set(i, denseFeatures.get(i));
     return newOne;
   }

   /**
    * Subtracts the weights in the other feature vector from this one. Note that this is not set
    * subtraction; keys found in the other FeatureVector but not in this one will be initialized with
    * a value of 0.0f before subtraction.
    */
   public void subtract(FeatureVector other) {
     for (int i = 0; i < denseFeatures.size(); i++)
       denseFeatures.set(i, denseFeatures.get(i) - other.getDense(i));

     for (String key : other.keySet()) {
       float oldValue = (sparseFeatures.containsKey(key)) ? sparseFeatures.get(key) : 0.0f;
       sparseFeatures.put(key, oldValue - other.getSparse(key));
     }
   }

   /**
    * Adds the weights in the other feature vector to this one. This is set union, with values shared
    * between the two being summed.
    */
   public void add(FeatureVector other) {
     while (denseFeatures.size() < other.denseFeatures.size())
       denseFeatures.add(0.0f);

     for (int i = 0; i < other.denseFeatures.size(); i++)
       increment(i, other.getDense(i));

     for (String key : other.keySet()) {
       if (!sparseFeatures.containsKey(key))
         sparseFeatures.put(key, other.getSparse(key));
       else
         sparseFeatures.put(key, sparseFeatures.get(key) + other.getSparse(key));
     }
   }

   /**
    * Return the weight of a sparse feature, indexed by its name.
    *
    * @param feature
    * @return the sparse feature's weight, or 0 if not found.
    */
   public float getSparse(String feature) {
     if (sparseFeatures.containsKey(feature))
       return sparseFeatures.get(feature);
     return 0.0f;
   }

   public boolean hasValue(String name) {
     return sparseFeatures.containsKey(name);
   }

   /**
    * Return the weight of a dense feature, indexed by its feature index.
    *
    * @param id
    * @return the dense feature's value, or 0 if not found.
    */
   public float getDense(int id) {
     if (id < denseFeatures.size())
       return denseFeatures.get(id);
     return 0.0f;
   }

   public void increment(String feature, float value) {
     sparseFeatures.put(feature, getSparse(feature) + value);
   }

   public void increment(int id, float value) {
     while (id >= denseFeatures.size())
       denseFeatures.add(0.0f);
     denseFeatures.set(id, getDense(id) + value);
   }

   public void set(String feature, float value) {
     sparseFeatures.put(feature, value);
   }

   public void set(int id, float value) {
     while (id >= denseFeatures.size())
       denseFeatures.add(0.0f);
     denseFeatures.set(id, value);
   }

   public Map<String, Float> getMap() {
     return sparseFeatures;
   }

   /**
    * Computes the inner product between this feature vector and another one.
    */
   public float innerProduct(FeatureVector other) {
     float cost = 0.0f;
     for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++)
       cost += getDense(i) * other.getDense(i);

     for (String key : sparseFeatures.keySet())
       cost += sparseFeatures.get(key) * other.getSparse(key);

     return cost;
   }

   public void times(float value) {
     for (String key : sparseFeatures.keySet())
       sparseFeatures.put(key, sparseFeatures.get(key) * value);
   }

   /***
    * Moses distinguishes sparse features as those containing an underscore, so we have to fake it
    * to be compatible with their tuners.
    */
   public String mosesString() {
     String outputString = "";

     HashSet<String> printed_keys = new HashSet<String>();

     // First print all the dense feature names in order
     for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
       outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), denseFeatures.get(i));
       printed_keys.add(DENSE_FEATURE_NAMES.get(i));
     }

     // Now print the sparse features
     ArrayList<String> keys = new ArrayList<String>(sparseFeatures.keySet());
     Collections.sort(keys);
     for (String key: keys) {
       if (! printed_keys.contains(key)) {
         float value = sparseFeatures.get(key);
         if (key.equals("OOVPenalty"))
           // force moses to see it as sparse
           key = "OOV_Penalty";
         outputString += String.format("%s=%.3f ", key, value);
       }
     }
     return outputString.trim();
   }

   /***
    * Outputs a list of feature names. All dense features are printed. Feature names are printed
    * in the order they were read in.
    */
   @Override
   public String toString() {
     String outputString = "";

     HashSet<String> printed_keys = new HashSet<String>();

     // First print all the dense feature names in order
     for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
       outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i), getDense(i));
       printed_keys.add(DENSE_FEATURE_NAMES.get(i));
     }

     // Now print the rest of the features
     ArrayList<String> keys = new ArrayList<String>(sparseFeatures.keySet());
     Collections.sort(keys);
     for (String key: keys)
       if (! printed_keys.contains(key))
         outputString += String.format("%s=%.3f ", key, sparseFeatures.get(key));

     return outputString.trim();
   }
 }
	package joshua.decoder.ff;

	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Map;
	import java.util.Set;

	/**
	* An implementation of a sparse feature vector, using for representing both weights and feature
	* values.
	*
	* This class is used to hold both the decoder weights and the feature values accumulated across
	* each edge. When features are read in upon decoder startup, they all start out as sparse features
	* and are stored in the hash table. After the feature functions have been loaded, the decoder
	* queries each of them for their sparse features via {@link registerDenseFeatures}. Those features
	* returned by each decoder are then removed from the sparse feature hash and placed in the dense
	* feature array. Therefore, when a feature registers a dense feature, it should take care to
	* query either {@link getDense()} or {@link getSparse} when asking for the feature values later on.
	*
	* @author Matt Post <post@cs.jhu.edu>
	*/

	public class FeatureVector {
	/*
	* A list of the dense feature names. Increased via calls to registerDenseFeatures()
	*/
	public static ArrayList<String> DENSE_FEATURE_NAMES = new ArrayList<String>();

	/*
	* The values of each of the dense features, defaulting to 0.
	*/
	private ArrayList<Float> denseFeatures = null;

	/*
	* Value of sparse features.
	*/
	private HashMap<String, Float> sparseFeatures;

	public FeatureVector() {
	sparseFeatures = new HashMap<String, Float>();
	denseFeatures = new ArrayList<Float>(DENSE_FEATURE_NAMES.size());
	for (int i = 0; i < denseFeatures.size(); i++)
	denseFeatures.set(i, 0.0f);
	}

	/**
	* This version of the constructor takes an uninitialized feature with potentially intermingled
	* labeled and unlabeled feature values, of the format:
	*
	* [feature1=]value [feature2=]value
	*
	* It produces a Feature Vector where all unlabeled features have been labeled by appending the
	* unlabeled feature index (starting at 0) to the defaultPrefix value.
	*
	* IMPORTANT The feature values are inverted, for historical reasons, which leads to a lot
	* of confusion. They have to be inverted here and when the score is actually computed. They
	* are inverted here (which is used to build the feature vector representation of a rule's dense
	* features) and in {@link BilingualRule::estimateRuleCost()}, where the rule's precomputable
	* (weighted) score is cached.
	*
	* @param featureString, the string of labeled and unlabeled features (probably straight from the
	* grammar text file)
	* @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_")
	*/
	public FeatureVector(String featureString, String prefix) {

	// System.err.println(String.format("FEATURES_OF(%s, %s)", featureString, prefix));

	/*
	* Read through the features on this rule, adding them to the feature vector. Unlabeled features
	* are converted to a canonical form.
	*
	* Note that it's bad form to mix unlabeled features and the named feature index they are mapped
	* to, but we are being liberal in what we accept.
	*
	* IMPORTANT: Note that, for historical reasons, the sign is reversed on all dense scores.
	* This is the source of no end of confusion and should be done away with.
	*/
	sparseFeatures = new HashMap<String, Float>();
	denseFeatures = new ArrayList<Float>(DENSE_FEATURE_NAMES.size());
	for (int i = 0; i < denseFeatures.size(); i++)
	denseFeatures.set(i, 0.0f);

	int denseFeatureIndex = 0;

	if (!featureString.trim().equals("")) {
	for (String token : featureString.split("\\s+")) {
	if (token.indexOf('=') == -1) {
	/*
	* If we encounter an unlabeled feature, it is the next dense feature
	*/
	while (denseFeatures.size() <= denseFeatureIndex)
	denseFeatures.add(0.0f);
	denseFeatures.set(denseFeatureIndex, -Float.parseFloat(token));
	denseFeatureIndex++;
	} else {
	/*
	* Labeled features are of two types: if they start with the prefix, they are actually
	* dense feature in disguise; otherwise, they are proper sparse features.
	*/
	int splitPoint = token.indexOf('=');
	if (token.startsWith(prefix)) {
	// System.err.println(String.format(" PREFIX=%s '%s'.substring(%d,%d) = %s", prefix, token, prefix.length(), splitPoint,
	// token.substring(prefix.length(), splitPoint)));
	int index = Integer.parseInt(token.substring(prefix.length(), splitPoint));
	while (denseFeatures.size() <= index)
	denseFeatures.add(0.0f);
	denseFeatures.set(index, 1.0f * Float.parseFloat(token.substring(splitPoint + 1)));
	} else {
	sparseFeatures.put(token.substring(0, splitPoint),
	Float.parseFloat(token.substring(splitPoint + 1)));
	}
	}
	}
	}
	}

	/**
	* Register one or more dense features with the global weight vector. This assumes them global
	* IDs, and then returns the index of the first feature (from which the calling feature function
	* can infer them all). This must be called by every feature function wishing to register
	* dense features!
	*
	* @param names
	* @return
	*/
	public void registerDenseFeatures(ArrayList<FeatureFunction> featureFunctions) {
	for (FeatureFunction feature: featureFunctions) {
	ArrayList<String> names = feature.reportDenseFeatures(denseFeatures.size());
	for (String name: names) {
	DENSE_FEATURE_NAMES.add(name);
	denseFeatures.add(getSparse(name));
	sparseFeatures.remove(name);
	}
	}
	}

	public ArrayList<Float> getDenseFeatures() {
	return denseFeatures;
	}

	public HashMap<String,Float> getSparseFeatures() {
	return sparseFeatures;
	}

	public Set<String> keySet() {
	return sparseFeatures.keySet();
	}

	public int size() {
	return sparseFeatures.size() + denseFeatures.size();
	}

	public FeatureVector clone() {
	FeatureVector newOne = new FeatureVector();
	for (String key : this.sparseFeatures.keySet())
	newOne.set(key, this.sparseFeatures.get(key));
	for (int i = 0; i < denseFeatures.size(); i++)
	newOne.set(i, denseFeatures.get(i));
	return newOne;
	}

	/**
	* Subtracts the weights in the other feature vector from this one. Note that this is not set
	* subtraction; keys found in the other FeatureVector but not in this one will be initialized with
	* a value of 0.0f before subtraction.
	*/
	public void subtract(FeatureVector other) {
	for (int i = 0; i < denseFeatures.size(); i++)
	denseFeatures.set(i, denseFeatures.get(i) - other.getDense(i));

	for (String key : other.keySet()) {
	float oldValue = (sparseFeatures.containsKey(key)) ? sparseFeatures.get(key) : 0.0f;
	sparseFeatures.put(key, oldValue - other.getSparse(key));
	}
	}

	/**
	* Adds the weights in the other feature vector to this one. This is set union, with values shared
	* between the two being summed.
	*/
	public void add(FeatureVector other) {
	while (denseFeatures.size() < other.denseFeatures.size())
	denseFeatures.add(0.0f);

	for (int i = 0; i < other.denseFeatures.size(); i++)
	increment(i, other.getDense(i));

	for (String key : other.keySet()) {
	if (!sparseFeatures.containsKey(key))
	sparseFeatures.put(key, other.getSparse(key));
	else
	sparseFeatures.put(key, sparseFeatures.get(key) + other.getSparse(key));
	}
	}

	/**
	* Return the weight of a sparse feature, indexed by its name.
	*
	* @param feature
	* @return the sparse feature's weight, or 0 if not found.
	*/
	public float getSparse(String feature) {
	if (sparseFeatures.containsKey(feature))
	return sparseFeatures.get(feature);
	return 0.0f;
	}

	public boolean hasValue(String name) {
	return sparseFeatures.containsKey(name);
	}

	/**
	* Return the weight of a dense feature, indexed by its feature index.
	*
	* @param id
	* @return the dense feature's value, or 0 if not found.
	*/
	public float getDense(int id) {
	if (id < denseFeatures.size())
	return denseFeatures.get(id);
	return 0.0f;
	}

	public void increment(String feature, float value) {
	sparseFeatures.put(feature, getSparse(feature) + value);
	}

	public void increment(int id, float value) {
	while (id >= denseFeatures.size())
	denseFeatures.add(0.0f);
	denseFeatures.set(id, getDense(id) + value);
	}

	public void set(String feature, float value) {
	sparseFeatures.put(feature, value);
	}

	public void set(int id, float value) {
	while (id >= denseFeatures.size())
	denseFeatures.add(0.0f);
	denseFeatures.set(id, value);
	}

	public Map<String, Float> getMap() {
	return sparseFeatures;
	}

	/**
	* Computes the inner product between this feature vector and another one.
	*/
	public float innerProduct(FeatureVector other) {
	float cost = 0.0f;
	for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++)
	cost += getDense(i) * other.getDense(i);

	for (String key : sparseFeatures.keySet())
	cost += sparseFeatures.get(key) * other.getSparse(key);

	return cost;
	}

	public void times(float value) {
	for (String key : sparseFeatures.keySet())
	sparseFeatures.put(key, sparseFeatures.get(key) * value);
	}

	/***
	* Moses distinguishes sparse features as those containing an underscore, so we have to fake it
	* to be compatible with their tuners.
	*/
	public String mosesString() {
	String outputString = "";

	HashSet<String> printed_keys = new HashSet<String>();

	// First print all the dense feature names in order
	for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
	outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), denseFeatures.get(i));
	printed_keys.add(DENSE_FEATURE_NAMES.get(i));
	}

	// Now print the sparse features
	ArrayList<String> keys = new ArrayList<String>(sparseFeatures.keySet());
	Collections.sort(keys);
	for (String key: keys) {
	if (! printed_keys.contains(key)) {
	float value = sparseFeatures.get(key);
	if (key.equals("OOVPenalty"))
	// force moses to see it as sparse
	key = "OOV_Penalty";
	outputString += String.format("%s=%.3f ", key, value);
	}
	}
	return outputString.trim();
	}

	/***
	* Outputs a list of feature names. All dense features are printed. Feature names are printed
	* in the order they were read in.
	*/
	@Override
	public String toString() {
	String outputString = "";

	HashSet<String> printed_keys = new HashSet<String>();

	// First print all the dense feature names in order
	for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
	outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i), getDense(i));
	printed_keys.add(DENSE_FEATURE_NAMES.get(i));
	}

	// Now print the rest of the features
	ArrayList<String> keys = new ArrayList<String>(sparseFeatures.keySet());
	Collections.sort(keys);
	for (String key: keys)
	if (! printed_keys.contains(key))
	outputString += String.format("%s=%.3f ", key, sparseFeatures.get(key));

	return outputString.trim();
	}
	}