| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.joshua.decoder.ff; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.Set; |
| |
| /** |
| * An implementation of a sparse feature vector, using for representing both weights and feature |
| * values. |
| * |
| * This class is used to hold both the decoder weights and the feature values accumulated across |
| * each edge. When features are read in upon decoder startup, they all start out as sparse features |
| * and are stored in the hash table. After the feature functions have been loaded, the decoder |
| * queries each of them for their sparse features via {@link registerDenseFeatures}. Those features |
| * returned by each decoder are then *removed* from the sparse feature hash and placed in the dense |
| * feature array. Therefore, when a feature registers a dense feature, it should take care to |
| * query either {@link org.apache.joshua.decoder.ff.FeatureVector#getDense(int)} or |
| * {@link org.apache.joshua.decoder.ff.FeatureVector#getSparse(String)} when asking for the feature |
| * values later on. |
| * |
| * @author Matt Post post@cs.jhu.edu |
| */ |
| |
| public class FeatureVector { |
| /* |
| * A list of the dense feature names. Increased via calls to registerDenseFeatures() |
| */ |
| public static final ArrayList<String> DENSE_FEATURE_NAMES = new ArrayList<>(); |
| |
| /* |
| * The values of each of the dense features, defaulting to 0. |
| */ |
| private ArrayList<Float> denseFeatures = null; |
| |
| /* |
| * Value of sparse features. |
| */ |
| private final HashMap<String, Float> sparseFeatures; |
| |
| public FeatureVector() { |
| sparseFeatures = new HashMap<>(); |
| denseFeatures = new ArrayList<>(DENSE_FEATURE_NAMES.size()); |
| for (int i = 0; i < denseFeatures.size(); i++) |
| denseFeatures.set(i, 0.0f); |
| } |
| |
| /** |
| * This version of the constructor takes an uninitialized feature with potentially intermingled |
| * labeled and unlabeled feature values, of the format: |
| * |
| * [feature1=]value [feature2=]value |
| * |
| * It produces a Feature Vector where all unlabeled features have been labeled by appending the |
| * unlabeled feature index (starting at 0) to the defaultPrefix value. |
| * |
| * **IMPORTANT** The feature values are inverted, for historical reasons, which leads to a lot |
| * of confusion. They have to be inverted here and when the score is actually computed. They |
| * are inverted here (which is used to build the feature vector representation of a rule's dense |
| * features) and in {@link org.apache.joshua.decoder.ff.tm.Rule#estimateRuleCost(java.util.List)} |
| * , where the rule's precomputable (weighted) score is cached. |
| * |
| * @param featureString, the string of labeled and unlabeled features (probably straight from the |
| * grammar text file) |
| * @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_") |
| */ |
| public FeatureVector(String featureString, String prefix) { |
| |
| // System.err.println(String.format("FEATURES_OF(%s, %s)", featureString, prefix)); |
| |
| /* |
| * Read through the features on this rule, adding them to the feature vector. Unlabeled features |
| * are converted to a canonical form. |
| * |
| * Note that it's bad form to mix unlabeled features and the named feature index they are mapped |
| * to, but we are being liberal in what we accept. |
| * |
| * IMPORTANT: Note that, for historical reasons, the sign is reversed on all *dense* scores. |
| * This is the source of *no end* of confusion and should be done away with. |
| */ |
| this(); |
| |
| int denseFeatureIndex = 0; |
| |
| if (!featureString.trim().equals("")) { |
| for (String token : featureString.split("\\s+")) { |
| if (token.indexOf('=') == -1) { |
| /* |
| * If we encounter an unlabeled feature, it is the next dense feature |
| */ |
| while (denseFeatures.size() <= denseFeatureIndex) |
| denseFeatures.add(0.0f); |
| denseFeatures.set(denseFeatureIndex, -Float.parseFloat(token)); |
| denseFeatureIndex++; |
| } else { |
| /* |
| * Labeled features are of two types: if they start with the prefix, they are actually |
| * dense feature in disguise; otherwise, they are proper sparse features. |
| */ |
| int splitPoint = token.indexOf('='); |
| if (token.startsWith(prefix)) { |
| // System.err.println(String.format(" PREFIX=%s '%s'.substring(%d,%d) = %s", prefix, token, prefix.length(), splitPoint, |
| // token.substring(prefix.length(), splitPoint))); |
| int index = Integer.parseInt(token.substring(prefix.length(), splitPoint)); |
| while (denseFeatures.size() <= index) |
| denseFeatures.add(0.0f); |
| denseFeatures.set(index, 1.0f * Float.parseFloat(token.substring(splitPoint + 1))); |
| } else { |
| sparseFeatures.put(token.substring(0, splitPoint), |
| Float.parseFloat(token.substring(splitPoint + 1))); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Register one or more dense features with the global weight vector. This assumes them global |
| * IDs, and then returns the index of the first feature (from which the calling feature function |
| * can infer them all). This *must* be called by every feature function wishing to register |
| * dense features! |
| * |
| * @param featureFunctions {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s |
| */ |
| public void registerDenseFeatures(ArrayList<FeatureFunction> featureFunctions) { |
| for (FeatureFunction feature: featureFunctions) { |
| ArrayList<String> names = feature.reportDenseFeatures(denseFeatures.size()); |
| for (String name: names) { |
| DENSE_FEATURE_NAMES.add(name); |
| denseFeatures.add(getSparse(name)); |
| sparseFeatures.remove(name); |
| } |
| } |
| } |
| |
| public ArrayList<Float> getDenseFeatures() { |
| return denseFeatures; |
| } |
| |
| public HashMap<String,Float> getSparseFeatures() { |
| return sparseFeatures; |
| } |
| |
| public Set<String> keySet() { |
| return sparseFeatures.keySet(); |
| } |
| |
| public int size() { |
| return sparseFeatures.size() + denseFeatures.size(); |
| } |
| |
| public FeatureVector clone() { |
| FeatureVector newOne = new FeatureVector(); |
| for (String key : this.sparseFeatures.keySet()) |
| newOne.set(key, this.sparseFeatures.get(key)); |
| for (int i = 0; i < denseFeatures.size(); i++) |
| newOne.set(i, getDense(i)); |
| return newOne; |
| } |
| |
| /** |
| * Subtracts the weights in the other feature vector from this one. Note that this is not set |
| * subtraction; keys found in the other FeatureVector but not in this one will be initialized with |
| * a value of 0.0f before subtraction. |
| * |
| * @param other another {@link org.apache.joshua.decoder.ff.FeatureVector} from which to subtract its score |
| */ |
| public void subtract(FeatureVector other) { |
| for (int i = 0; i < denseFeatures.size(); i++) |
| denseFeatures.set(i, getDense(i) - other.getDense(i)); |
| |
| for (String key : other.keySet()) { |
| float oldValue = (sparseFeatures.containsKey(key)) ? sparseFeatures.get(key) : 0.0f; |
| sparseFeatures.put(key, oldValue - other.getSparse(key)); |
| } |
| } |
| |
| /** |
| * Adds the weights in the other feature vector to this one. This is set union, with values shared |
| * between the two being summed. |
| * |
| * @param other another {@link org.apache.joshua.decoder.ff.FeatureVector} from which to add its score |
| */ |
| public void add(FeatureVector other) { |
| while (denseFeatures.size() < other.denseFeatures.size()) |
| denseFeatures.add(0.0f); |
| |
| for (int i = 0; i < other.denseFeatures.size(); i++) |
| increment(i, other.getDense(i)); |
| |
| for (String key : other.keySet()) { |
| if (!sparseFeatures.containsKey(key)) |
| sparseFeatures.put(key, other.getSparse(key)); |
| else |
| sparseFeatures.put(key, sparseFeatures.get(key) + other.getSparse(key)); |
| } |
| } |
| |
| /** |
| * Return the weight of a feature by name, after checking to determine if it is sparse or dense. |
| * |
| * @param feature String name of some feature |
| * @return the feature's weight |
| */ |
| public float getWeight(String feature) { |
| for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { |
| if (DENSE_FEATURE_NAMES.get(i).equals(feature)) { |
| return getDense(i); |
| } |
| } |
| return getSparse(feature); |
| } |
| |
| /** |
| * Return the weight of a sparse feature, indexed by its name. |
| * |
| * @param feature String name of some feature |
| * @return the sparse feature's weight, or 0 if not found. |
| */ |
| public float getSparse(String feature) { |
| if (sparseFeatures.containsKey(feature)) |
| return sparseFeatures.get(feature); |
| return 0.0f; |
| } |
| |
| public boolean hasValue(String name) { |
| return sparseFeatures.containsKey(name); |
| } |
| |
| /** |
| * Return the weight of a dense feature, indexed by its feature index, or 0.0f, if the feature |
| * is not found. In other words, this is a safe way to query the dense feature vector. |
| * |
| * @param id int representing of some dense feature |
| * @return the dense feature's value, or 0 if not found. |
| */ |
| public float getDense(int id) { |
| if (id < denseFeatures.size()) |
| return denseFeatures.get(id); |
| return 0.0f; |
| } |
| |
| public void increment(String feature, float value) { |
| sparseFeatures.put(feature, getSparse(feature) + value); |
| } |
| |
| public void increment(int id, float value) { |
| while (id >= denseFeatures.size()) |
| denseFeatures.add(0.0f); |
| denseFeatures.set(id, getDense(id) + value); |
| } |
| |
| /** |
| * Set the value of a feature. We need to first determine whether the feature is a dense or |
| * sparse one, then set accordingly. |
| * |
| * @param feature String name of some feature |
| * @param value float value to set to the featue with the associated name |
| */ |
| public void set(String feature, float value) { |
| for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { |
| if (DENSE_FEATURE_NAMES.get(i).equals(feature)) { |
| denseFeatures.set(i, value); |
| return; |
| } |
| } |
| // No dense feature was found; assume it's sparse |
| sparseFeatures.put(feature, value); |
| } |
| |
| public void set(int id, float value) { |
| while (id >= denseFeatures.size()) |
| denseFeatures.add(0.0f); |
| denseFeatures.set(id, value); |
| } |
| |
| public Map<String, Float> getMap() { |
| Map<String, Float> allFeatures = new HashMap<>(sparseFeatures.size() + denseFeatures.size()); |
| allFeatures.putAll(sparseFeatures); |
| for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { |
| allFeatures.put(DENSE_FEATURE_NAMES.get(i), getDense(i)); |
| } |
| return allFeatures; |
| } |
| |
| /** |
| * Computes the inner product between this feature vector and another one. |
| * |
| * @param other a {@link org.apache.joshua.decoder.ff.FeatureVector} with which to compute the inner product |
| * @return float value representing the computation |
| */ |
| public float innerProduct(FeatureVector other) { |
| float cost = 0.0f; |
| for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) |
| cost += getDense(i) * other.getDense(i); |
| |
| for (String key : sparseFeatures.keySet()) |
| cost += sparseFeatures.get(key) * other.getSparse(key); |
| |
| return cost; |
| } |
| |
| public void times(float value) { |
| for (String key : sparseFeatures.keySet()) |
| sparseFeatures.put(key, sparseFeatures.get(key) * value); |
| } |
| |
| /*** |
| * Moses distinguishes sparse features as those containing an underscore, so we have to fake it |
| * to be compatible with their tuners. |
| * |
| * @return trimmed Moses output string |
| */ |
| public String mosesString() { |
| StringBuilder outputString = new StringBuilder(); |
| |
| HashSet<String> printed_keys = new HashSet<>(); |
| |
| // First print all the dense feature names in order |
| for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { |
| outputString.append(String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replace("_", "-"), getDense(i))); |
| printed_keys.add(DENSE_FEATURE_NAMES.get(i)); |
| } |
| |
| // Now print the sparse features |
| ArrayList<String> keys = new ArrayList<>(sparseFeatures.keySet()); |
| Collections.sort(keys); |
| for (String key: keys) { |
| if (! printed_keys.contains(key)) { |
| float value = sparseFeatures.get(key); |
| if (key.equals("OOVPenalty")) |
| // force moses to see it as sparse |
| key = "OOV_Penalty"; |
| outputString.append(String.format("%s=%.3f ", key, value)); |
| } |
| } |
| return outputString.toString().trim(); |
| } |
| |
| /*** |
| * Outputs a list of feature names. All dense features are printed. Feature names are printed |
| * in the order they were read in. |
| */ |
| @Override |
| public String toString() { |
| StringBuilder outputString = new StringBuilder(); |
| |
| HashSet<String> printed_keys = new HashSet<>(); |
| |
| // First print all the dense feature names in order |
| for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) { |
| outputString.append(String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i), getDense(i))); |
| printed_keys.add(DENSE_FEATURE_NAMES.get(i)); |
| } |
| |
| // Now print the rest of the features |
| ArrayList<String> keys = new ArrayList<>(sparseFeatures.keySet()); |
| Collections.sort(keys); |
| keys.stream().filter(key -> !printed_keys.contains(key)).forEach( |
| key -> outputString.append(String.format("%s=%.3f ", key, sparseFeatures.get(key)))); |
| |
| return outputString.toString().trim(); |
| } |
| } |