// blob: 173100bcbf69201d5af2fbc5bd3f5724f5dcbf29 [file] [log] [blame]
/*
* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
* Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this library;
* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
* 02111-1307 USA
*/
/*
* This file uses code from the edu.umd.clip.mt.subsample.Subsampler class from the University of
* Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
* released under the terms of the Apache License 2.0, but with special permission for the Joshua
* Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
* requires no special permission since it is compatible with Apache License 2.0
*/
package joshua.subsample;
import java.io.IOException;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
/**
 * This class defines a callback closure to allow "overriding" the main function in subclasses of
 * {@link Subsampler}, without duplicating code. For all subclasses, CLI <code>Options</code> should
 * be members of the class (so they're visible to <code>runSubsampler</code> as well as
 * <code>getCliOptions</code>), the <code>getCliOptions</code> method should be overridden to add
 * the additional options (via <code>super</code> to keep the old options), and the
 * <code>runSubsampler</code> method should be overridden to do the primary work for main. The
 * <code>runMain</code> method ties everything together and should not need modification. Due to the
 * one-use nature of subclasses of <code>SubsamplerCLI</code>, they generally should be implemented
 * as anonymous local classes.
 *
 * <p>
 * NOTE(review): <code>runMain</code> discards the <code>CommandLine</code> returned by the parser
 * and reads every value back from the <code>Option</code> fields below. This presumably relies on
 * the parser storing parsed values in the very <code>Option</code> instances registered in
 * <code>Options</code>; confirm that this holds for the commons-cli version on the classpath.
 *
 * @author wren ng thornton <wren@users.sourceforge.net>
 * @version $LastChangedDate$
 */
// The options below are built by chaining OptionBuilder's static methods on the builder instance
// each call returns. The compiler flags every chained call as a static access through an
// instance, hence the class-wide suppression.
@SuppressWarnings("static-access")
public class SubsamplerCLI {

  /** Ratio handed to <code>runSubsampler</code> when <code>-ratio</code> is not supplied. */
  private static final float DEFAULT_RATIO = 0.8f;

  /** Value passed as the <code>maxN</code> argument of <code>runSubsampler</code>. */
  private static final int MAX_N = 12;

  /** Value passed as the <code>targetCount</code> argument of <code>runSubsampler</code>. */
  private static final int TARGET_COUNT = 20;

  /** <code>-training</code> (required, one argument): list of training file basenames. */
  protected final Option ot = OptionBuilder.withArgName("listfile").hasArg()
      .withDescription("A file containing a list of training file basenames (what to sample from)")
      .isRequired().create("training");

  /**
   * <code>-test</code> (required, one or more arguments): the test file(s). This is the only
   * multi-valued option; it is read with <code>getValues()</code> in <code>runMain</code>.
   */
  protected final Option otest = OptionBuilder.withArgName("file").hasArgs()
      .withDescription("The test file (what to sample for)").isRequired().create("test");

  /**
   * <code>-output</code> (required, one argument): basename of the output corpus. Declared with
   * <code>hasArg()</code> because only a single value is ever read, via <code>getValue()</code>
   * in <code>runSubsampler</code>.
   */
  protected final Option ooutput = OptionBuilder.withArgName("basename").hasArg()
      .withDescription("File basename for output training corpus").isRequired().create("output");

  /** <code>-f</code> (required, one argument): foreign language extension. */
  protected final Option of = OptionBuilder.withArgName("lang").hasArg()
      .withDescription("Foreign language extension").isRequired().create("f");

  /** <code>-e</code> (required, one argument): native language extension. */
  protected final Option oe = OptionBuilder.withArgName("lang").hasArg()
      .withDescription("Native language extension").isRequired().create("e");

  /** <code>-fpath</code> (optional, one argument): directory of the foreign language files. */
  protected final Option ofpath = OptionBuilder.withArgName("path").hasArg()
      .withDescription("Directory containing foreign language files").create("fpath");

  /** <code>-epath</code> (optional, one argument): directory of the native language files. */
  protected final Option oepath = OptionBuilder.withArgName("path").hasArg()
      .withDescription("Directory containing native language files").create("epath");

  /** <code>-ratio</code> (optional, one argument): target F/E ratio, parsed as a float. */
  protected final Option oratio = OptionBuilder.withArgName("ratio").hasArg()
      .withDescription("Target F/E ratio").create("ratio");

  /**
   * Return all Options. The HelpFormatter will print them in sorted order, so it doesn't matter
   * when we add them. Subclasses should override this method by adding more options.
   *
   * @return a fresh <code>Options</code> holding every option field of this class
   */
  public Options getCliOptions() {
    return new Options().addOption(ot).addOption(otest).addOption(of).addOption(oe)
        .addOption(ofpath).addOption(oepath).addOption(oratio).addOption(ooutput);
  }

  /**
   * This method should be overridden to return the class used in runSubsampler. The name is used
   * as the command name in the usage message printed by <code>runMain</code>.
   *
   * @return the fully qualified name of {@link Subsampler}
   */
  public String getClassName() {
    return Subsampler.class.getName();
  }

  /**
   * Callback to run the subsampler. This function needs access to the variables holding each
   * Option, thus all this closure nonsense.
   *
   * @param testFiles the values given to <code>-test</code>
   * @param maxN forwarded unchanged to the {@link Subsampler} constructor
   * @param targetCount forwarded unchanged to the {@link Subsampler} constructor
   * @param ratio the target F/E ratio forwarded to <code>subsample</code>
   * @throws IOException propagated from the subsampler
   */
  public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
      throws IOException {
    new Subsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio, of.getValue(),
        oe.getValue(), ofpath.getValue(), oepath.getValue(), ooutput.getValue());
  }

  /**
   * Non-static version of main so that we can define anonymous local classes to override or extend
   * the above. Parses <code>args</code>, resolves the ratio (falling back to the default when
   * <code>-ratio</code> is absent) and invokes <code>runSubsampler</code>. Any command line
   * problem prints a usage message and terminates the JVM with exit status 1; any failure inside
   * the subsampler prints its stack trace and terminates with exit status 1.
   *
   * @param args the raw command line arguments
   */
  public void runMain(String[] args) {
    Options o = this.getCliOptions();

    try {
      // The returned CommandLine is intentionally unused; values are read from the Option fields.
      new GnuParser().parse(o, args);
    } catch (ParseException pe) {
      // The message from pe is ugly, so we omit it.
      System.err.println("Error parsing command line");
      new HelpFormatter().printHelp(this.getClassName(), o);
      System.exit(1);
      return;
    }

    float ratio = DEFAULT_RATIO;
    String ratioArg = this.oratio.getValue();
    if (ratioArg != null) {
      try {
        ratio = Float.parseFloat(ratioArg);
      } catch (NumberFormatException nfe) {
        // A malformed ratio is a usage error, so report it like one instead of dumping a trace.
        System.err.println("Invalid value for -ratio (expected a number): " + ratioArg);
        new HelpFormatter().printHelp(this.getClassName(), o);
        System.exit(1);
        return;
      }
    }

    try {
      this.runSubsampler(this.otest.getValues(), MAX_N, TARGET_COUNT, ratio);
    } catch (Exception e) {
      // Top-level boundary: report whatever the subsampler threw and signal failure.
      e.printStackTrace();
      System.exit(1);
    }
  }
}