// blob: 14d5ead98468707d0706c903a5fcdebf04f8a26a [file] [log] [blame]
package joshua.decoder.ff.tm;
import static joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
import static joshua.util.FormatUtils.cleanNonTerminal;
import static joshua.util.FormatUtils.isNonterminal;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
import joshua.corpus.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.util.io.LineReader;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
/**
 * Command-line tool that writes a glue grammar to standard output.
 *
 * <p>The set of nonterminal symbols is collected from the grammar given with
 * {@code --grammar}: if the path is a directory it is treated as a packed grammar and
 * its serialized vocabulary is scanned; otherwise it is read as a text grammar and the
 * first bracketed symbol of every line is taken as the rule's left-hand side. One set of
 * glue rules is then emitted per collected nonterminal, using the goal symbol given with
 * {@code --goal} (or the default taken from {@link JoshuaConfiguration}).
 */
public class CreateGlueGrammar {

  /** Nonterminal symbols collected from the input grammar, stored without brackets. */
  private final Set<String> nonTerminalSymbols = new HashSet<>();

  private static final Logger log = Logger.getLogger(CreateGlueGrammar.class.getName());

  @Option(name = "--grammar", aliases = {"-g"}, required = true, usage = "provide grammar to determine list of NonTerminal symbols.")
  private String grammarPath;

  @Option(name = "--goal", aliases = {"-goal"}, required = false, usage = "specify custom GOAL symbol. Default: 'GOAL'")
  private String goalSymbol = cleanNonTerminal(new JoshuaConfiguration().goal_symbol);

  /* Rule templates: %1$s is the goal symbol, %2$s is a collected nonterminal. */
  // [GOAL] ||| <s> ||| <s> ||| 0
  private static final String R_START = "[%1$s] ||| <s> ||| <s> ||| 0";
  // [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
  private static final String R_TWO = "[%1$s] ||| [%1$s,1] [%2$s,2] ||| [%1$s,1] [%2$s,2] ||| -1";
  // [GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
  private static final String R_END = "[%1$s] ||| [%1$s,1] </s> ||| [%1$s,1] </s> ||| 0";
  // [GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0
  private static final String R_TOP = "[%1$s] ||| <s> [%2$s,1] </s> ||| <s> [%2$s,1] </s> ||| 0";

  /**
   * Collects the nonterminal symbols of the configured grammar and prints the glue rules.
   *
   * @throws IOException if the grammar path does not exist or reading it fails
   */
  private void run() throws IOException {
    final File grammarFile = new File(grammarPath);
    if (!grammarFile.exists()) {
      throw new IOException("Grammar file doesn't exist: " + grammarPath);
    }

    if (grammarFile.isDirectory()) {
      collectFromPackedGrammar();
    } else {
      collectFromTextGrammar();
    }

    log.info(
        String.format("%d nonTerminal symbols read: %s",
            nonTerminalSymbols.size(),
            nonTerminalSymbols.toString()));

    writeGlueRules();
  }

  /**
   * Reads the serialized vocabulary of a packed grammar directory and collects every
   * vocabulary entry that {@code isNonterminal} accepts, after passing it through
   * {@code cleanNonTerminal}.
   */
  private void collectFromPackedGrammar() throws IOException {
    Vocabulary.read(new File(grammarPath + File.separator + VOCABULARY_FILENAME));
    for (int i = 0; i < Vocabulary.size(); ++i) {
      final String token = Vocabulary.word(i);
      if (isNonterminal(token)) {
        nonTerminalSymbols.add(cleanNonTerminal(token));
      }
    }
  }

  /**
   * Reads a text grammar line by line and collects the text between the first '[' and
   * the first ']' of each line as that rule's left-hand side. Lines without a non-empty,
   * well-ordered bracket pair are logged as malformed and skipped.
   */
  private void collectFromTextGrammar() throws IOException {
    final LineReader reader = new LineReader(grammarPath);
    try {
      while (reader.hasNext()) {
        final String line = reader.next();
        final int lhsStart = line.indexOf("[") + 1;
        final int lhsEnd = line.indexOf("]");
        // lhsStart < 1: no '[' at all. lhsEnd <= lhsStart: no ']', a ']' that comes
        // before the first '[' (substring would throw), or an empty "[]" symbol.
        if (lhsStart < 1 || lhsEnd <= lhsStart) {
          log.info(String.format("malformed rule: %s\n", line));
          continue;
        }
        nonTerminalSymbols.add(line.substring(lhsStart, lhsEnd));
      }
    } finally {
      // Release the underlying file handle even if reading fails part-way.
      reader.close();
    }
  }

  /** Writes the glue rules for all collected nonterminals to standard output. */
  private void writeGlueRules() {
    System.out.println(String.format(R_START, goalSymbol));
    for (String nt : nonTerminalSymbols) {
      System.out.println(String.format(R_TWO, goalSymbol, nt));
    }
    System.out.println(String.format(R_END, goalSymbol));
    for (String nt : nonTerminalSymbols) {
      System.out.println(String.format(R_TOP, goalSymbol, nt));
    }
  }

  /**
   * Parses the command line and runs the tool. On an argument error the problem is
   * logged, usage is printed to standard error, and the process exits with status 1.
   *
   * @param args command-line arguments ({@code --grammar} is required, {@code --goal} optional)
   * @throws IOException if the grammar cannot be read
   */
  public static void main(String[] args) throws IOException {
    final CreateGlueGrammar glueCreator = new CreateGlueGrammar();
    final CmdLineParser parser = new CmdLineParser(glueCreator);
    try {
      parser.parseArgument(args);
      glueCreator.run();
    } catch (CmdLineException e) {
      log.info(e.toString());
      parser.printUsage(System.err);
      System.exit(1);
    }
  }
}