DictionaryAnnotator/src/main/java/org/apache/uima/annotator/dict_annot/dictionary/impl/DictionaryCreator.java - uima-addons - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.uima.annotator.dict_annot.dictionary.impl;

 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
 import java.util.HashMap;
 import java.util.List;
 import java.util.StringTokenizer;

 import org.apache.uima.UIMAFramework;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.internal.util.CommandLineParser;
 import org.apache.uima.pear.tools.PackageBrowser;
 import org.apache.uima.pear.tools.PackageInstaller;
 import org.apache.uima.resource.ResourceSpecifier;
 import org.apache.uima.util.FileUtils;
 import org.apache.uima.util.XMLInputSource;

 /**
  * The DictionaryCreator creates a dictionary XML file based on the given input
  * parameters. The input file must contain one dictionary entry for each line.
  * The line is tokenized into multi words either with the specified tokenizer
  * PEAR file or with the separator character.
  *
  * The dictionary output XML file (UTF-8) must be updated after the creation
  * with some metadata information (the generated type name is a placeholder).
  */
 public class DictionaryCreator {

    // input file command line parameter
    private static final String INPUT_FILE_PARAM = "-input";

    // output file command line parameter
    private static final String OUTPUT_FILE_PARAM = "-output";

    // input file encoding command line parameter
    private static final String INPUT_FILE_ENCODING_PARAM = "-encoding";

    // input file language command line parameter
    private static final String INPUT_FILE_LANGUAGE_PARAM = "-lang";

    // tokenizer command line parameter
    private static final String TOKENIZER_PARAM = "-tokenizer";

    // tokenizer token type command line parameter
    private static final String TOKEN_TYPE_PARAM = "-tokenType";

    // token separator character command line parameter
    private static final String SEPARATOR_CHAR_PARAM = "-separator";

    // default separator character
    private static final String SEPARATOR_CHAR = " ";

    // default output separator character
    private static final String OUTPUT_SEPARATOR_CHAR = "|";

    // XML special characters mapped to the entity that replaces them
    private static final HashMap<Character, String> entities = new HashMap<Character, String>();

    static {
       entities.put('<', "&lt;");
       entities.put('>', "&gt;");
       entities.put('&', "&amp;");
       entities.put('\"', "&quot;");
       entities.put('\'', "&apos;");
    }

    /**
     * creates the CommandLine parser used to parse the DictionaryCreator command
     * line.
     *
     * @return returns the CommandLineParser for the DictionaryCreator
     */
    private static final CommandLineParser createCmdLineParser() {
       CommandLineParser parser = new CommandLineParser();
       parser.addParameter(INPUT_FILE_PARAM, true);
       parser.addParameter(INPUT_FILE_LANGUAGE_PARAM, true);
       parser.addParameter(OUTPUT_FILE_PARAM, true);
       parser.addParameter(INPUT_FILE_ENCODING_PARAM, true);
       parser.addParameter(TOKENIZER_PARAM, true);
       parser.addParameter(TOKEN_TYPE_PARAM, true);
       parser.addParameter(SEPARATOR_CHAR_PARAM, true);
       return parser;
    }

    /**
     * prints the DictionaryCreator usage to the command line
     */
    private static final void printUsage() {
       System.out
             .println("Usage: java org.apache.uima.annotator.dict_annot.dictionary.impl.DictionaryCreator -input <InputFile> -encoding <InputFileEncoding> -output <OutputFile> [-tokenizer <TokenizerPear> -tokenType <tokenType>] [-separator <separatorChar>] ");
       System.out.println("Additional optional parameters:");
       System.out.println("  -lang <dictionaryLanguage>");
    }

    /**
     * checks the command line syntax for errors. Every problem found is
     * reported on System.err, so the user sees all of them at once.
     *
     * @param clp
     *           current CommandLineParser
     *
     * @return returns false in case of command line syntax errors, otherwise
     *         true
     */
    private static final boolean checkCmdLineSyntax(CommandLineParser clp) {
       boolean error = false;
       // check mandatory command line parameters
       if (!clp.isInArgsList(INPUT_FILE_PARAM)) {
          System.err.println("InputFile parameter -input is missing");
          error = true;
       }
       if (!clp.isInArgsList(INPUT_FILE_ENCODING_PARAM)) {
          System.err
                .println("InputFile encoding parameter -encoding is missing");
          error = true;
       }
       if (!clp.isInArgsList(OUTPUT_FILE_PARAM)) {
          System.err.println("OutputFile parameter -output is missing");
          error = true;
       }
       // a tokenizer is only usable together with its token type
       if (clp.isInArgsList(TOKENIZER_PARAM)
             && !clp.isInArgsList(TOKEN_TYPE_PARAM)) {
          System.err
                .println("If a tokenizer is used, the -tokenType parameter must be specified");
          error = true;
       }
       return !error;
    }

    /**
     * main method for DictionaryCreator. Exits with status -1 if the command
     * line is invalid or if the dictionary could not be created.
     *
     * @param args
     *           command line arguments
     */
    public static void main(String[] args) {

       // create command line parser and parse the command line
       CommandLineParser clp = createCmdLineParser();
       try {
          clp.parseCmdLine(args);
       } catch (Exception e) {
          // do not continue with an unparsed command line, every argument
          // would be null
          System.err.println("Error parsing command line: " + e.getMessage());
          printUsage();
          System.exit(-1);
       }

       // check command line syntax
       if (!checkCmdLineSyntax(clp)) {
          printUsage();
          System.exit(-1);
       }

       // get command line arguments
       String inputFile = clp.getParamArgument(INPUT_FILE_PARAM);
       String language = clp.getParamArgument(INPUT_FILE_LANGUAGE_PARAM);
       String encoding = clp.getParamArgument(INPUT_FILE_ENCODING_PARAM);
       String outputFile = clp.getParamArgument(OUTPUT_FILE_PARAM);
       String tokenizerFile = clp.getParamArgument(TOKENIZER_PARAM);
       String tokenTypeStr = clp.getParamArgument(TOKEN_TYPE_PARAM);
       String separatorChar = clp.getParamArgument(SEPARATOR_CHAR_PARAM);

       // create dictionary
       try {
          DictionaryCreator.createDictionary(inputFile, encoding, outputFile,
                language, tokenizerFile, tokenTypeStr, separatorChar);

          System.out.println("The dictionary was sucessfully created at: "
                + outputFile);
       } catch (Exception ex) {
          ex.printStackTrace();
          // report the failure to the calling process
          System.exit(-1);
       }
    }

    /**
     * Creates the dictionary XML file from a text file that contains one
     * dictionary entry per line.
     *
     * Each line is split into tokens, either by running the tokenizer PEAR on
     * it or, if no tokenizer is given, with a StringTokenizer that uses the
     * characters of the separator as delimiters. The tokens are joined with the
     * output separator, the joined entry is trimmed and written XML escaped as
     * one entry key.
     *
     * @param inputFile
     *           path of the input file, must be readable
     * @param encoding
     *           character encoding of the input file
     * @param outputFile
     *           path of the dictionary XML file to write (UTF-8)
     * @param language
     *           dictionary language, written as languageId and set as document
     *           language for the tokenizer; may be null
     * @param tokenizerFile
     *           path of the tokenizer PEAR file; null to tokenize with the
     *           separator
     * @param tokenTypeStr
     *           name of the token annotation type created by the tokenizer,
     *           required if a tokenizer is used
     * @param separatorChar
     *           separator used to split the input lines and to join the tokens
     *           in the output; if null the input is split at blanks and the
     *           tokens are joined with "|"
     *
     * @return returns true if the dictionary was created
     *
     * @throws Exception
     *            if an argument is missing, a file cannot be read or written
     *            or the tokenizer cannot be created or executed
     */
    public static boolean createDictionary(String inputFile, String encoding,
          String outputFile, String language, String tokenizerFile,
          String tokenTypeStr, String separatorChar) throws Exception {

       // fail with a clear message instead of a NullPointerException later on
       if (inputFile == null) {
          throw new IllegalArgumentException("Error: Input file not specified");
       }
       if (encoding == null) {
          throw new IllegalArgumentException(
                "Error: Input file encoding not specified");
       }
       if (outputFile == null) {
          throw new IllegalArgumentException("Error: Output file not specified");
       }

       // check input file command line argument
       File inFile = new File(inputFile);
       if (!inFile.canRead()) {
          throw new Exception("Error: Input file " + inputFile
                + " cannot be read!");
       }

       // check separator char command line argument
       String outputSeparatorChar = OUTPUT_SEPARATOR_CHAR;
       if (separatorChar == null) {
          // use default separator character
          separatorChar = SEPARATOR_CHAR;
       } else {
          // if set, use specified separator char also as output separator
          outputSeparatorChar = separatorChar;
       }

       AnalysisEngine ae = null;
       Type tokenType = null;
       CAS cas = null;
       File tempDir = null;
       BufferedReader reader = null;
       BufferedWriter writer = null;

       try {
          if (tokenizerFile != null) {
             // if a tokenizer is specified, check if the file can be read
             File pearFile = new File(tokenizerFile);
             if (!pearFile.canRead()) {
                throw new Exception("Error: Tokenizer file " + tokenizerFile
                      + " cannot be read!");
             }
             if (tokenTypeStr == null) {
                throw new Exception("Error: Tokenizer tokenType not specified");
             }
             try {
                // create temp directory to install PEAR
                tempDir = new File(System.getProperty("java.io.tmpdir"),
                      "~tokenizer_temp_install");
                tempDir.deleteOnExit();
                tempDir.mkdir();

                // Install PEAR package
                PackageBrowser instPear = PackageInstaller.installPackage(
                      tempDir, pearFile, true);

                // Create analysis engine from the installed PEAR package
                XMLInputSource in = new XMLInputSource(instPear
                      .getComponentPearDescPath());
                try {
                   ResourceSpecifier specifier = UIMAFramework.getXMLParser()
                         .parseResourceSpecifier(in);
                   ae = UIMAFramework.produceAnalysisEngine(specifier);
                } finally {
                   // release the descriptor file, it lives in the temp dir
                   in.close();
                }

                // create CAS and initialize tokenType
                cas = ae.newCAS();
                tokenType = cas.getTypeSystem().getType(tokenTypeStr);
                if (tokenType == null) {
                   throw new Exception("Token type " + tokenTypeStr
                         + " is not defined in the tokenizer type system");
                }
             } catch (Exception ex) {
                throw new Exception("Error creating tokenizer: "
                      + ex.getMessage(), ex);
             }
          }

          // initialize input and output files
          reader = openReader(inputFile, encoding);
          writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outputFile), "UTF-8"));

          writeLeadIn(writer, outputSeparatorChar, language);

          // add one dictionary entry for each input line
          String line = reader.readLine();
          while (line != null) {
             String entry;
             if (ae != null) { // use tokenizer
                entry = tokenizeWithEngine(ae, cas, tokenType, line, language,
                      outputSeparatorChar);
             } else { // use separator char
                entry = tokenizeWithSeparator(line, separatorChar,
                      outputSeparatorChar);
             }

             // write dictionary entry to XML
             writer.write("<entry>\n");
             writer.write("<key>" + replaceXMLEntities(entry.trim())
                   + "</key>\n");
             writer.write("</entry>\n");

             line = reader.readLine();
          }

          // write dictionary XML lead out
          writer.write("</entries>\n");
          writer.write("</typeCollection>\n");
          writer.write("</dictionary>\n");

          // close here and not only in the finally block, a failing flush
          // must be reported to the caller
          writer.close();
       } finally {
          closeQuietly(reader);
          closeQuietly(writer);
          if (ae != null) {
             // release the engine before its installation dir is deleted
             ae.destroy();
          }
          deleteTempDir(tempDir);
       }

       return true;
    }

    /**
     * opens the input file with the given encoding. The file stream is closed
     * again if the reader cannot be created, e.g. for an unknown encoding.
     */
    private static BufferedReader openReader(String inputFile, String encoding)
          throws java.io.IOException {
       FileInputStream stream = new FileInputStream(inputFile);
       try {
          return new BufferedReader(new InputStreamReader(stream, encoding));
       } catch (java.io.IOException ex) {
          closeQuietly(stream);
          throw ex;
       }
    }

    /**
     * writes the dictionary XML content that precedes the entries. Separator
     * and language are XML escaped since they are provided by the user.
     */
    private static void writeLeadIn(BufferedWriter writer,
          String outputSeparatorChar, String language)
          throws java.io.IOException {
       writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
       writer
             .write("<dictionary xmlns=\"http://incubator.apache.org/uima\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"dictionary.xsd\">\n");
       writer.write("<typeCollection>\n");
       writer
             .write("<dictionaryMetaData caseNormalization=\"true\" multiWordEntries=\"true\" multiWordSeparator=\""
                   + replaceXMLEntities(outputSeparatorChar) + "\"/>\n");
       if (language != null) {
          writer.write("<languageId>" + replaceXMLEntities(language)
                + "</languageId>\n");
       }
       writer.write("<typeDescription>\n");
       writer.write("<typeName> ADD DICTIONARY OUTPUT TYPE HERE</typeName>\n");
       writer.write("</typeDescription>\n");
       writer.write("<entries>\n");
    }

    /**
     * tokenizes the line with the analysis engine and returns the covered text
     * of all token annotations joined with the output separator. The CAS is
     * reset afterwards so that it can be used for the next line.
     */
    private static String tokenizeWithEngine(AnalysisEngine ae, CAS cas,
          Type tokenType, String line, String language,
          String outputSeparatorChar) throws Exception {
       cas.setDocumentText(line);
       if (language != null) {
          cas.setDocumentLanguage(language);
       }
       ae.process(cas);

       // the separator is written between the tokens only, so there is no
       // trailing separator that has to be removed afterwards
       StringBuilder entry = new StringBuilder();
       boolean first = true;
       FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
       while (it.hasNext()) {
          if (!first) {
             entry.append(outputSeparatorChar);
          }
          entry.append(((AnnotationFS) it.next()).getCoveredText());
          first = false;
       }
       cas.reset();
       return entry.toString();
    }

    /**
     * splits the line at the separator characters and returns the tokens
     * joined with the output separator.
     */
    private static String tokenizeWithSeparator(String line,
          String separatorChar, String outputSeparatorChar) {
       StringBuilder entry = new StringBuilder();
       StringTokenizer tokenizer = new StringTokenizer(line, separatorChar);
       boolean first = true;
       while (tokenizer.hasMoreTokens()) {
          if (!first) {
             entry.append(outputSeparatorChar);
          }
          entry.append(tokenizer.nextToken());
          first = false;
       }
       return entry.toString();
    }

    /**
     * tries to delete the PEAR temp directory. Files that cannot be deleted
     * now are registered for deletion when the JVM exits.
     */
    private static void deleteTempDir(File tempDir) {
       if (tempDir == null) {
          return;
       }
       FileUtils.deleteRecursive(tempDir);
       if (tempDir.exists()) {
          List files = FileUtils.getFiles(tempDir, true);
          if (files != null) {
             for (int i = 0; i < files.size(); i++) {
                ((File) files.get(i)).deleteOnExit();
             }
          }
       }
    }

    /**
     * closes the resource and ignores errors, used for cleanup only
     */
    private static void closeQuietly(java.io.Closeable resource) {
       if (resource != null) {
          try {
             resource.close();
          } catch (java.io.IOException ignored) {
             // best effort, an earlier error is more important than this one
          }
       }
    }

    /**
     * returns text with replaced XML entities
     *
     * @param text
     *           input text
     *
     * @return XML valid text
     */
    private static String replaceXMLEntities(String text) {
       StringBuilder buffer = new StringBuilder(text.length());
       for (int i = 0; i < text.length(); i++) {
          char character = text.charAt(i);
          String entity = DictionaryCreator.entities.get(character);
          if (entity != null) {
             buffer.append(entity);
          } else {
             buffer.append(character);
          }
       }
       return buffer.toString();
    }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.uima.annotator.dict_annot.dictionary.impl;

	import java.io.BufferedReader;
	import java.io.BufferedWriter;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.InputStreamReader;
	import java.io.OutputStreamWriter;
	import java.util.HashMap;
	import java.util.List;
	import java.util.StringTokenizer;

	import org.apache.uima.UIMAFramework;
	import org.apache.uima.analysis_engine.AnalysisEngine;
	import org.apache.uima.cas.CAS;
	import org.apache.uima.cas.FSIterator;
	import org.apache.uima.cas.Type;
	import org.apache.uima.cas.text.AnnotationFS;
	import org.apache.uima.internal.util.CommandLineParser;
	import org.apache.uima.pear.tools.PackageBrowser;
	import org.apache.uima.pear.tools.PackageInstaller;
	import org.apache.uima.resource.ResourceSpecifier;
	import org.apache.uima.util.FileUtils;
	import org.apache.uima.util.XMLInputSource;

	/**
	* The DicrionaryCreator creates an dictionary XML file based on the given input
	* parameters. The input file must contain one dictionary entry for each line.
	* The line is tokenized into multi words either with the specified tokenizer
	* PEAR file or with the separator character.
	*
	* The dictionary output XML file (UTF-8) must be updated after the creation
	* with some metadata information.
	*/
	public class DictionaryCreator {

	// input file command line parameter
	private static final String INPUT_FILE_PARAM = "-input";

	// output file command line parameter
	private static final String OUTPUT_FILE_PARAM = "-output";

	// input file encoding command line parameter
	private static final String INPUT_FILE_ENCODING_PARAM = "-encoding";

	// input file language command line parameter
	private static final String INPUT_FILE_LANGUAGE_PARAM = "-lang";

	// tokenizer command line parameter
	private static final String TOKENIZER_PARAM = "-tokenizer";

	// tokenizer token type command line parameter
	private static final String TOKEN_TYPE_PARAM = "-tokenType";

	// token separator character command line parameter
	private static final String SEPARATOR_CHAR_PARAM = "-separator";

	// default separator character
	private static final String SEPARATOR_CHAR = " ";

	// default output separator character
	private static final String OUTPUT_SEPARATOR_CHAR = "\|";

	private static HashMap<Character, String> entities;

	static {
	entities = new HashMap<Character, String>(5);
	entities.put('<', "<");
	entities.put('>', ">");
	entities.put('&', "&");
	entities.put('\"', """);
	entities.put('\'', "'");
	}

	/**
	* creates the CommandLine parser used to parse the DictionaryCreator command
	* line.
	*
	* @return returns the CommandLineParser for the DictionaryCreator
	*/
	private static final CommandLineParser createCmdLineParser() {
	CommandLineParser parser = new CommandLineParser();
	parser.addParameter(INPUT_FILE_PARAM, true);
	parser.addParameter(INPUT_FILE_LANGUAGE_PARAM, true);
	parser.addParameter(OUTPUT_FILE_PARAM, true);
	parser.addParameter(INPUT_FILE_ENCODING_PARAM, true);
	parser.addParameter(TOKENIZER_PARAM, true);
	parser.addParameter(TOKEN_TYPE_PARAM, true);
	parser.addParameter(SEPARATOR_CHAR_PARAM, true);
	return parser;
	}

	/**
	* prints the DictionaryCreator usage to the command line
	*/
	private static final void printUsage() {
	System.out
	.println("Usage: java org.apache.uima.annotator.dict_annot.dictionary.impl.DictionaryCreator -input <InputFile> -encoding <InputFileEncoding> -output <OutputFile> [-tokenizer <TokenizerPear> -tokenType <tokenType>] [-separator <separatorChar>] ");
	System.out.println("Additional optional parameters:");
	System.out.println(" -lang <dictionaryLanguage>");

	}

	/**
	* checks the command line syntax for errors
	*
	* @param clp
	* current CommandLineParser
	*
	* @return returns false in case of command line syntax errors, otherwise
	* true
	*/
	private static final boolean checkCmdLineSyntax(CommandLineParser clp) {
	boolean error = false;
	// check mandatory command line parameters
	if (!clp.isInArgsList(INPUT_FILE_PARAM)) {
	System.err.println("InputFile parameter -input is missing");
	error = true;
	}
	if (!clp.isInArgsList(INPUT_FILE_ENCODING_PARAM)) {
	System.err
	.println("InputFile encoding parameter -encoding is missing");
	error = true;
	}
	if (!clp.isInArgsList(OUTPUT_FILE_PARAM)) {
	System.err.println("OutputFile parameter -output is missing");
	error = true;
	}
	if (clp.isInArgsList(TOKENIZER_PARAM)) {
	if (!clp.isInArgsList(TOKEN_TYPE_PARAM)) {
	System.err
	.println("If a tokenizer is used, the -tokenType parameter must be specified");
	error = true;
	}
	}

	// if there was an error, return false
	if (error == true) {
	return false;
	}

	return true;
	}

	/**
	* main method for DictionaryCreator
	*
	* @param args
	* command line arguments
	*/
	public static void main(String[] args) {

	// create command line parser
	CommandLineParser clp = createCmdLineParser();
	try {
	// parse command line
	clp.parseCmdLine(args);
	// check command line syntax
	if (!checkCmdLineSyntax(clp)) {
	printUsage();
	System.exit(-1);
	}
	} catch (Exception e) {
	System.err.println("Error parsing command line: " + e.getMessage());
	}

	// get command line arguments
	String inputFile = clp.getParamArgument(INPUT_FILE_PARAM);
	String language = clp.getParamArgument(INPUT_FILE_LANGUAGE_PARAM);
	String encoding = clp.getParamArgument(INPUT_FILE_ENCODING_PARAM);
	String outputFile = clp.getParamArgument(OUTPUT_FILE_PARAM);
	String tokenizerFile = clp.getParamArgument(TOKENIZER_PARAM);
	String tokenTypeStr = clp.getParamArgument(TOKEN_TYPE_PARAM);
	String separatorChar = clp.getParamArgument(SEPARATOR_CHAR_PARAM);

	// create dictionary
	try {
	DictionaryCreator.createDictionary(inputFile, encoding, outputFile,
	language, tokenizerFile, tokenTypeStr, separatorChar);

	System.out.println("The dictionary was sucessfully created at: "
	+ outputFile);
	} catch (Exception ex) {
	ex.printStackTrace();
	}

	}

	public static boolean createDictionary(String inputFile, String encoding,
	String outputFile, String language, String tokenizerFile,
	String tokenTypeStr, String separatorChar) throws Exception {

	String outputSeparatorChar = OUTPUT_SEPARATOR_CHAR;

	// check input file command line argument
	File inFile = new File(inputFile);
	if (!inFile.canRead()) {
	throw new Exception("Error: Input file " + inputFile
	+ " cannot be read!");
	}

	// check tokenizer pear command line arguments
	AnalysisEngine ae = null;
	Type tokenType = null;
	CAS cas = null;
	File tempDir = null;

	if (tokenizerFile != null) {
	// if a tokenizer is specified, check if the file can be read
	File pearFile = new File(tokenizerFile);
	if (!pearFile.canRead()) {
	throw new Exception("Error: Tokenizer file " + tokenizerFile
	+ " cannot be read!");
	}
	if (tokenTypeStr == null) {
	throw new Exception("Error: Tokenizer tokenType not specified");
	}
	try {
	// create temp directory to install PEAR
	tempDir = new File(System.getProperty("java.io.tmpdir"), "~tokenizer_temp_install");
	tempDir.deleteOnExit();
	tempDir.mkdir();

	// Install PEAR package
	PackageBrowser instPear = PackageInstaller.installPackage(tempDir,
	pearFile, true);

	// Create analysis engine from the installed PEAR package
	XMLInputSource in = new XMLInputSource(instPear
	.getComponentPearDescPath());
	ResourceSpecifier specifier = UIMAFramework.getXMLParser()
	.parseResourceSpecifier(in);
	ae = UIMAFramework.produceAnalysisEngine(specifier);

	// create CAS and initialize tokenType
	cas = ae.newCAS();
	tokenType = cas.getTypeSystem().getType(tokenTypeStr);

	} catch (Exception ex) {
	throw new Exception("Error creating tokenizer: " + ex.getMessage(),
	ex);
	}
	}

	// check separator char command line argument
	if (separatorChar == null) {
	// use default separator character
	separatorChar = SEPARATOR_CHAR;
	} else {
	// if set, use specified separator char also as output separator
	outputSeparatorChar = separatorChar;
	}

	// initialize input and output files
	BufferedReader reader = new BufferedReader(new InputStreamReader(
	new FileInputStream(inputFile), encoding));
	BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
	new FileOutputStream(outputFile), "UTF-8"));

	// write dictionary XML lead in
	writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
	writer
	.write("<dictionary xmlns=\"http://incubator.apache.org/uima\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"dictionary.xsd\">\n");
	writer.write("<typeCollection>\n");
	writer
	.write("<dictionaryMetaData caseNormalization=\"true\" multiWordEntries=\"true\" multiWordSeparator=\""
	+ outputSeparatorChar + "\"/>\n");
	if (language != null) {
	writer.write("<languageId>" + language + "</languageId>\n");
	}
	writer.write("<typeDescription>\n");
	writer.write("<typeName> ADD DICTIONARY OUTPUT TYPE HERE</typeName>\n");
	writer.write("</typeDescription>\n");
	writer.write("<entries>\n");

	// start adding dictionary entries
	String line = reader.readLine();
	while (line != null) {

	// multi workd string buffer - contains the tokens for each entry
	StringBuffer multiWordString = new StringBuffer();

	// tokenize entry line
	if (ae != null) { // use tokenizer
	cas.setDocumentText(line);
	if (language != null) {
	cas.setDocumentLanguage(language);
	}
	// tokenize line
	ae.process(cas);

	// read results
	FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
	while (it.hasNext()) {
	multiWordString.append(((AnnotationFS) it.next())
	.getCoveredText());
	multiWordString.append(outputSeparatorChar);
	}
	cas.reset();

	} else { // use separator char

	StringTokenizer tokenizer = new StringTokenizer(line, separatorChar);
	while (tokenizer.hasMoreTokens()) {
	multiWordString.append(tokenizer.nextToken());
	multiWordString.append(outputSeparatorChar);
	}
	}

	// trim string and remove separator char at the end
	String multiWordTokenString = multiWordString.toString().trim();
	if (multiWordTokenString.endsWith(outputSeparatorChar)) {
	int separatorLength = outputSeparatorChar.length();
	int length = multiWordTokenString.length();
	multiWordTokenString = multiWordTokenString.substring(0, length
	- separatorLength);
	}

	// replace XML entities
	multiWordTokenString = replaceXMLEntities(multiWordTokenString);

	// write dictionary entry to XML
	writer.write("<entry>\n");
	writer.write("<key>" + multiWordTokenString + "</key>\n");
	writer.write("</entry>\n");

	// get next line
	line = reader.readLine();
	} // all dictionary lines are processed
	reader.close();

	// write dictionary XML lead out
	writer.write("</entries>\n");
	writer.write("</typeCollection>\n");
	writer.write("</dictionary>\n");
	writer.close();

	// try to delete PEAR temp dir
	if (tempDir != null) {
	FileUtils.deleteRecursive(tempDir);
	if (tempDir != null) {
	List files = FileUtils.getFiles(tempDir, true);
	if (files != null) {
	for (int i = 0; i < files.size(); i++) {
	((File) files.get(i)).deleteOnExit();
	}
	}
	}
	}

	return true;
	}

	/**
	* returns text with replaced XML entities
	*
	* @param text
	* input text
	*
	* @return XML valid text
	*/
	private static String replaceXMLEntities(String text) {
	StringBuffer buffer = new StringBuffer();
	for (int i = 0; i < text.length(); i++) {
	char character = text.charAt(i);
	if (DictionaryCreator.entities.containsKey(character)) {
	buffer.append(DictionaryCreator.entities.get(character));
	} else {
	buffer.append(character);
	}
	}
	return buffer.toString();
	}
	}