opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.cmdline.namefind;

 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 import opennlp.tools.cmdline.AbstractTrainerTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
 import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool.TrainerToolParams;
 import opennlp.tools.cmdline.params.TrainingToolParams;
 import opennlp.tools.namefind.BilouCodec;
 import opennlp.tools.namefind.BioCodec;
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.namefind.NameSampleTypeFilter;
 import opennlp.tools.namefind.TokenNameFinderFactory;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.SequenceCodec;
 import opennlp.tools.util.featuregen.GeneratorFactory;
 import opennlp.tools.util.model.ArtifactSerializer;
 import opennlp.tools.util.model.ModelUtil;

 import org.w3c.dom.Element;

 public final class TokenNameFinderTrainerTool
     extends AbstractTrainerTool<NameSample, TrainerToolParams> {

   interface TrainerToolParams extends TrainingParams, TrainingToolParams {

   }

   public TokenNameFinderTrainerTool() {
     super(NameSample.class, TrainerToolParams.class);
   }

   public String getShortDescription() {
     return "trainer for the learnable name finder";
   }

   static byte[] openFeatureGeneratorBytes(String featureGenDescriptorFile) {
     if(featureGenDescriptorFile != null) {
       return openFeatureGeneratorBytes(new File(featureGenDescriptorFile));
     }
     return null;
   }

   static byte[] openFeatureGeneratorBytes(File featureGenDescriptorFile) {
     byte featureGeneratorBytes[] = null;
     // load descriptor file into memory
     if (featureGenDescriptorFile != null) {

       try (InputStream bytesIn = CmdLineUtil.openInFile(featureGenDescriptorFile)) {
         featureGeneratorBytes = ModelUtil.read(bytesIn);
       } catch (IOException e) {
         throw new TerminateToolException(-1, "IO error while reading training data or indexing data: "
             + e.getMessage(), e);
       }
     }
     return featureGeneratorBytes;
   }

   /**
    * Load the resources, such as dictionaries, by reading the feature xml descriptor
    * and looking into the directory passed as argument.
    * @param resourcePath the directory in which the resources are to be found
    * @param featureGenDescriptor the feature xml descriptor
    * @return a map consisting of the file name of the resource and its corresponding Object
    */
   public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) {
     Map<String, Object> resources = new HashMap<String, Object>();

     if (resourcePath != null) {

       Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
           .createArtifactSerializers();
       List<Element> elements = new ArrayList<Element>();
       ArtifactSerializer serializer = null;


       // TODO: If there is descriptor file, it should be consulted too
       if (featureGenDescriptor != null) {

         try (InputStream xmlDescriptorIn = CmdLineUtil.openInFile(featureGenDescriptor)) {
           artifactSerializers.putAll(GeneratorFactory.extractCustomArtifactSerializerMappings(xmlDescriptorIn));
         } catch (IOException e) {
           // TODO: Improve error handling!
           e.printStackTrace();
         }

         try (InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor)) {
           elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
         } catch (IOException e) {
           e.printStackTrace();
         }
       }

       File resourceFiles[] = resourcePath.listFiles();

       for (File resourceFile : resourceFiles) {
         String resourceName = resourceFile.getName();
         //gettting the serializer key from the element tag name
         //if the element contains a dict attribute
         for (Element xmlElement : elements) {
           String dictName = xmlElement.getAttribute("dict");
           if (dictName != null && dictName.equals(resourceName)) {
             serializer = artifactSerializers.get(xmlElement.getTagName());
           }
         }
         // TODO: Do different? For now just ignore ....
         if (serializer == null)
           continue;

         try (InputStream resourceIn = CmdLineUtil.openInFile(resourceFile)) {
           resources.put(resourceName, serializer.create(resourceIn));
         } catch (InvalidFormatException e) {
           // TODO: Fix exception handling
           e.printStackTrace();
         } catch (IOException e) {
           // TODO: Fix exception handling
           e.printStackTrace();
         }
       }
     }
     return resources;
   }

   /**
    * Calls a loadResources method above to load any external resource required for training.
    * @param resourceDirectory the directory where the resources are to be found
    * @param featureGeneratorDescriptor the xml feature generator
    * @return a map containing the file name of the resource and its mapped Object
    */
   static Map<String, Object> loadResources(String resourceDirectory, File featureGeneratorDescriptor) {

     if (resourceDirectory != null) {
       File resourcePath = new File(resourceDirectory);

       return loadResources(resourcePath, featureGeneratorDescriptor);
     }

     return new HashMap<String, Object>();
   }

   public void run(String format, String[] args) {
     super.run(format, args);

     mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);
     if(mlParams == null) {
       mlParams = ModelUtil.createDefaultTrainingParameters();
     }

     File modelOutFile = params.getModel();

     byte featureGeneratorBytes[] = openFeatureGeneratorBytes(params.getFeaturegen());


     // TODO: Support Custom resources:
     //       Must be loaded into memory, or written to tmp file until descriptor
     //       is loaded which defines parses when model is loaded

     Map<String, Object> resources = loadResources(params.getResources(), params.getFeaturegen());

     CmdLineUtil.checkOutputFile("name finder model", modelOutFile);

     if (params.getNameTypes() != null) {
       String nameTypes[] = params.getNameTypes().split(",");
       sampleStream = new NameSampleTypeFilter(nameTypes, sampleStream);
     }

     String sequenceCodecImplName = params.getSequenceCodec();

     if ("BIO".equals(sequenceCodecImplName)) {
       sequenceCodecImplName = BioCodec.class.getName();
     }
     else if ("BILOU".equals(sequenceCodecImplName)) {
       sequenceCodecImplName = BilouCodec.class.getName();
     }

     SequenceCodec<String> sequenceCodec = TokenNameFinderFactory.instantiateSequenceCodec(sequenceCodecImplName);

     TokenNameFinderFactory nameFinderFactory = null;
     try {
       nameFinderFactory = TokenNameFinderFactory.create(params.getFactory(),
           featureGeneratorBytes, resources, sequenceCodec);
     } catch (InvalidFormatException e) {
       throw new TerminateToolException(-1, e.getMessage(), e);
     }

     NameSampleCountersStream counters = new NameSampleCountersStream(sampleStream);
     sampleStream = counters;

     TokenNameFinderModel model;
     try {
       model = opennlp.tools.namefind.NameFinderME.train(
           params.getLang(), params.getType(), sampleStream, mlParams,
           nameFinderFactory);
     }
     catch (IOException e) {
       throw new TerminateToolException(-1, "IO error while reading training data or indexing data: "
           + e.getMessage(), e);
     }
     finally {
       try {
         sampleStream.close();
       } catch (IOException e) {
         // sorry that this can fail
       }
     }

     System.out.println();
     counters.printSummary();
     System.out.println();

     CmdLineUtil.writeModel("name finder", modelOutFile, model);

   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.cmdline.namefind;

	import java.io.File;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;

	import opennlp.tools.cmdline.AbstractTrainerTool;
	import opennlp.tools.cmdline.CmdLineUtil;
	import opennlp.tools.cmdline.TerminateToolException;
	import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool.TrainerToolParams;
	import opennlp.tools.cmdline.params.TrainingToolParams;
	import opennlp.tools.namefind.BilouCodec;
	import opennlp.tools.namefind.BioCodec;
	import opennlp.tools.namefind.NameSample;
	import opennlp.tools.namefind.NameSampleTypeFilter;
	import opennlp.tools.namefind.TokenNameFinderFactory;
	import opennlp.tools.namefind.TokenNameFinderModel;
	import opennlp.tools.util.InvalidFormatException;
	import opennlp.tools.util.SequenceCodec;
	import opennlp.tools.util.featuregen.GeneratorFactory;
	import opennlp.tools.util.model.ArtifactSerializer;
	import opennlp.tools.util.model.ModelUtil;

	import org.w3c.dom.Element;

	public final class TokenNameFinderTrainerTool
	extends AbstractTrainerTool<NameSample, TrainerToolParams> {

	interface TrainerToolParams extends TrainingParams, TrainingToolParams {

	}

	public TokenNameFinderTrainerTool() {
	super(NameSample.class, TrainerToolParams.class);
	}

	public String getShortDescription() {
	return "trainer for the learnable name finder";
	}

	static byte[] openFeatureGeneratorBytes(String featureGenDescriptorFile) {
	if(featureGenDescriptorFile != null) {
	return openFeatureGeneratorBytes(new File(featureGenDescriptorFile));
	}
	return null;
	}

	static byte[] openFeatureGeneratorBytes(File featureGenDescriptorFile) {
	byte featureGeneratorBytes[] = null;
	// load descriptor file into memory
	if (featureGenDescriptorFile != null) {

	try (InputStream bytesIn = CmdLineUtil.openInFile(featureGenDescriptorFile)) {
	featureGeneratorBytes = ModelUtil.read(bytesIn);
	} catch (IOException e) {
	throw new TerminateToolException(-1, "IO error while reading training data or indexing data: "
	+ e.getMessage(), e);
	}
	}
	return featureGeneratorBytes;
	}

	/**
	* Load the resources, such as dictionaries, by reading the feature xml descriptor
	* and looking into the directory passed as argument.
	* @param resourcePath the directory in which the resources are to be found
	* @param featureGenDescriptor the feature xml descriptor
	* @return a map consisting of the file name of the resource and its corresponding Object
	*/
	public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) {
	Map<String, Object> resources = new HashMap<String, Object>();

	if (resourcePath != null) {

	Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
	.createArtifactSerializers();
	List<Element> elements = new ArrayList<Element>();
	ArtifactSerializer serializer = null;


	// TODO: If there is descriptor file, it should be consulted too
	if (featureGenDescriptor != null) {

	try (InputStream xmlDescriptorIn = CmdLineUtil.openInFile(featureGenDescriptor)) {
	artifactSerializers.putAll(GeneratorFactory.extractCustomArtifactSerializerMappings(xmlDescriptorIn));
	} catch (IOException e) {
	// TODO: Improve error handling!
	e.printStackTrace();
	}

	try (InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor)) {
	elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
	} catch (IOException e) {
	e.printStackTrace();
	}
	}

	File resourceFiles[] = resourcePath.listFiles();

	for (File resourceFile : resourceFiles) {
	String resourceName = resourceFile.getName();
	//gettting the serializer key from the element tag name
	//if the element contains a dict attribute
	for (Element xmlElement : elements) {
	String dictName = xmlElement.getAttribute("dict");
	if (dictName != null && dictName.equals(resourceName)) {
	serializer = artifactSerializers.get(xmlElement.getTagName());
	}
	}
	// TODO: Do different? For now just ignore ....
	if (serializer == null)
	continue;

	try (InputStream resourceIn = CmdLineUtil.openInFile(resourceFile)) {
	resources.put(resourceName, serializer.create(resourceIn));
	} catch (InvalidFormatException e) {
	// TODO: Fix exception handling
	e.printStackTrace();
	} catch (IOException e) {
	// TODO: Fix exception handling
	e.printStackTrace();
	}
	}
	}
	return resources;
	}

	/**
	* Calls a loadResources method above to load any external resource required for training.
	* @param resourceDirectory the directory where the resources are to be found
	* @param featureGeneratorDescriptor the xml feature generator
	* @return a map containing the file name of the resource and its mapped Object
	*/
	static Map<String, Object> loadResources(String resourceDirectory, File featureGeneratorDescriptor) {

	if (resourceDirectory != null) {
	File resourcePath = new File(resourceDirectory);

	return loadResources(resourcePath, featureGeneratorDescriptor);
	}

	return new HashMap<String, Object>();
	}

	public void run(String format, String[] args) {
	super.run(format, args);

	mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);
	if(mlParams == null) {
	mlParams = ModelUtil.createDefaultTrainingParameters();
	}

	File modelOutFile = params.getModel();

	byte featureGeneratorBytes[] = openFeatureGeneratorBytes(params.getFeaturegen());


	// TODO: Support Custom resources:
	// Must be loaded into memory, or written to tmp file until descriptor
	// is loaded which defines parses when model is loaded

	Map<String, Object> resources = loadResources(params.getResources(), params.getFeaturegen());

	CmdLineUtil.checkOutputFile("name finder model", modelOutFile);

	if (params.getNameTypes() != null) {
	String nameTypes[] = params.getNameTypes().split(",");
	sampleStream = new NameSampleTypeFilter(nameTypes, sampleStream);
	}

	String sequenceCodecImplName = params.getSequenceCodec();

	if ("BIO".equals(sequenceCodecImplName)) {
	sequenceCodecImplName = BioCodec.class.getName();
	}
	else if ("BILOU".equals(sequenceCodecImplName)) {
	sequenceCodecImplName = BilouCodec.class.getName();
	}

	SequenceCodec<String> sequenceCodec = TokenNameFinderFactory.instantiateSequenceCodec(sequenceCodecImplName);

	TokenNameFinderFactory nameFinderFactory = null;
	try {
	nameFinderFactory = TokenNameFinderFactory.create(params.getFactory(),
	featureGeneratorBytes, resources, sequenceCodec);
	} catch (InvalidFormatException e) {
	throw new TerminateToolException(-1, e.getMessage(), e);
	}

	NameSampleCountersStream counters = new NameSampleCountersStream(sampleStream);
	sampleStream = counters;

	TokenNameFinderModel model;
	try {
	model = opennlp.tools.namefind.NameFinderME.train(
	params.getLang(), params.getType(), sampleStream, mlParams,
	nameFinderFactory);
	}
	catch (IOException e) {
	throw new TerminateToolException(-1, "IO error while reading training data or indexing data: "
	+ e.getMessage(), e);
	}
	finally {
	try {
	sampleStream.close();
	} catch (IOException e) {
	// sorry that this can fail
	}
	}

	System.out.println();
	counters.printSummary();
	System.out.println();

	CmdLineUtil.writeModel("name finder", modelOutFile, model);

	}
	}