blob: dcb65543606dab33a44f99962d41cf383d4352cd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.morfologik.tagdict;
import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
import morfologik.stemming.DictionaryMetadata;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.TagDictionary;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.ModelUtil;
/**
 * A {@link POSTaggerFactory} whose tag dictionary is backed by a Morfologik
 * binary dictionary.
 *
 * <p>The raw bytes of the dictionary and of its metadata ("info") file are kept
 * by the factory and exposed through {@link #createArtifactMap()} under the
 * entries {@code tagdict.morfologik_dict} and {@code tagdict.morfologik_info}.
 * When no dictionary has been set, {@link #getTagDictionary()} rebuilds it from
 * these two artifacts if an artifact provider supplies them.
 */
public class MorfologikPOSTaggerFactory extends POSTaggerFactory {

  /** Name of the system property that {@link #init} reads the dictionary path from. */
  private static final String DICT_PATH_PROPERTY = "morfologik.dict";

  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";

  private static final String MORFOLOGIK_POSDICT = "tagdict."
      + MORFOLOGIK_POSDICT_SUF;
  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
      + MORFOLOGIK_DICT_INFO_SUF;

  private TagDictionary dict;

  private byte[] dictInfo;
  private byte[] dictData;

  public MorfologikPOSTaggerFactory() {
  }

  /**
   * Loads a Morfologik dictionary and its metadata file from disk, remembers
   * their raw bytes for {@link #createArtifactMap()} and returns the resulting
   * tag dictionary.
   *
   * @param dictionary the binary Morfologik dictionary file; the metadata file
   *        is looked up at the location reported by
   *        {@link DictionaryMetadata#getExpectedMetadataLocation(Path)}
   * @return a tag dictionary backed by the loaded Morfologik dictionary
   * @throws FileNotFoundException if the dictionary or its metadata file
   *         cannot be read
   * @throws IOException if reading or parsing the files fails
   */
  public TagDictionary createTagDictionary(File dictionary)
      throws InvalidFormatException, FileNotFoundException, IOException {

    if (!dictionary.canRead()) {
      throw new FileNotFoundException("Could not read dictionary: "
          + dictionary.getAbsolutePath());
    }

    Path dictionaryMeta = DictionaryMetadata
        .getExpectedMetadataLocation(dictionary.toPath());

    // The null case is reported separately: it must not dereference the path
    // while building the error message.
    if (dictionaryMeta == null) {
      throw new FileNotFoundException(
          "Could not determine the metadata location for dictionary: "
              + dictionary.getAbsolutePath());
    }
    if (!dictionaryMeta.toFile().canRead()) {
      throw new FileNotFoundException("Could not read dictionary metadata: "
          + dictionaryMeta.getFileName());
    }

    byte[] data = Files.readAllBytes(dictionary.toPath());
    byte[] info = Files.readAllBytes(dictionaryMeta);
    TagDictionary created = createMorfologikDictionary(data, info);

    // Only remember the bytes once they are known to form a valid dictionary.
    this.dictData = data;
    this.dictInfo = info;

    return created;
  }

  /**
   * Initializes the factory and loads the Morfologik dictionary from the path
   * given by the {@code morfologik.dict} system property. The dictionary loaded
   * from that path replaces {@code posDictionary}.
   *
   * @param ngramDictionary passed on to the superclass
   * @param posDictionary stored as the current dictionary until the Morfologik
   *        dictionary has been loaded successfully
   * @throws IllegalArgumentException if the system property is not set, or the
   *         dictionary or its metadata file cannot be opened or parsed
   */
  @Override
  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
    super.init(ngramDictionary, null);
    this.dict = posDictionary;

    // get the dictionary path
    String path = System.getProperty(DICT_PATH_PROPERTY);
    if (path == null) {
      throw new IllegalArgumentException("The property " + DICT_PATH_PROPERTY
          + " is missing! -D" + DICT_PATH_PROPERTY + "=path");
    }

    // now we try to load it...
    try {
      byte[] data = Files.readAllBytes(Paths.get(path));
      byte[] info = Files.readAllBytes(getExpectedPropertiesFile(path)
          .toPath());
      TagDictionary created = createMorfologikDictionary(data, info);

      // Commit the state only after the dictionary was built successfully.
      this.dictData = data;
      this.dictInfo = info;
      this.dict = created;
    } catch (IllegalArgumentException e) {
      throw new IllegalArgumentException(
          "The file is not a Morfologik dictionary!", e);
    } catch (IOException e) {
      throw new IllegalArgumentException(
          "Could not open the Morfologik dictionary or the .info file", e);
    }
  }

  /**
   * Returns the tag dictionary. If none is set yet and the artifact provider
   * holds the Morfologik dictionary artifact, the dictionary is built from the
   * stored artifacts and cached.
   *
   * @return the tag dictionary, or {@code null} if none is set and none could
   *         be restored from the artifact provider
   * @throws IllegalStateException if the dictionary artifact is present but the
   *         metadata artifact is missing
   * @throws RuntimeException if the stored artifacts cannot be loaded
   */
  @Override
  public TagDictionary getTagDictionary() {
    if (this.dict != null || artifactProvider == null) {
      return this.dict;
    }

    Object data = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
    if (data == null) {
      return this.dict;
    }

    Object info = artifactProvider.getArtifact(MORFOLOGIK_DICT_INFO);
    if (info == null) {
      // Fail with a clear message instead of a NullPointerException raised
      // while wrapping the missing bytes in a stream.
      throw new IllegalStateException("The artifact " + MORFOLOGIK_DICT_INFO
          + " is missing, but " + MORFOLOGIK_POSDICT + " is present.");
    }

    try {
      this.dict = createMorfologikDictionary((byte[]) data, (byte[]) info);
    } catch (IllegalArgumentException e) {
      throw new RuntimeException(
          "Could not load the dictionary files to Morfologik.", e);
    } catch (IOException e) {
      throw new RuntimeException(
          "IO error while reading the Morfologik dictionary files.", e);
    }

    return this.dict;
  }

  /**
   * Replaces the current tag dictionary. The stored dictionary bytes are not
   * touched by this method.
   *
   * @param dictionary the dictionary to use from now on
   */
  @Override
  public void setTagDictionary(TagDictionary dictionary) {
    this.dict = dictionary;
  }

  /**
   * Not supported by this factory.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public TagDictionary createEmptyTagDictionary() {
    throw new UnsupportedOperationException(
        "Morfologik POS Tagger factory does not support this operation");
  }

  /**
   * Not supported by this factory; use {@link #createTagDictionary(File)}.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public TagDictionary createTagDictionary(InputStream in)
      throws InvalidFormatException, IOException {
    throw new UnsupportedOperationException(
        "Morfologik POS Tagger factory does not support this operation");
  }

  /**
   * Adds byte-array serializers for the two Morfologik artifact suffixes to the
   * serializers provided by the superclass.
   *
   * @return the serializer map of the superclass extended by the Morfologik
   *         entries
   */
  @Override
  @SuppressWarnings("rawtypes")
  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
    Map<String, ArtifactSerializer> serializers = super
        .createArtifactSerializersMap();

    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());

    return serializers;
  }

  /**
   * Adds the raw dictionary and metadata bytes held by this factory to the
   * artifact map provided by the superclass.
   *
   * @return the artifact map of the superclass extended by the Morfologik
   *         entries
   */
  @Override
  public Map<String, Object> createArtifactMap() {
    Map<String, Object> artifactMap = super.createArtifactMap();
    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
    return artifactMap;
  }

  /**
   * Builds a tag dictionary from in-memory dictionary and metadata bytes.
   *
   * @param data the bytes of the binary Morfologik dictionary
   * @param info the bytes of the dictionary metadata file
   * @return a {@link MorfologikTagDictionary} wrapping the parsed dictionary
   * @throws IOException if Morfologik fails to read the streams
   */
  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
      throws IOException {
    morfologik.stemming.Dictionary morfologikDict = morfologik.stemming.Dictionary
        .read(new ByteArrayInputStream(data), new ByteArrayInputStream(info));
    return new MorfologikTagDictionary(morfologikDict);
  }

  /** Serializer that stores and restores an artifact as a plain byte array. */
  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {

    /** Reads the artifact bytes from the stream via {@link ModelUtil#read}. */
    @Override
    public byte[] create(InputStream in) throws IOException,
        InvalidFormatException {
      return ModelUtil.read(in);
    }

    /** Writes the artifact bytes to the stream unchanged. */
    @Override
    public void serialize(byte[] artifact, OutputStream out) throws IOException {
      out.write(artifact);
    }
  }
}