blob: d6f4a396d0eb24763611478e1181ccc4c8693705 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.resource;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;
import org.apache.uima.internal.util.XMLUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
 * Reads and writes multi tree word lists (MTWL): a trie of {@link MultiTextNode}s serialized as
 * XML, optionally wrapped in a zip archive that holds the XML as its first entry.
 */
public class MultiTreeWordListPersistence {

  /** Encoding used by the overloads that do not take an explicit encoding. */
  private static final String DEFAULT_ENCODING = "UTF-8";

  /** Leading characters that identify an uncompressed (plain XML) word list. */
  private static final String XML_PROLOG = "<?xml";

  /**
   * Reads the word list file at the specified path (plain XML or zipped XML, decoded as UTF-8)
   * and hands its content to a handler created for the given root node.
   *
   * @param root
   *          - the root node of the tree
   * @param path
   *          - path of the word list
   * @throws IOException
   *           - when there is a problem reading the stream
   */
  public void readMTWL(MultiTextNode root, String path) throws IOException {
    try (FileInputStream stream = new FileInputStream(path)) {
      readMTWL(root, stream, DEFAULT_ENCODING);
    }
  }

  /**
   * Sniffs the content type for xml type. The stream is reset to its original position before
   * this method returns normally.
   *
   * @param is
   *          the inputStream to sniff. Must support {@link InputStream#markSupported()}
   * @return true if this stream starts with '{@literal <?xml}'
   * @throws IOException
   *           - when the stream is null, does not support mark/reset, or there is a problem
   *           reading the stream
   */
  public static boolean isSniffedXmlContentType(InputStream is) throws IOException {
    if (is == null) {
      throw new IOException("Stream is null");
    }
    if (!is.markSupported()) {
      throw new IOException("Cannot mark stream. just wrap it in a BufferedInputStream");
    }
    byte[] bytes = new byte[XML_PROLOG.length()]; // peek first five letters
    is.mark(bytes.length);
    // A single read() may deliver fewer bytes than requested, so keep reading until the peek
    // buffer is full or the stream ends.
    int total = 0;
    while (total < bytes.length) {
      int count = is.read(bytes, total, bytes.length - total);
      if (count < 0) {
        break;
      }
      total += count;
    }
    is.reset();
    // Decode with a fixed charset so the result does not depend on the platform default.
    return XML_PROLOG.equals(new String(bytes, 0, total, "US-ASCII"));
  }

  /**
   * Reads a word list from the given stream. Content that does not start with
   * '{@literal <?xml}' is treated as a zip archive whose first entry is the XML document. The
   * given stream is not closed by this method.
   *
   * @param root
   *          - the root node of the tree, passed to the XML event handler
   * @param stream
   *          - the stream providing the plain or zipped word list
   * @param encoding
   *          - name of the charset used to decode the XML content
   * @throws IOException
   *           - when there is a problem reading the stream
   * @throws IllegalStateException
   *           - when the XML parser reports a {@link SAXException}
   */
  public void readMTWL(MultiTextNode root, InputStream stream, String encoding) throws IOException {
    try {
      InputStream is = new BufferedInputStream(stream); // adds mark/reset support
      if (!isSniffedXmlContentType(is)) { // MTWL is zipped
        ZipInputStream zip = new ZipInputStream(is);
        zip.getNextEntry(); // zip must contain a single entry
        is = zip;
      }
      InputStreamReader streamReader = new InputStreamReader(is, encoding);
      TrieXMLEventHandler handler = new TrieXMLEventHandler(root);
      XMLReader reader = XMLUtils.createXMLReader();
      reader.setContentHandler(handler);
      reader.setErrorHandler(handler);
      reader.parse(new InputSource(streamReader));
    } catch (SAXException e) {
      throw new IllegalStateException(e);
    }
  }

  /**
   * Writes the tree below the given root as a zipped, UTF-8 encoded word list.
   *
   * @param root
   *          - the root node of the tree
   * @param path
   *          - path of the file to write
   * @throws IOException
   *           - when there is a problem writing the file
   */
  public void createMTWLFile(MultiTextNode root, String path) throws IOException {
    createMTWLFile(root, path, true, DEFAULT_ENCODING);
  }

  /**
   * Writes the tree below the given root as a UTF-8 encoded word list. Note that the parameter
   * order differs from the four-argument overload.
   *
   * @param root
   *          - the root node of the tree
   * @param compressed
   *          - true to write a zip archive, false to write plain XML
   * @param path
   *          - path of the file to write
   * @throws IOException
   *           - when there is a problem writing the file
   */
  public void createMTWLFile(MultiTextNode root, boolean compressed, String path)
          throws IOException {
    createMTWLFile(root, path, compressed, DEFAULT_ENCODING);
  }

  /**
   * Writes the tree below the given root as a word list.
   *
   * @param root
   *          - the root node of the tree
   * @param path
   *          - path of the file to write
   * @param compressed
   *          - true to write a zip archive, false to write plain XML
   * @param encoding
   *          - name of the charset used to encode the XML content
   * @throws IOException
   *           - when there is a problem writing the file
   */
  public void createMTWLFile(MultiTextNode root, String path, boolean compressed, String encoding)
          throws IOException {
    if (compressed) {
      writeCompressedMTWLFile(root, path, encoding);
    } else {
      writeUncompressedMTWLFile(root, path, encoding);
    }
  }

  /**
   * Writes the XML into a zip archive at the given path. The archive gets one entry, named
   * after the given path.
   */
  private void writeCompressedMTWLFile(MultiTextNode root, String path, String encoding)
          throws IOException {
    // try-with-resources releases the file handle even if writing fails or the encoding is
    // not supported.
    try (ZipOutputStream zos = new ZipOutputStream(
            new BufferedOutputStream(new FileOutputStream(path)));
            OutputStreamWriter writer = new OutputStreamWriter(zos, encoding)) {
      zos.putNextEntry(new ZipEntry(path));
      writeMTWLFile(root, writer);
      // push buffered characters into the entry before it is closed
      writer.flush();
      zos.closeEntry();
    }
  }

  /** Writes the XML directly to the file at the given path. */
  private void writeUncompressedMTWLFile(MultiTextNode root, String path, String encoding)
          throws IOException {
    try (FileOutputStream output = new FileOutputStream(path);
            OutputStreamWriter writer = new OutputStreamWriter(output, encoding)) {
      writeMTWLFile(root, writer);
    }
  }

  /** Writes the XML declaration and the root element enclosing all children of the root node. */
  private void writeMTWLFile(MultiTextNode root, Writer writer) throws IOException {
    writer.write("<?xml version=\"1.0\" ?><root>");
    for (MultiTextNode node : root.getChildren().values()) {
      writeTextNode(writer, node);
    }
    writer.write("</root>");
  }

  /**
   * Writes one node as an {@code <n>} element: its value in a {@code <c>} CDATA element, one
   * {@code <t>} element per type if the node is a word end, then all children recursively.
   * Write failures are propagated so that a partially written file is not reported as success.
   */
  private void writeTextNode(Writer writer, MultiTextNode node) throws IOException {
    writer.write("\n");
    writer.write("<n>");
    writer.write("<c><![CDATA[" + node.getValue() + "]]></c>");
    if (node.isWordEnd()) {
      for (String type : node.getTypes()) {
        writer.write("<t>" + type + "</t>");
      }
    }
    for (MultiTextNode child : node.getChildren().values()) {
      writeTextNode(writer, child);
    }
    writer.write("</n>");
  }
}