blob: 3a5d48c6d32d67b942979184005af4db7f9c4466 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.core;
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
/**
*
* TextRulerExampleDocumentSet encapsulates an input set of documents, e.g. examples for a learning
* algorithm. It creates an instance of TextRulerExampleDocument for each XMI file found in the
* passed input folder.
*
* For loading CASes you have to provide a CasCache. If you use TextRulerBasicLearner, this is done
* for you automatically.
*
* hint: this could be renamed to MLDocumentSet instead of TextRulerExampleDocumentSet ?
*/
public class TextRulerExampleDocumentSet {

  /** The example documents of this set, in the order they were found or passed in. */
  protected List<TextRulerExampleDocument> documents;

  /** The cache used for loading CASes; it is handed to every document this set creates. */
  protected CasCache casCache;

  /**
   * Creates a document set with one TextRulerExampleDocument per file in the given folder whose
   * name ends with ".xmi".
   *
   * If the folder cannot be listed (it does not exist, is not a directory, or an I/O error
   * occurs), a message is logged and the set stays empty.
   *
   * @param xmiFolderName
   *          path of the folder that is scanned for XMI files
   * @param casCache
   *          the cache that is passed on to every created document
   */
  public TextRulerExampleDocumentSet(String xmiFolderName, CasCache casCache) {
    super();
    documents = new ArrayList<TextRulerExampleDocument>();
    this.casCache = casCache;
    File trainingFolder = new File(xmiFolderName);
    File[] files = trainingFolder.listFiles(new FilenameFilter() {
      public boolean accept(File dir, String name) {
        return (name.endsWith(".xmi"));
      }
    });
    if (files == null) {
      // File.listFiles returns null instead of an empty array when the path is not a directory
      // or an I/O error occurs; iterating over it would throw a NullPointerException.
      TextRulerToolkit.log("could not list XMI files of folder: " + xmiFolderName);
      return;
    }
    for (File file : files) {
      TextRulerToolkit.log("found document XMI file: " + file.getName());
      documents.add(new TextRulerExampleDocument(file.getAbsolutePath(), casCache));
    }
  }

  /**
   * Creates a document set from an explicit list of XMI file names. Used for subset creation, see
   * {@link #partitionIntoSubsets(int[])}.
   *
   * @param inputXmiFiles
   *          the file names, one document is created for each of them
   * @param casCache
   *          the cache that is passed on to every created document
   */
  protected TextRulerExampleDocumentSet(String[] inputXmiFiles, CasCache casCache) {
    super();
    this.casCache = casCache;
    documents = new ArrayList<TextRulerExampleDocument>();
    for (String fileName : inputXmiFiles) {
      documents.add(new TextRulerExampleDocument(fileName, casCache));
    }
  }

  /**
   * Asks every document of this set to create its examples for the given target. The documents
   * are visited in the order returned by {@link #getSortedDocumentsInCacheOptimizedOrder()}.
   *
   * @param target
   *          the target that is passed to each document
   */
  public void createExamplesForTarget(TextRulerTarget target) {
    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder();
    for (TextRulerExampleDocument doc : sortedDocs) {
      doc.createExamplesForTarget(target);
    }
  }

  /** Asks every document of this set to clear its current examples. */
  public void clearCurrentExamples() {
    for (TextRulerExampleDocument doc : documents) {
      doc.clearCurrentExamples();
    }
  }

  /**
   * @return the CASes currently held by the CAS cache (delegates to the cache)
   */
  public Collection<CAS> getCachedCASes() {
    return casCache.getCachedCASes();
  }

  /**
   * @param key
   *          the cache key to look up; this class uses the CAS file name of a document as key
   * @return whether the CAS cache contains an element for the given key
   */
  public boolean casCacheContainsKey(String key) {
    return casCache.containsElementWithKey(key);
  }

  /**
   * @return the positive and negative examples of all documents
   */
  public List<TextRulerExample> getAllExamples() {
    return getAllExamples(false);
  }

  /**
   * @return the positive examples of all documents
   */
  public List<TextRulerExample> getAllPositiveExamples() {
    return getAllExamples(true);
  }

  /**
   * Collects the examples of all documents into a new list. For each document, its positive
   * examples are added first, followed by its negative examples unless onlyPositives is set.
   *
   * @param onlyPositives
   *          if true, the negative examples are left out
   * @return a newly created list with the collected examples
   */
  public List<TextRulerExample> getAllExamples(boolean onlyPositives) {
    List<TextRulerExample> result = new ArrayList<TextRulerExample>();
    for (TextRulerExampleDocument doc : documents) {
      result.addAll(doc.getPositiveExamples());
      if (!onlyPositives) {
        result.addAll(doc.getNegativeExamples());
      }
    }
    return result;
  }

  /**
   * @return the internal document list of this set (not a copy)
   */
  public List<TextRulerExampleDocument> getDocuments() {
    return documents;
  }

  /**
   * Returns the given documents as an array in which the documents whose CAS file name is
   * currently a key of the CAS cache come first, so that they can be used directly without
   * reloading all documents every time. The cached documents keep the iteration order of the
   * given collection; the remaining documents follow in the iteration order of a HashSet.
   *
   * @param documents
   *          the documents to arrange
   * @return an array with the same size as the given collection
   */
  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder(
          Collection<TextRulerExampleDocument> documents) {
    Set<TextRulerExampleDocument> docsLeft = new HashSet<TextRulerExampleDocument>(documents);
    TextRulerExampleDocument[] sortedDocs = new TextRulerExampleDocument[documents.size()];

    // "sort" the currently cached documents to the front of the document list, so that we can
    // use them directly and do not have to reload all docs every time we come here!
    int i = 0;
    for (TextRulerExampleDocument doc : documents) {
      if (casCacheContainsKey(doc.getCasFileName())) {
        docsLeft.remove(doc);
        sortedDocs[i] = doc;
        i++;
      }
    }
    // everything that is not cached follows behind
    for (TextRulerExampleDocument doc : docsLeft) {
      sortedDocs[i] = doc;
      i++;
    }
    if (TextRulerToolkit.DEBUG) {
      TextRulerToolkit.logIf(i != documents.size(), "ERROR, SIZE MISMATCH!");
    }
    return sortedDocs;
  }

  /**
   * @return the documents of this set, arranged as described in
   *         {@link #getSortedDocumentsInCacheOptimizedOrder(Collection)}
   */
  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder() {
    return getSortedDocumentsInCacheOptimizedOrder(documents);
  }

  /**
   * Builds a histogram over the sizes of all slot annotations of the given slot name in all
   * documents. The size of a slot annotation is the number of annotations that
   * TextRulerToolkit.getAnnotationsWithinBounds returns for its begin/end offsets, the filter set
   * derived from slotName and filterSet, and the type TextRulerToolkit.RUTA_ANY_TYPE_NAME
   * (presumably the tokens covered by the slot).
   *
   * @param slotName
   *          the name of the slot whose annotations are measured
   * @param filterSet
   *          the filter set that is combined with the slot name for the lookup
   * @return a list in which the entry at index n is the number of slot annotations with size n;
   *         its length is the largest size found plus one, so it always has at least one entry
   */
  public List<Integer> getTokenCountHistogrammForSlotName(String slotName, Set<String> filterSet) {
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    int maxLen = 0;
    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder(documents);
    for (TextRulerExampleDocument doc : sortedDocs) {
      CAS aCas = doc.getCAS();
      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
      TypeSystem ts = aCas.getTypeSystem();
      for (AnnotationFS a : slots) {
        List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas,
                a.getBegin(), a.getEnd(),
                TextRulerToolkit.getFilterSetWithSlotName(slotName, filterSet),
                ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME));
        int len = slotTokens.size();
        if (len > maxLen) {
          maxLen = len;
        }
        Integer key = Integer.valueOf(len);
        Integer previous = map.get(key);
        int current = previous == null ? 0 : previous.intValue();
        // a histogram bucket counts occurrences, so it grows by one per slot (not by len)
        map.put(key, Integer.valueOf(current + 1));
      }
    }
    List<Integer> resultList = new ArrayList<Integer>(maxLen + 1);
    for (int i = 0; i <= maxLen; i++) {
      Integer count = map.get(Integer.valueOf(i));
      resultList.add(count == null ? Integer.valueOf(0) : count);
    }
    return resultList;
  }

  /**
   * @param key
   *          the cache key of the requested CAS
   * @return whatever the CAS cache returns for the given key (delegates to the cache)
   */
  public CAS getCAS(String key) {
    return casCache.getCAS(key);
  }

  /**
   * @return the number of documents in this set
   */
  public int size() {
    return documents.size();
  }

  /**
   * @param fileName
   *          the CAS file name to look for
   * @return the first document whose CAS file name equals the given name, or null if there is
   *         none
   */
  public TextRulerExampleDocument getDocumentForFileName(String fileName) {
    for (TextRulerExampleDocument doc : documents) {
      if (doc.getCasFileName().equals(fileName)) {
        return doc;
      }
    }
    return null;
  }

  // TODO this is not tested yet!
  /**
   * Splits this set into consecutive, non-overlapping subsets that share the CAS cache of this
   * set. Every subset but the last gets its rounded percentage share of the documents (at least
   * one document); the last subset gets all documents that are left, so that every document ends
   * up in exactly one subset.
   *
   * @param percentages
   *          the share of each subset in percent; every value has to be positive and the values
   *          have to add up to 100
   * @return the subsets in the order of the given percentages, or null (after logging the reason)
   *         if the percentages are invalid or there are too few documents to give every subset
   *         at least one
   */
  public List<TextRulerExampleDocumentSet> partitionIntoSubsets(int[] percentages) {
    List<TextRulerExampleDocumentSet> result = new ArrayList<TextRulerExampleDocumentSet>();
    int sum = 0;
    for (int p : percentages) {
      // negative values are rejected as well: they could still add up to 100 and would lead to
      // a negative partition size
      if (p <= 0) {
        TextRulerToolkit
                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero!");
        return null;
      }
      sum += p;
    }
    if (sum != 100) {
      TextRulerToolkit
              .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] percentages has to be 100 in total!");
      return null;
    }
    int total = size();
    int rest = total;
    int docIndex = 0;
    for (int i = 0; i < percentages.length; i++) {
      int partSize;
      if (i < percentages.length - 1) {
        // every partition but the last one gets its rounded share, but at least one document
        partSize = Math.round((percentages[i] * total) / 100.0f);
        if (partSize == 0) {
          partSize = 1;
        }
      } else {
        // the last partition takes whatever is left, which absorbs all rounding differences
        partSize = rest;
      }
      // partSize > rest happens when rounding (or the minimum of one) asks for more documents
      // than are left; reading them would run past the end of the document list
      if (partSize == 0 || partSize > rest) {
        TextRulerToolkit
                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero! too few example documents for your partition?");
        return null;
      }
      String[] fileNames = new String[partSize];
      for (int doc = 0; doc < partSize; doc++) {
        fileNames[doc] = documents.get(doc + docIndex).getCasFileName();
      }
      docIndex += partSize;
      result.add(new TextRulerExampleDocumentSet(fileNames, casCache));
      rest -= partSize;
    }
    return result;
  }
}