blob: 56880072bc07e90bdac80e4a37b2c37214d47716 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.util.impl;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.uima.internal.util.IntVector;
/**
* Share common underlying char[] among strings: Optimize sets of strings for
* efficient storage
*
* Apply it to a set of strings.
*
* Lifecycle:
* 1) make an instance of this class
* 2) call .add (String or String[]) for all strings in the set
*
* or - skip 1 and 2 and pass in a String [] that can be modified (sorted]
*
* 3) call .optimize()
*
* 4) call
* .getString(String) or .getStringArray(String[]) - returns new string objects
* .updateStringArray(String[]) to replace original Strings
* .getOffset(String) - returns int offset in some common string
* .getCommonStringIndex(String)
* 5) call getCommonStrings to get all the common strings
* 6) Let the GC collect the instance of this class
*
* The strings are added first to a big list, instead of directly to a
* stringbuilder in order to improve reuse by sorting for longest strings first
*
* The commonStrings are kept in an array. There are more than one to support
* very large amounts, in excess of 2GB.
*
* Nulls, passed in as strings, are mostly ignored, but handled appropriately.
*
*
*/
public class OptimizeStrings {
/**
* splitSize = 100 means
* the common string can be as large as 100
* the common string can hold a string of length 100
*/
// not final static for testing
private int splitSize = Integer.MAX_VALUE - 2; // avoid boundary issues
private ArrayList<String> inStrings = new ArrayList<String>();
/**
* A two hop map from strings to offsets is used.
* "index" (int):
* The first hop: goes from string to an int index, incrementing 1 per
* unique shared common string segment
* "offset" (int):
* The 2nd hop: goes from the first index to the actual int offset in
* the particular associated piece of the common strings.
* The lastIndexInCommonStringA array holds the last index value
* within each of the common string segments
*/
// the value is either
// an int index into the offset table which has the offset in the common string
// the intindex in the str aux heap for this string, in the deserialized form
// These are distinguished by having the aux heap index be negative
private Map<String, Integer> stringToIndexMap;
// map from string index its offset in the piece of the common string array
private int[] offsets;
private String[] commonStringsA;
/**
* holds the last index (counting down in sort order) that is valid for
* the corresponding common string segment.
*/
private int[] lastIndexInCommonStringsA;
private Map<String, String> returnedStrings = new HashMap<String, String>();
private long savedCharsExact = 0;
private long savedCharsSubstr = 0;
private int nextSeq = -1;
private final boolean doMeasurement;
public OptimizeStrings(boolean doMeasurement) {
this.doMeasurement = doMeasurement;
}
// for testing, mainly
public OptimizeStrings(boolean doMeasurement, int splitSize) {
this.doMeasurement = doMeasurement;
this.splitSize = splitSize;
}
/**
* The number of characters saved - for statistical reporting only
* @return the number of characters saved
*/
public long getSavedCharsExact() {
return savedCharsExact;
}
public long getSavedCharsSubstr() {
return savedCharsSubstr;
}
/**
* The list of common strings
* @return the list of common strings
*/
public String[] getCommonStrings() {
return commonStringsA;
}
/**
* null strings not added
* 0 length strings added
* @param s -
*/
public void add(String s) {
if (inStrings.size() == (Integer.MAX_VALUE -1)) {
throw new RuntimeException(String.format("Exceeded size limit, size = %,d%n", inStrings.size()));
}
if (null != s) {
inStrings.add(s);
}
}
public void add(String[] sa) {
if (null != sa) {
for (String s : sa) {
add(s);
}
}
}
public int getStringIndex(String s) {
if (null == s) {
return -1;
}
return stringToIndexMap.get(s);
}
/**
*
* @param s must not be null
* @return a (positive or 0) or negative number.
* If positive, it is the offset in the common string
* If negative, -v is the index (starting at 1) that sequentially
* increases, for each new unique string fetched using this
* method.
*/
public int getIndexOrSeqIndex(String s) {
if (null == s) {
throw new RuntimeException();
}
int v = stringToIndexMap.get(s);
// v is 0 for ""
if (v >= 0) {
stringToIndexMap.put(s, nextSeq--);
}
return v;
}
/**
* return a string which is made as a substring of the common string
*
* @param s -
* @return an equal string, made as substring of common string instance
* equal results return the same string
*/
public String getString(String s) {
if (null == s) {
return null;
}
int i = getStringIndex(s);
int offset = getOffset(i);
String r = commonStringsA[getCommonStringIndex(i)].substring(offset, offset + s.length());
String rs = returnedStrings.get(r);
if (null != rs) {
return rs;
}
returnedStrings.put(r, r);
return r;
}
public long getOffset(String s) {
if (null == s) {
return -1;
}
return offsets[getStringIndex(s)];
}
public int getOffset(int i) {
return offsets[i];
}
/**
* @param index an index (not offset) to the sorted strings,
* @return the index of the segment it belongs to
*/
public int getCommonStringIndex(int index) {
for (int i = 0; i < lastIndexInCommonStringsA.length; i++) {
if (index >= lastIndexInCommonStringsA[i]) {
return i;
}
}
throw new IllegalArgumentException("index out of range, must be >= 0, was " + index);
}
public String[] getStringArray(String[] sai) {
if (null == sai) {
return null;
}
String[] sa = new String[sai.length];
for (int i = 0; i < sai.length; i++) {
sa[i] = getString(sai[i]);
}
return sa;
}
public void updateStringArray(String[] sa) {
if (null == sa) {
return;
}
for (int i = 0; i < sa.length; i++) {
sa[i] = getString(sa[i]);
}
}
/**
* Fully checking indexof for every new string is prohibitively expensive We
* do a partial check - only checking if a string is a substring of the
* previous one
*/
public void optimize() {
String[] sa = inStrings.toArray(new String[inStrings.size()]);
optimizeI(sa);
inStrings = new ArrayList<String>(); // release space
}
private void optimizeI(String[] sortedStrings) {
savedCharsExact = 0;
savedCharsSubstr = 0;
StringBuilder sb = new StringBuilder();
String[] sortedStrings2 = sortStrings(sortedStrings);
int ssLength;
if (sortedStrings2 != sortedStrings) {
// in this case, duplicates have already been eliminated
ssLength = sortedStrings2.length;
sortedStrings = sortedStrings2;
} else {
ssLength = eliminateSortedStringDuplicates(sortedStrings);
}
// create Offsets table
// sorted so longer ones follow shorter subsumed string
String previous = "";
int previousOffset = 0;
offsets = new int[ssLength];
IntVector lastIndexInCommonStrings = new IntVector();
List<String> commonStrings = new ArrayList<String>();
for (int i = ssLength - 1; i >= 0; i--) {
String s = sortedStrings[i];
int sLength = s.length();
if (sLength > splitSize) {
throw new RuntimeException(String.format("String too long, length = %,d, max length allowed is %,d", sLength, splitSize));
}
if (previous.startsWith(s)) {
offsets[i] = previousOffset;
if (doMeasurement) { // equals case counted in dupl removal
savedCharsSubstr += sLength;
}
} else {
if (sb.length() + sLength > splitSize) {
commonStrings.add(sb.toString());
sb = new StringBuilder();
lastIndexInCommonStrings.add(i + 1); // previous index because index counting down
}
offsets[i] = previousOffset = sb.length();
sb.append(previous = s);
}
}
commonStrings.add(sb.toString()); // add the last (partial) one
lastIndexInCommonStrings.add(0); // the last index
// convert List<Integer> to int[]
lastIndexInCommonStrings.toArray();
lastIndexInCommonStringsA = lastIndexInCommonStrings.toArray();
commonStringsA = commonStrings.toArray(new String[commonStrings.size()]);
// prepare map from original string object to index in sorted arrays and offsets
// index also used to find common string segment.
stringToIndexMap = new HashMap<String, Integer>(ssLength);
for (int i = ssLength - 1; i >= 0; i--) {
stringToIndexMap.put(sortedStrings[i], i);
}
}
/**
* Scan sorted strings, removing duplicates.
* Copy refs so that final array has no dups up to new length
* Return new length, but don't trim array
* @param sortedStrings
* @return new length
*/
private int eliminateSortedStringDuplicates(String[] sortedStrings) {
if (sortedStrings.length == 0) {
return 0;
}
String prev = sortedStrings[0];
int to = 1;
for(int from = 1; from < sortedStrings.length; from++) {
String s = sortedStrings[from];
if (s.equals(prev)) {
if (doMeasurement) {
savedCharsExact += s.length();
}
continue;
}
prev = s;
sortedStrings[to] = s;
to++;
}
return to; // to is length, is also where next string would be copied to
}
private String[] sortStrings(String[] sa) {
try {
Arrays.sort(sa);
// testing
// Set<String> orderedSet = new TreeSet<String>();
// for (String s : inStrings) {
// orderedSet.add(s);
// }
// Iterator<String> it = orderedSet.iterator();
// for (int i = 0; i < sa.length; i++) {
// String s = it.next();
// if (sa[i] != s) {
// throw new RuntimeException(String.format(
// "Mismatch, old way sort(i = %,d) = %s, new way is %s", i, s));
// }
// while ((i < sa.length - 1) && (sa[i + 1].equals(sa[i]))) {
// i++;
// }
// }
// end of test
return sa;
} catch (StackOverflowError e) {
// see https://issues.apache.org/jira/browse/UIMA-2515
// debug/test
// System.out.println("hit stack overflow");
Set<String> orderedSet = new TreeSet<String>();
for (String s : inStrings) {
if (!orderedSet.add(s)) {
savedCharsExact += s.length();
}
}
sa = new String[orderedSet.size()]; // has dups removed already
Iterator<String> it = orderedSet.iterator();
for (int i = 0; i < sa.length; i++) {
sa[i] = it.next();
}
return sa;
}
}
}