uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java - uima-uimaj - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.uima.util.impl;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;

 import org.apache.uima.internal.util.IntVector;

 /**
  * Share common underlying char[] among strings: Optimize sets of strings for
  * efficient storage
  *
  * Apply it to a set of strings.
  *
  * Lifecycle:
  *   1) make an instance of this class
  *   2) call .add (String or String[]) for all strings in the set
  *
  *   or - skip 1 and 2 and pass in a String [] that can be modified (sorted]
  *
  *   3) call .optimize()
  *
  *   4) call
  *        .getString(String) or .getStringArray(String[]) - returns new string objects
  *        .updateStringArray(String[]) to replace original Strings
  *        .getOffset(String) - returns int offset in some common string
  *        .getCommonStringIndex(String)
  *   5) call getCommonStrings to get all the common strings
  *   6) Let the GC collect the instance of this class
  *
  * The strings are added first to a big list, instead of directly to a
  * stringbuilder in order to improve reuse by sorting for longest strings first
  *
  * The commonStrings are kept in an array.  There are more than one to support
  * very large amounts, in excess of 2GB.
  *
  * Nulls, passed in as strings, are mostly ignored, but handled appropriately.
  *
  *
  */
 public class OptimizeStrings {


   /**
    * splitSize = 100 means
    * the common string can be as large as 100
    * the common string can hold a string of length 100
    */
   // not final static for testing
   private int splitSize = Integer.MAX_VALUE - 2;  // avoid boundary issues

   private ArrayList<String> inStrings = new ArrayList<String>();

   /**
    * A two hop map from strings to offsets is used.
    * "index" (int):
    *    The first hop: goes from string to an int index, incrementing 1 per
    *      unique shared common string segment
    * "offset" (int):
    * The 2nd hop: goes from the first index to the actual int offset in
    *   the particular associated piece of the common strings.
    * The lastIndexInCommonStringA array holds the last index value
    *   within each of the common string segments
    */

   // the value is either
   //   an int index into the offset table which has the offset in the common string
   //   the intindex in the str aux heap for this string, in the deserialized form
   //   These are distinguished by having the aux heap index be negative
   private Map<String, Integer> stringToIndexMap;
   // map from string index its offset in the piece of the common string array
   private int[]             offsets;
   private String[]          commonStringsA;
   /**
    * holds the last index (counting down in sort order) that is valid for
    * the corresponding common string segment.
    */
   private int[] lastIndexInCommonStringsA;

   private Map<String, String> returnedStrings = new HashMap<String, String>();

   private long              savedCharsExact   = 0;
   private long              savedCharsSubstr  = 0;
   private int               nextSeq           = -1;

   private final boolean     doMeasurement;

   public OptimizeStrings(boolean doMeasurement) {
     this.doMeasurement = doMeasurement;
   }

   // for testing, mainly
   public OptimizeStrings(boolean doMeasurement, int splitSize) {
     this.doMeasurement = doMeasurement;
     this.splitSize = splitSize;
   }

   /**
    * The number of characters saved - for statistical reporting only
    * @return the number of characters saved
    */
   public long getSavedCharsExact() {
     return savedCharsExact;
   }

   public long getSavedCharsSubstr() {
     return savedCharsSubstr;
   }

   /**
    * The list of common strings
    * @return the list of common strings
    */
   public String[] getCommonStrings() {
     return commonStringsA;
   }

   /**
    * null strings not added
    * 0 length strings added
    * @param s -
    */
   public void add(String s) {
     if (inStrings.size() == (Integer.MAX_VALUE -1)) {
       throw new RuntimeException(String.format("Exceeded size limit, size = %,d%n", inStrings.size()));
     }
     if (null != s) {
       inStrings.add(s);
     }
   }

   public void add(String[] sa) {
     if (null != sa) {
       for (String s : sa) {
         add(s);
       }
     }
   }

   public int getStringIndex(String s) {
     if (null == s) {
       return -1;
     }
     return stringToIndexMap.get(s);
   }

   /**
    *
    * @param s  must not be null
    * @return a (positive or 0) or negative number.
    * If positive, it is the offset in the common string
    * If negative, -v is the index (starting at 1) that sequentially
    * increases, for each new unique string fetched using this
    * method.
    */
   public int getIndexOrSeqIndex(String s) {
     if (null == s) {
       throw new RuntimeException();
     }
     int v = stringToIndexMap.get(s);
     // v is 0 for ""
     if (v >= 0) {
       stringToIndexMap.put(s, nextSeq--);
     }
     return v;
   }

   /**
    * return a string which is made as a substring of the common string
    *
    * @param s -
    * @return an equal string, made as substring of common string instance
    *         equal results return the same string
    */
   public String getString(String s) {
     if (null == s) {
       return null;
     }
     int i = getStringIndex(s);
     int offset = getOffset(i);
     String r = commonStringsA[getCommonStringIndex(i)].substring(offset, offset + s.length());
     String rs = returnedStrings.get(r);
     if (null != rs) {
       return rs;
     }
     returnedStrings.put(r, r);
     return r;
   }

   public long getOffset(String s) {
     if (null == s) {
       return -1;
     }
     return offsets[getStringIndex(s)];
   }

   public int getOffset(int i) {
     return offsets[i];
   }

   /**
    * @param index an index (not offset) to the sorted strings,
    * @return the index of the segment it belongs to
    */
   public int getCommonStringIndex(int index) {
     for (int i = 0; i < lastIndexInCommonStringsA.length; i++) {
       if (index >= lastIndexInCommonStringsA[i]) {
         return i;
       }
     }
     throw new IllegalArgumentException("index out of range, must be >= 0, was " + index);
   }

   public String[] getStringArray(String[] sai) {
     if (null == sai) {
       return null;
     }
     String[] sa = new String[sai.length];
     for (int i = 0; i < sai.length; i++) {
       sa[i] = getString(sai[i]);
     }
     return sa;
   }

   public void updateStringArray(String[] sa) {
     if (null == sa) {
       return;
     }
     for (int i = 0; i < sa.length; i++) {
       sa[i] = getString(sa[i]);
     }
   }

   /**
    * Fully checking indexof for every new string is prohibitively expensive We
    * do a partial check - only checking if a string is a substring of the
    * previous one
    */
   public void optimize() {
     String[] sa = inStrings.toArray(new String[inStrings.size()]);
     optimizeI(sa);
     inStrings = new ArrayList<String>();  // release space
   }

   private void optimizeI(String[] sortedStrings) {
     savedCharsExact = 0;
     savedCharsSubstr = 0;
     StringBuilder sb = new StringBuilder();

     String[] sortedStrings2 = sortStrings(sortedStrings);

     int ssLength;
     if (sortedStrings2 != sortedStrings) {
       // in this case, duplicates have already been eliminated
       ssLength = sortedStrings2.length;
       sortedStrings = sortedStrings2;
     } else {
       ssLength = eliminateSortedStringDuplicates(sortedStrings);
     }
     // create Offsets table
     // sorted so longer ones follow shorter subsumed string
     String previous = "";
     int previousOffset = 0;
     offsets = new int[ssLength];
     IntVector lastIndexInCommonStrings = new IntVector();
     List<String> commonStrings = new ArrayList<String>();

     for (int i = ssLength - 1; i >= 0; i--) {
       String s = sortedStrings[i];
       int sLength = s.length();
       if (sLength > splitSize) {
         throw new RuntimeException(String.format("String too long, length = %,d, max length allowed is %,d", sLength, splitSize));
       }
       if (previous.startsWith(s)) {
         offsets[i] = previousOffset;
         if (doMeasurement) {  // equals case counted in dupl removal
           savedCharsSubstr += sLength;
         }
       } else {
         if (sb.length() + sLength  > splitSize) {
           commonStrings.add(sb.toString());
           sb = new StringBuilder();
           lastIndexInCommonStrings.add(i + 1);  // previous index because index counting down
         }
         offsets[i] = previousOffset = sb.length();
         sb.append(previous = s);
       }
     }
     commonStrings.add(sb.toString());  // add the last (partial) one
     lastIndexInCommonStrings.add(0);   // the last index

     // convert List<Integer> to int[]
     lastIndexInCommonStrings.toArray();
     lastIndexInCommonStringsA = lastIndexInCommonStrings.toArray();

     commonStringsA = commonStrings.toArray(new String[commonStrings.size()]);


     // prepare map from original string object to index in sorted arrays and offsets
     // index also used to find common string segment.
     stringToIndexMap = new HashMap<String, Integer>(ssLength);
     for (int i = ssLength - 1; i >= 0; i--) {
       stringToIndexMap.put(sortedStrings[i], i);
     }
   }

   /**
    * Scan sorted strings, removing duplicates.
    * Copy refs so that final array has no dups up to new length
    * Return new length, but don't trim array
    * @param sortedStrings
    * @return new length
    */
   private int eliminateSortedStringDuplicates(String[] sortedStrings) {
     if (sortedStrings.length == 0) {
       return 0;
     }
     String prev = sortedStrings[0];
     int to = 1;
     for(int from = 1; from < sortedStrings.length; from++) {
       String s = sortedStrings[from];
       if (s.equals(prev)) {
         if (doMeasurement) {
           savedCharsExact += s.length();
         }
         continue;
       }
       prev = s;
       sortedStrings[to] = s;
       to++;
     }
     return to;  // to is length, is also where next string would be copied to
   }

   private String[] sortStrings(String[] sa) {
     try {
       Arrays.sort(sa);

       // testing
 //      Set<String> orderedSet = new TreeSet<String>();
 //      for (String s : inStrings) {
 //        orderedSet.add(s);
 //      }
 //      Iterator<String> it = orderedSet.iterator();
 //      for (int i = 0; i < sa.length; i++) {
 //        String s = it.next();
 //        if (sa[i] != s) {
 //          throw new RuntimeException(String.format(
 //              "Mismatch, old way sort(i = %,d) = %s, new way is %s", i, s));
 //        }
 //        while ((i < sa.length - 1) && (sa[i + 1].equals(sa[i]))) {
 //         i++;
 //        }
 //      }
       // end of test


       return sa;
     } catch (StackOverflowError e) {
       // see https://issues.apache.org/jira/browse/UIMA-2515
       // debug/test
 //      System.out.println("hit stack overflow");
       Set<String> orderedSet = new TreeSet<String>();
       for (String s : inStrings) {
         if (!orderedSet.add(s)) {
           savedCharsExact += s.length();
         }
       }
       sa = new String[orderedSet.size()]; // has dups removed already
       Iterator<String> it = orderedSet.iterator();
       for (int i = 0; i < sa.length; i++) {
         sa[i] = it.next();
       }
       return sa;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.uima.util.impl;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.HashMap;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	import java.util.TreeSet;

	import org.apache.uima.internal.util.IntVector;

	/**
	* Share common underlying char[] among strings: Optimize sets of strings for
	* efficient storage
	*
	* Apply it to a set of strings.
	*
	* Lifecycle:
	* 1) make an instance of this class
	* 2) call .add (String or String[]) for all strings in the set
	*
	* or - skip 1 and 2 and pass in a String [] that can be modified (sorted]
	*
	* 3) call .optimize()
	*
	* 4) call
	* .getString(String) or .getStringArray(String[]) - returns new string objects
	* .updateStringArray(String[]) to replace original Strings
	* .getOffset(String) - returns int offset in some common string
	* .getCommonStringIndex(String)
	* 5) call getCommonStrings to get all the common strings
	* 6) Let the GC collect the instance of this class
	*
	* The strings are added first to a big list, instead of directly to a
	* stringbuilder in order to improve reuse by sorting for longest strings first
	*
	* The commonStrings are kept in an array. There are more than one to support
	* very large amounts, in excess of 2GB.
	*
	* Nulls, passed in as strings, are mostly ignored, but handled appropriately.
	*
	*
	*/
	public class OptimizeStrings {


	/**
	* splitSize = 100 means
	* the common string can be as large as 100
	* the common string can hold a string of length 100
	*/
	// not final static for testing
	private int splitSize = Integer.MAX_VALUE - 2; // avoid boundary issues

	private ArrayList<String> inStrings = new ArrayList<String>();

	/**
	* A two hop map from strings to offsets is used.
	* "index" (int):
	* The first hop: goes from string to an int index, incrementing 1 per
	* unique shared common string segment
	* "offset" (int):
	* The 2nd hop: goes from the first index to the actual int offset in
	* the particular associated piece of the common strings.
	* The lastIndexInCommonStringA array holds the last index value
	* within each of the common string segments
	*/

	// the value is either
	// an int index into the offset table which has the offset in the common string
	// the intindex in the str aux heap for this string, in the deserialized form
	// These are distinguished by having the aux heap index be negative
	private Map<String, Integer> stringToIndexMap;
	// map from string index its offset in the piece of the common string array
	private int[] offsets;
	private String[] commonStringsA;
	/**
	* holds the last index (counting down in sort order) that is valid for
	* the corresponding common string segment.
	*/
	private int[] lastIndexInCommonStringsA;

	private Map<String, String> returnedStrings = new HashMap<String, String>();

	private long savedCharsExact = 0;
	private long savedCharsSubstr = 0;
	private int nextSeq = -1;

	private final boolean doMeasurement;

	public OptimizeStrings(boolean doMeasurement) {
	this.doMeasurement = doMeasurement;
	}

	// for testing, mainly
	public OptimizeStrings(boolean doMeasurement, int splitSize) {
	this.doMeasurement = doMeasurement;
	this.splitSize = splitSize;
	}

	/**
	* The number of characters saved - for statistical reporting only
	* @return the number of characters saved
	*/
	public long getSavedCharsExact() {
	return savedCharsExact;
	}

	public long getSavedCharsSubstr() {
	return savedCharsSubstr;
	}

	/**
	* The list of common strings
	* @return the list of common strings
	*/
	public String[] getCommonStrings() {
	return commonStringsA;
	}

	/**
	* null strings not added
	* 0 length strings added
	* @param s -
	*/
	public void add(String s) {
	if (inStrings.size() == (Integer.MAX_VALUE -1)) {
	throw new RuntimeException(String.format("Exceeded size limit, size = %,d%n", inStrings.size()));
	}
	if (null != s) {
	inStrings.add(s);
	}
	}

	public void add(String[] sa) {
	if (null != sa) {
	for (String s : sa) {
	add(s);
	}
	}
	}

	public int getStringIndex(String s) {
	if (null == s) {
	return -1;
	}
	return stringToIndexMap.get(s);
	}

	/**
	*
	* @param s must not be null
	* @return a (positive or 0) or negative number.
	* If positive, it is the offset in the common string
	* If negative, -v is the index (starting at 1) that sequentially
	* increases, for each new unique string fetched using this
	* method.
	*/
	public int getIndexOrSeqIndex(String s) {
	if (null == s) {
	throw new RuntimeException();
	}
	int v = stringToIndexMap.get(s);
	// v is 0 for ""
	if (v >= 0) {
	stringToIndexMap.put(s, nextSeq--);
	}
	return v;
	}

	/**
	* return a string which is made as a substring of the common string
	*
	* @param s -
	* @return an equal string, made as substring of common string instance
	* equal results return the same string
	*/
	public String getString(String s) {
	if (null == s) {
	return null;
	}
	int i = getStringIndex(s);
	int offset = getOffset(i);
	String r = commonStringsA[getCommonStringIndex(i)].substring(offset, offset + s.length());
	String rs = returnedStrings.get(r);
	if (null != rs) {
	return rs;
	}
	returnedStrings.put(r, r);
	return r;
	}

	public long getOffset(String s) {
	if (null == s) {
	return -1;
	}
	return offsets[getStringIndex(s)];
	}

	public int getOffset(int i) {
	return offsets[i];
	}

	/**
	* @param index an index (not offset) to the sorted strings,
	* @return the index of the segment it belongs to
	*/
	public int getCommonStringIndex(int index) {
	for (int i = 0; i < lastIndexInCommonStringsA.length; i++) {
	if (index >= lastIndexInCommonStringsA[i]) {
	return i;
	}
	}
	throw new IllegalArgumentException("index out of range, must be >= 0, was " + index);
	}

	public String[] getStringArray(String[] sai) {
	if (null == sai) {
	return null;
	}
	String[] sa = new String[sai.length];
	for (int i = 0; i < sai.length; i++) {
	sa[i] = getString(sai[i]);
	}
	return sa;
	}

	public void updateStringArray(String[] sa) {
	if (null == sa) {
	return;
	}
	for (int i = 0; i < sa.length; i++) {
	sa[i] = getString(sa[i]);
	}
	}

	/**
	* Fully checking indexof for every new string is prohibitively expensive We
	* do a partial check - only checking if a string is a substring of the
	* previous one
	*/
	public void optimize() {
	String[] sa = inStrings.toArray(new String[inStrings.size()]);
	optimizeI(sa);
	inStrings = new ArrayList<String>(); // release space
	}

	private void optimizeI(String[] sortedStrings) {
	savedCharsExact = 0;
	savedCharsSubstr = 0;
	StringBuilder sb = new StringBuilder();

	String[] sortedStrings2 = sortStrings(sortedStrings);

	int ssLength;
	if (sortedStrings2 != sortedStrings) {
	// in this case, duplicates have already been eliminated
	ssLength = sortedStrings2.length;
	sortedStrings = sortedStrings2;
	} else {
	ssLength = eliminateSortedStringDuplicates(sortedStrings);
	}
	// create Offsets table
	// sorted so longer ones follow shorter subsumed string
	String previous = "";
	int previousOffset = 0;
	offsets = new int[ssLength];
	IntVector lastIndexInCommonStrings = new IntVector();
	List<String> commonStrings = new ArrayList<String>();

	for (int i = ssLength - 1; i >= 0; i--) {
	String s = sortedStrings[i];
	int sLength = s.length();
	if (sLength > splitSize) {
	throw new RuntimeException(String.format("String too long, length = %,d, max length allowed is %,d", sLength, splitSize));
	}
	if (previous.startsWith(s)) {
	offsets[i] = previousOffset;
	if (doMeasurement) { // equals case counted in dupl removal
	savedCharsSubstr += sLength;
	}
	} else {
	if (sb.length() + sLength > splitSize) {
	commonStrings.add(sb.toString());
	sb = new StringBuilder();
	lastIndexInCommonStrings.add(i + 1); // previous index because index counting down
	}
	offsets[i] = previousOffset = sb.length();
	sb.append(previous = s);
	}
	}
	commonStrings.add(sb.toString()); // add the last (partial) one
	lastIndexInCommonStrings.add(0); // the last index

	// convert List<Integer> to int[]
	lastIndexInCommonStrings.toArray();
	lastIndexInCommonStringsA = lastIndexInCommonStrings.toArray();

	commonStringsA = commonStrings.toArray(new String[commonStrings.size()]);


	// prepare map from original string object to index in sorted arrays and offsets
	// index also used to find common string segment.
	stringToIndexMap = new HashMap<String, Integer>(ssLength);
	for (int i = ssLength - 1; i >= 0; i--) {
	stringToIndexMap.put(sortedStrings[i], i);
	}
	}

	/**
	* Scan sorted strings, removing duplicates.
	* Copy refs so that final array has no dups up to new length
	* Return new length, but don't trim array
	* @param sortedStrings
	* @return new length
	*/
	private int eliminateSortedStringDuplicates(String[] sortedStrings) {
	if (sortedStrings.length == 0) {
	return 0;
	}
	String prev = sortedStrings[0];
	int to = 1;
	for(int from = 1; from < sortedStrings.length; from++) {
	String s = sortedStrings[from];
	if (s.equals(prev)) {
	if (doMeasurement) {
	savedCharsExact += s.length();
	}
	continue;
	}
	prev = s;
	sortedStrings[to] = s;
	to++;
	}
	return to; // to is length, is also where next string would be copied to
	}

	private String[] sortStrings(String[] sa) {
	try {
	Arrays.sort(sa);

	// testing
	// Set<String> orderedSet = new TreeSet<String>();
	// for (String s : inStrings) {
	// orderedSet.add(s);
	// }
	// Iterator<String> it = orderedSet.iterator();
	// for (int i = 0; i < sa.length; i++) {
	// String s = it.next();
	// if (sa[i] != s) {
	// throw new RuntimeException(String.format(
	// "Mismatch, old way sort(i = %,d) = %s, new way is %s", i, s));
	// }
	// while ((i < sa.length - 1) && (sa[i + 1].equals(sa[i]))) {
	// i++;
	// }
	// }
	// end of test


	return sa;
	} catch (StackOverflowError e) {
	// see https://issues.apache.org/jira/browse/UIMA-2515
	// debug/test
	// System.out.println("hit stack overflow");
	Set<String> orderedSet = new TreeSet<String>();
	for (String s : inStrings) {
	if (!orderedSet.add(s)) {
	savedCharsExact += s.length();
	}
	}
	sa = new String[orderedSet.size()]; // has dups removed already
	Iterator<String> it = orderedSet.iterator();
	for (int i = 0; i < sa.length; i++) {
	sa[i] = it.next();
	}
	return sa;
	}
	}
	}