/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright 2009-2010 Sun Microsystems, Inc. All rights reserved.
*
* The contents of this file are subject to the terms of either the GNU
* General Public License Version 2 only ("GPL") or the Common
* Development and Distribution License("CDDL") (collectively, the
* "License"). You may not use this file except in compliance with the
* License. You can obtain a copy of the License at
* http://www.netbeans.org/cddl-gplv2.html
* or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
* specific language governing permissions and limitations under the
* License. When distributing the software, include this License Header
* Notice in each file and include the License file at
* nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the GPL Version 2 section of the License file that
* accompanied this code. If applicable, add the following below the
* License Header, with the fields enclosed by brackets [] replaced by
* your own identifying information:
* "Portions Copyrighted [year] [name of copyright owner]"
*
* If you wish your version of this file to be governed by only the CDDL
* or only the GPL Version 2, indicate your decision by adding
* "[Contributor] elects to include this software in this distribution
* under the [CDDL or GPL Version 2] license." If you do not indicate a
* single choice of license, a recipient has the option to distribute
* your version of this file under either the CDDL, the GPL Version 2 or
* to extend the choice of license to its licensees as provided above.
* However, if you add GPL Version 2 code and therefore, elected the GPL
* Version 2 license, then the option applies only if the new code is
* made subject to such option by the copyright holder.
*
* Contributor(s):
*
* Portions Copyrighted 2009-2010 Sun Microsystems, Inc.
*/
package org.netbeans.modules.jackpot30.impl.duplicates;
import com.sun.source.tree.CompilationUnitTree;
import com.sun.source.tree.IdentifierTree;
import com.sun.source.tree.NewClassTree;
import com.sun.source.tree.Tree;
import com.sun.source.tree.VariableTree;
import com.sun.source.util.SourcePositions;
import com.sun.source.util.TreePath;
import com.sun.source.util.TreePathScanner;
import com.sun.source.util.Trees;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.security.DigestOutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.lang.model.element.Element;
import javax.lang.model.element.Modifier;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.netbeans.api.annotations.common.CheckForNull;
import org.netbeans.api.annotations.common.NonNull;
import org.netbeans.api.java.classpath.ClassPath;
import org.netbeans.api.java.classpath.GlobalPathRegistry;
import org.netbeans.api.java.source.CompilationInfo;
import org.netbeans.api.progress.ProgressHandle;
import org.netbeans.modules.jackpot30.common.api.LuceneHelpers.BitSetCollector;
import org.netbeans.modules.jackpot30.impl.duplicates.indexing.DuplicatesCustomIndexerImpl;
import org.netbeans.modules.jackpot30.impl.duplicates.indexing.DuplicatesIndex;
import org.netbeans.modules.parsing.impl.indexing.CacheFolder;
import org.openide.filesystems.FileObject;
import org.openide.filesystems.FileUtil;
import org.openide.filesystems.URLMapper;
import org.openide.util.Exceptions;
/**
 * Computes duplicated code snippets across source roots by hashing generalized
 * subtrees of the AST and looking the hashes up in per-root Lucene indices.
 *
 * @author lahvac
 */
public class ComputeDuplicates {
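/**
 * Computes duplicates over the source roots of all projects currently registered
 * in the {@link GlobalPathRegistry}.
 *
 * <p>A minimal usage sketch (the {@code ProgressHandle} and cancellation flag are
 * assumed to be supplied by the caller):
 * <pre>{@code
 * Iterator<? extends ComputeDuplicates.DuplicateDescription> it =
 *         new ComputeDuplicates().computeDuplicatesForAllOpenedProjects(progress, new AtomicBoolean());
 * while (it.hasNext()) {
 *     ComputeDuplicates.DuplicateDescription dd = it.next();
 *     // dd.dupes holds the Spans (file + offsets) of one group of duplicated snippets
 * }
 * }</pre>
 */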
public Iterator<? extends DuplicateDescription> computeDuplicatesForAllOpenedProjects(ProgressHandle progress, AtomicBoolean cancel) throws IOException {
Set<URL> urls = new HashSet<URL>();
for (ClassPath cp : GlobalPathRegistry.getDefault().getPaths(ClassPath.SOURCE)) {
for (ClassPath.Entry e : cp.entries()) {
urls.add(e.getURL());
}
}
long start = System.currentTimeMillis();
try {
return computeDuplicates(urls, progress, cancel);
} finally {
System.err.println("duplicates for all open projects: " + (System.currentTimeMillis() - start));
}
}
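/**
 * Computes duplicates for the given source roots: refreshes the duplicates index of
 * each root, opens a Lucene reader over it, collects the generalized-tree hashes that
 * occur in at least two indexed documents, and returns a lazy iterator over groups of
 * duplicates (at least two occurrences per group), ordered by decreasing snippet value.
 */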
public Iterator<? extends DuplicateDescription> computeDuplicates(Set<URL> forURLs, ProgressHandle progress, AtomicBoolean cancel) throws IOException {
Map<IndexReader, FileObject> readers2Roots = new LinkedHashMap<IndexReader, FileObject>();
progress.progress("Updating indices");
for (URL u : forURLs) {
try {
//TODO: needs to be removed for server mode
new DuplicatesCustomIndexerImpl.FactoryImpl().updateIndex(u, cancel); //TODO: show updating progress to the user
File cacheRoot = cacheRoot(u);
File dir = new File(cacheRoot, DuplicatesIndex.NAME);
File[] children = dir.listFiles();
if (children != null && children.length > 0) {
IndexReader reader = IndexReader.open(FSDirectory.open(dir), true);
readers2Roots.put(reader, URLMapper.findFileObject(u));
}
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
progress.progress("Searching for duplicates");
MultiReader r = new MultiReader(readers2Roots.keySet().toArray(new IndexReader[0]));
List<String> dd = new ArrayList<String>(getDuplicatedValues(r, "duplicatesGeneralized", cancel));
sortHashes(dd);
//TODO: only show valuable duplicates?:
// dd = dd.subList(0, dd.size() / 10 + 1);
return new DuplicatesIterator(readers2Roots, dd, 2);
}
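/**
 * Looks up duplicates for a caller-supplied set of hashes over already opened index
 * readers; unlike {@link #computeDuplicates}, a single occurrence per hash is enough.
 */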
public static Iterator<? extends DuplicateDescription> XXXduplicatesOf(Map<IndexReader, FileObject> readers2Roots, Collection<String> hashes) {
List<String> hashesList = new ArrayList<String>(hashes);
sortHashes(hashesList);
return new DuplicatesIterator(readers2Roots, hashesList, 1);
}
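/** Resolves the on-disk cache folder holding the duplicates index for the given source root; may return null if the index does not exist yet. */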
private static File cacheRoot(URL sourceRoot) throws IOException {
FileObject dataFolder = CacheFolder.getDataFolder(sourceRoot);
FileObject cacheFO = dataFolder.getFileObject(DuplicatesIndex.NAME + "/" + DuplicatesIndex.VERSION);
File cache = cacheFO != null ? FileUtil.toFile(cacheFO) : null;
return cache;
}
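/**
 * Lazily walks the candidate hashes (sorted by decreasing value) and materializes one
 * {@link DuplicateDescription} at a time, dropping candidates that are subsumed by an
 * already reported description.
 */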
private static final class DuplicatesIterator implements Iterator<DuplicateDescription> {
private final Map<IndexReader, FileObject> readers2Roots;
private final Iterator<String> duplicateCandidates;
private final int minDuplicates;
private final List<DuplicateDescription> result = new LinkedList<DuplicateDescription>();
public DuplicatesIterator(Map<IndexReader, FileObject> readers2Roots, Iterable<String> duplicateCandidates, int minDuplicates) {
this.readers2Roots = readers2Roots;
this.duplicateCandidates = duplicateCandidates.iterator();
this.minDuplicates = minDuplicates;
}
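/**
 * Finds the next group of duplicates: runs a term query for the candidate hash against
 * every index, maps the matching documents back to source {@link Span}s, and accepts the
 * group only if it has at least {@code minDuplicates} members and is not subsumed by a
 * previously returned group.
 */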
private DuplicateDescription nextDescription() throws IOException {
while (duplicateCandidates.hasNext()) {
String longest = duplicateCandidates.next();
List<Span> foundDuplicates = new LinkedList<Span>();
Query query = new TermQuery(new Term("duplicatesGeneralized", longest));
for (Entry<IndexReader, FileObject> e : readers2Roots.entrySet()) {
Searcher s = new IndexSearcher(e.getKey());
BitSet matchingDocuments = new BitSet(e.getKey().maxDoc());
Collector c = new BitSetCollector(matchingDocuments);
s.search(query, c);
for (int docNum = matchingDocuments.nextSetBit(0); docNum >= 0; docNum = matchingDocuments.nextSetBit(docNum + 1)) {
final Document doc = e.getKey().document(docNum);
int pos = Arrays.binarySearch(doc.getValues("duplicatesGeneralized"), longest);
if (pos < 0) {
continue;
}
String spanSpec = doc.getValues("duplicatesPositions")[pos];
String relPath = doc.getField("duplicatesPath").stringValue();
for (String spanPart : spanSpec.split(";")) {
Span span = Span.of(e.getValue().getFileObject(relPath), spanPart);
if (span != null) {
foundDuplicates.add(span);
}
}
}
}
if (foundDuplicates.size() >= minDuplicates) {
DuplicateDescription current = DuplicateDescription.of(foundDuplicates, getValue(longest), longest);
boolean add = true;
for (Iterator<DuplicateDescription> it = result.iterator(); it.hasNext();) {
DuplicateDescription existing = it.next();
if (subsumes(existing, current)) {
add = false;
break;
}
if (subsumes(current, existing)) {
//can happen? (note that the duplicates are sorted by value)
it.remove();
}
}
if (add) {
result.add(current);
return current;
}
}
}
return null;
}
private DuplicateDescription next;
public boolean hasNext() {
if (next == null) {
try {
next = nextDescription();
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
return next != null;
}
public DuplicateDescription next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
DuplicateDescription r = next;
next = null;
return r;
}
public void remove() {
throw new UnsupportedOperationException("Not supported.");
}
}
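/** Enumerates the terms of the given field and returns the values that occur in at least two documents. */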
private static List<String> getDuplicatedValues(IndexReader ir, String field, AtomicBoolean cancel) throws IOException {
List<String> values = new ArrayList<String>();
TermEnum terms = ir.terms( new Term(field));
do {
if (cancel.get()) return Collections.emptyList();
final Term term = terms.term();
//term may be null when the enumeration is exhausted; stop there and at the first term of another field
if (term == null || !field.equals(term.field())) {
break;
}
if (terms.docFreq() < 2) continue;
values.add(term.text());
}
while (terms.next());
return values;
}
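/** Hashes have the form {@code "<8 hex digits>:<value>"}; extracts the numeric value after the last colon. */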
private static long getValue(String encoded) {
return Long.parseLong(encoded.substring(encoded.lastIndexOf(":") + 1));
}
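/** Sorts the hashes in place by decreasing value, so the most valuable (largest) snippets come first. */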
private static void sortHashes(List<String> hashes) {
Collections.sort(hashes, new Comparator<String>() {
public int compare(String arg0, String arg1) {
return (int) Math.signum(getValue(arg1) - getValue(arg0));
}
});
}
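/**
 * Returns true when both descriptions cover exactly the same set of files and the first
 * span of {@code bigger} properly contains one of {@code smaller}'s spans in that file.
 */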
private static boolean subsumes(DuplicateDescription bigger, DuplicateDescription smaller) {
Set<FileObject> bFiles = new HashSet<FileObject>();
for (Span s : bigger.dupes) {
bFiles.add(s.file);
}
Set<FileObject> sFiles = new HashSet<FileObject>();
for (Span s : smaller.dupes) {
sFiles.add(s.file);
}
if (!bFiles.equals(sFiles)) return false;
Span testAgainst = bigger.dupes.get(0);
for (Span s : smaller.dupes) {
if (s.file == testAgainst.file) {
if ( (testAgainst.startOff <= s.startOff && testAgainst.endOff > s.endOff)
|| (testAgainst.startOff < s.startOff && testAgainst.endOff >= s.endOff)) {
return true;
}
}
}
return false;
}
public static Map<String, long[]> encodeGeneralized(CompilationInfo info) {
return encodeGeneralized(info.getTrees(), info.getCompilationUnit());
}
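/**
 * Walks the compilation unit and, for every subtree whose generalized form reaches
 * {@link #MINIMAL_VALUE}, records a hash (the first four bytes of the MD5 digest of the
 * generalized text, in hex, followed by ':' and the subtree's value) mapped to the
 * subtree's start and end positions.
 */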
public static Map<String, long[]> encodeGeneralized(final Trees trees, final CompilationUnitTree cut) {
final SourcePositions sp = trees.getSourcePositions();
final Map<String, Collection<Long>> positions = new HashMap<String, Collection<Long>>();
new TreePathScanner<Void, Void>() {
@Override
public Void scan(Tree tree, Void p) {
if (tree == null) return null;
if (getCurrentPath() != null) {
DigestOutputStream baos = null;
PrintWriter out = null;
try {
baos = new DigestOutputStream(new ByteArrayOutputStream(), MessageDigest.getInstance("MD5"));
out = new PrintWriter(new OutputStreamWriter(baos, "UTF-8"));
GeneralizePattern gen = new GeneralizePattern(out, trees);
gen.scan(new TreePath(getCurrentPath(), tree), null);
out.close();
if (gen.value >= MINIMAL_VALUE) {
StringBuilder text = new StringBuilder();
byte[] bytes = baos.getMessageDigest().digest();
for (int cntr = 0; cntr < 4; cntr++) {
text.append(String.format("%02X", bytes[cntr]));
}
text.append(':').append(gen.value);
String enc = text.toString();
Collection<Long> spanSpecs = positions.get(enc);
if (spanSpecs == null) {
positions.put(enc, spanSpecs = new LinkedList<Long>());
// } else {
// spanSpecs.append(";");
}
long start = sp.getStartPosition(cut, tree);
// spanSpecs.append(start).append(":").append(sp.getEndPosition(cut, tree) - start);
spanSpecs.add(start);
spanSpecs.add(sp.getEndPosition(cut, tree));
}
} catch (UnsupportedEncodingException ex) {
Exceptions.printStackTrace(ex);
} catch (NoSuchAlgorithmException ex) {
Exceptions.printStackTrace(ex);
} finally {
//guard against NPEs when construction of the streams failed above
if (baos != null) {
try {
baos.close();
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
if (out != null) {
out.close();
}
}
}
return super.scan(tree, p);
}
}.scan(cut, null);
Map<String, long[]> result = new TreeMap<String, long[]>();
for (Entry<String, Collection<Long>> e : positions.entrySet()) {
long[] spans = new long[e.getValue().size()];
int idx = 0;
for (Long l : e.getValue()) {
spans[idx++] = l;
}
result.put(e.getKey(), spans);
}
return result;
}
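/**
 * Writes a normalized textual form of a tree: the kind of every node plus identifier
 * names, with private, local, parameter and exception-parameter elements replaced by
 * positional placeholders ($0, $1, ...). {@code value} approximates the number of
 * nodes covered and is used as a size threshold when indexing.
 */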
private static final class GeneralizePattern extends TreePathScanner<Void, Void> {
public final Map<Tree, Tree> tree2Variable = new HashMap<Tree, Tree>();
private final Map<Element, String> element2Variable = new HashMap<Element, String>();
private final PrintWriter to;
private final Trees javacTrees;
private long value;
private int currentVariableIndex = 0;
public GeneralizePattern(PrintWriter to, Trees javacTrees) {
this.to = to;
this.javacTrees = javacTrees;
}
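/** Returns a stable placeholder ("$0", "$1", ...) for the given element, assigned in order of first occurrence. */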
private @NonNull String getVariable(@NonNull Element el) {
String var = element2Variable.get(el);
if (var == null) {
element2Variable.put(el, var = "$" + currentVariableIndex++);
}
return var;
}
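/** Elements that are private, or are local variables, parameters or exception parameters, are replaced by placeholders. */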
private boolean shouldBeGeneralized(@NonNull Element el) {
if (el.getModifiers().contains(Modifier.PRIVATE)) {
return true;
}
switch (el.getKind()) {
case LOCAL_VARIABLE:
case EXCEPTION_PARAMETER:
case PARAMETER:
return true;
}
return false;
}
@Override
public Void scan(Tree tree, Void p) {
if (tree != null) {
to.append(tree.getKind().name());
value++;
}
return super.scan(tree, p);
}
@Override
public Void visitIdentifier(IdentifierTree node, Void p) {
Element e = javacTrees.getElement(getCurrentPath());
if (e != null && shouldBeGeneralized(e)) {
to.append(getVariable(e));
value--;
return null;
} else {
to.append(node.getName());
}
return super.visitIdentifier(node, p);
}
@Override
public Void visitVariable(VariableTree node, Void p) {
Element e = javacTrees.getElement(getCurrentPath());
if (e != null && shouldBeGeneralized(e)) {
to.append(getVariable(e));
} else {
to.append(node.getName());
}
return super.visitVariable(node, p);
}
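/** Object instantiations are not descended into: their arguments and anonymous class bodies do not contribute to the generalized text. */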
@Override
public Void visitNewClass(NewClassTree node, Void p) {
return null;
}
}
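/** Minimal generalized-tree value a subtree must reach to be hashed and indexed. */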
private static final int MINIMAL_VALUE = 10;
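/** One group of duplicated snippets: the spans where the snippet occurs, its value, and the hash it was found under. */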
public static final class DuplicateDescription {
public final List<Span> dupes;
public final long value;
public final String hash;
private DuplicateDescription(List<Span> dupes, long value, String hash) {
this.dupes = dupes;
this.value = value;
this.hash = hash;
}
public static DuplicateDescription of(List<Span> dupes, long value, String hash) {
return new DuplicateDescription(dupes, value, hash);
}
}
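/** A file plus start/end character offsets of one occurrence of a duplicated snippet. */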
public static final class Span {
public final FileObject file;
public final int startOff;
public final int endOff;
public Span(FileObject file, int startOff, int endOff) {
this.file = file;
this.startOff = startOff;
this.endOff = endOff;
}
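/** Parses a span specification of the form {@code "<start>:<length>"}; returns null for negative offsets. */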
public static @CheckForNull Span of(FileObject file, String spanSpec) {
String[] split = spanSpec.split(":");
int start = Integer.valueOf(split[0]);
int end = start + Integer.valueOf(split[1]);
if (start < 0 || end < 0) return null; //XXX
return new Span(file, start, end);
}
}
}