/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright 2009-2010 Sun Microsystems, Inc. All rights reserved.
*
* The contents of this file are subject to the terms of either the GNU
* General Public License Version 2 only ("GPL") or the Common
* Development and Distribution License("CDDL") (collectively, the
* "License"). You may not use this file except in compliance with the
* License. You can obtain a copy of the License at
* http://www.netbeans.org/cddl-gplv2.html
* or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
* specific language governing permissions and limitations under the
* License. When distributing the software, include this License Header
* Notice in each file and include the License file at
* nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the GPL Version 2 section of the License file that
* accompanied this code. If applicable, add the following below the
* License Header, with the fields enclosed by brackets [] replaced by
* your own identifying information:
* "Portions Copyrighted [year] [name of copyright owner]"
*
* If you wish your version of this file to be governed by only the CDDL
* or only the GPL Version 2, indicate your decision by adding
* "[Contributor] elects to include this software in this distribution
* under the [CDDL or GPL Version 2] license." If you do not indicate a
* single choice of license, a recipient has the option to distribute
* your version of this file under either the CDDL, the GPL Version 2 or
* to extend the choice of license to its licensees as provided above.
* However, if you add GPL Version 2 code and therefore, elected the GPL
* Version 2 license, then the option applies only if the new code is
* made subject to such option by the copyright holder.
*
* Contributor(s):
*
* Portions Copyrighted 2009-2010 Sun Microsystems, Inc.
*/
package org.netbeans.modules.jackpot30.impl.duplicates;
import com.sun.source.tree.CompilationUnitTree;
import com.sun.source.tree.IdentifierTree;
import com.sun.source.tree.NewClassTree;
import com.sun.source.tree.Tree;
import com.sun.source.tree.VariableTree;
import com.sun.source.util.SourcePositions;
import com.sun.source.util.TreePath;
import com.sun.source.util.TreePathScanner;
import com.sun.source.util.Trees;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.security.DigestOutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.lang.model.element.Element;
import javax.lang.model.element.Modifier;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.netbeans.api.annotations.common.CheckForNull;
import org.netbeans.api.annotations.common.NonNull;
import org.netbeans.api.java.classpath.ClassPath;
import org.netbeans.api.java.classpath.GlobalPathRegistry;
import org.netbeans.api.java.source.CompilationInfo;
import org.netbeans.api.progress.ProgressHandle;
import org.netbeans.modules.jackpot30.common.api.LuceneHelpers.BitSetCollector;
import org.netbeans.modules.jackpot30.impl.duplicates.indexing.DuplicatesCustomIndexerImpl;
import org.netbeans.modules.jackpot30.impl.duplicates.indexing.DuplicatesIndex;
import org.netbeans.modules.parsing.impl.indexing.CacheFolder;
import org.openide.filesystems.FileObject;
import org.openide.filesystems.FileUtil;
import org.openide.filesystems.URLMapper;
import org.openide.util.Exceptions;
/**
 * Computes duplicated code snippets across source roots by hashing generalized
 * subtrees of the AST and looking the hashes up in per-root Lucene indices.
 *
 * @author lahvac
 */
public class ComputeDuplicates {
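/**
 * Computes duplicates over the source roots of all projects currently registered
 * in the {@link GlobalPathRegistry}.
 *
 * <p>A minimal usage sketch (the {@code ProgressHandle} and cancellation flag are
 * assumed to be supplied by the caller):
 * <pre>{@code
 * Iterator<? extends ComputeDuplicates.DuplicateDescription> it =
 *         new ComputeDuplicates().computeDuplicatesForAllOpenedProjects(progress, new AtomicBoolean());
 * while (it.hasNext()) {
 *     ComputeDuplicates.DuplicateDescription dd = it.next();
 *     // dd.dupes holds the Spans (file + offsets) of one group of duplicated snippets
 * }
 * }</pre>
 */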
public Iterator<? extends DuplicateDescription> computeDuplicatesForAllOpenedProjects(ProgressHandle progress, AtomicBoolean cancel) throws IOException {
Set<URL> urls = new HashSet<URL>();
for (ClassPath cp : GlobalPathRegistry.getDefault().getPaths(ClassPath.SOURCE)) {
for (ClassPath.Entry e : cp.entries()) {
urls.add(e.getURL());
}
}
long start = System.currentTimeMillis();
try {
return computeDuplicates(urls, progress, cancel);
} finally {
System.err.println("duplicates for all open projects: " + (System.currentTimeMillis() - start));
}
}
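/**
 * Computes duplicates for the given source roots: refreshes the duplicates index of
 * each root, opens a Lucene reader over it, collects the generalized-tree hashes that
 * occur in at least two indexed documents, and returns a lazy iterator over groups of
 * duplicates (at least two occurrences per group), ordered by decreasing snippet value.
 */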
public Iterator<? extends DuplicateDescription> computeDuplicates(Set<URL> forURLs, ProgressHandle progress, AtomicBoolean cancel) throws IOException {
Map<IndexReader, FileObject> readers2Roots = new LinkedHashMap<IndexReader, FileObject>();
progress.progress("Updating indices");
for (URL u : forURLs) {
try {
//TODO: needs to be removed for server mode
new DuplicatesCustomIndexerImpl.FactoryImpl().updateIndex(u, cancel); //TODO: show updating progress to the user
File cacheRoot = cacheRoot(u);
File dir = new File(cacheRoot, DuplicatesIndex.NAME);
File[] children = dir.listFiles();
if (children != null && children.length > 0) {
IndexReader reader = IndexReader.open(FSDirectory.open(dir), true);
readers2Roots.put(reader, URLMapper.findFileObject(u));
}
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
progress.progress("Searching for duplicates");
MultiReader r = new MultiReader(readers2Roots.keySet().toArray(new IndexReader[0]));
List<String> dd = new ArrayList<String>(getDuplicatedValues(r, "duplicatesGeneralized", cancel));
sortHashes(dd);
//TODO: only show valuable duplicates?:
// dd = dd.subList(0, dd.size() / 10 + 1);
return new DuplicatesIterator(readers2Roots, dd, 2);
}
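/**
 * Looks up duplicates for a caller-supplied set of hashes over already opened index
 * readers; unlike {@link #computeDuplicates}, a single occurrence per hash is enough.
 */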
public static Iterator<? extends DuplicateDescription> XXXduplicatesOf(Map<IndexReader, FileObject> readers2Roots, Collection<String> hashes) {
List<String> hashesList = new ArrayList<String>(hashes);
sortHashes(hashesList);
return new DuplicatesIterator(readers2Roots, hashesList, 1);
}
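/** Resolves the on-disk cache folder holding the duplicates index for the given source root; may return null if the index does not exist yet. */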
private static File cacheRoot(URL sourceRoot) throws IOException {
FileObject dataFolder = CacheFolder.getDataFolder(sourceRoot);
FileObject cacheFO = dataFolder.getFileObject(DuplicatesIndex.NAME + "/" + DuplicatesIndex.VERSION);
File cache = cacheFO != null ? FileUtil.toFile(cacheFO) : null;
return cache;
}
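/**
 * Lazily walks the candidate hashes (sorted by decreasing value) and materializes one
 * {@link DuplicateDescription} at a time, dropping candidates that are subsumed by an
 * already reported description.
 */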
private static final class DuplicatesIterator implements Iterator<DuplicateDescription> {
private final Map<IndexReader, FileObject> readers2Roots;
private final Iterator<String> duplicateCandidates;
private final int minDuplicates;
private final List<DuplicateDescription> result = new LinkedList<DuplicateDescription>();
public DuplicatesIterator(Map<IndexReader, FileObject> readers2Roots, Iterable<String> duplicateCandidates, int minDuplicates) {
this.readers2Roots = readers2Roots;
this.duplicateCandidates = duplicateCandidates.iterator();
this.minDuplicates = minDuplicates;
}
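/**
 * Finds the next group of duplicates: runs a term query for the candidate hash against
 * every index, maps the matching documents back to source {@link Span}s, and accepts the
 * group only if it has at least {@code minDuplicates} members and is not subsumed by a
 * previously returned group.
 */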
private DuplicateDescription nextDescription() throws IOException {
while (duplicateCandidates.hasNext()) {
String longest = duplicateCandidates.next();
List<Span> foundDuplicates = new LinkedList<Span>();
Query query = new TermQuery(new Term("duplicatesGeneralized", longest));
for (Entry<IndexReader, FileObject> e : readers2Roots.entrySet()) {
Searcher s = new IndexSearcher(e.getKey());
BitSet matchingDocuments = new BitSet(e.getKey().maxDoc());
Collector c = new BitSetCollector(matchingDocuments);
s.search(query, c);
for (int docNum = matchingDocuments.nextSetBit(0); docNum >= 0; docNum = matchingDocuments.nextSetBit(docNum + 1)) {
final Document doc = e.getKey().document(docNum);
int pos = Arrays.binarySearch(doc.getValues("duplicatesGeneralized"), longest);
if (pos < 0) {
continue;
}
String spanSpec = doc.getValues("duplicatesPositions")[pos];
String relPath = doc.getField("duplicatesPath").stringValue();
for (String spanPart : spanSpec.split(";")) {
Span span = Span.of(e.getValue().getFileObject(relPath), spanPart);
if (span != null) {
foundDuplicates.add(span);
}
}
}
}
if (foundDuplicates.size() >= minDuplicates) {
DuplicateDescription current = DuplicateDescription.of(foundDuplicates, getValue(longest), longest);
boolean add = true;
for (Iterator<DuplicateDescription> it = result.iterator(); it.hasNext();) {
DuplicateDescription existing = it.next();
if (subsumes(existing, current)) {
add = false;
break;
}
if (subsumes(current, existing)) {
//can happen? (note that the duplicates are sorted by value)
it.remove();
}
}
if (add) {
result.add(current);
return current;
}
}
}
return null;
}
private DuplicateDescription next;
public boolean hasNext() {
if (next == null) {
try {
next = nextDescription();
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
return next != null;
}
public DuplicateDescription next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
DuplicateDescription r = next;
next = null;
return r;
}
public void remove() {
throw new UnsupportedOperationException("Not supported.");
}
}
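/** Enumerates the terms of the given field and returns the values that occur in at least two documents. */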
private static List<String> getDuplicatedValues(IndexReader ir, String field, AtomicBoolean cancel) throws IOException {
List<String> values = new ArrayList<String>();
TermEnum terms = ir.terms( new Term(field));
do {
if (cancel.get()) return Collections.emptyList();
final Term term = terms.term();
//term may be null when the enumeration is exhausted; stop there and at the first term of another field
if (term == null || !field.equals(term.field())) {
break;
}
if (terms.docFreq() < 2) continue;
values.add(term.text());
}
while (terms.next());
return values;
}
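/** Hashes have the form {@code "<8 hex digits>:<value>"}; extracts the numeric value after the last colon. */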
private static long getValue(String encoded) {
return Long.parseLong(encoded.substring(encoded.lastIndexOf(":") + 1));
}
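/** Sorts the hashes in place by decreasing value, so the most valuable (largest) snippets come first. */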
private static void sortHashes(List<String> hashes) {
Collections.sort(hashes, new Comparator<String>() {
public int compare(String arg0, String arg1) {
return (int) Math.signum(getValue(arg1) - getValue(arg0));
}
});
}
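/**
 * Returns true when both descriptions cover exactly the same set of files and the first
 * span of {@code bigger} properly contains one of {@code smaller}'s spans in that file.
 */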
private static boolean subsumes(DuplicateDescription bigger, DuplicateDescription smaller) {
Set<FileObject> bFiles = new HashSet<FileObject>();
for (Span s : bigger.dupes) {
bFiles.add(s.file);
}
Set<FileObject> sFiles = new HashSet<FileObject>();
for (Span s : smaller.dupes) {
sFiles.add(s.file);
}
if (!bFiles.equals(sFiles)) return false;
Span testAgainst = bigger.dupes.get(0);
for (Span s : smaller.dupes) {
if (s.file == testAgainst.file) {
if ( (testAgainst.startOff <= s.startOff && testAgainst.endOff > s.endOff)
|| (testAgainst.startOff < s.startOff && testAgainst.endOff >= s.endOff)) {
return true;
}
}
}
return false;
}
public static Map<String, long[]> encodeGeneralized(CompilationInfo info) {
return encodeGeneralized(info.getTrees(), info.getCompilationUnit());
}
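/**
 * Walks the compilation unit and, for every subtree whose generalized form reaches
 * {@link #MINIMAL_VALUE}, records a hash (the first four bytes of the MD5 digest of the
 * generalized text, in hex, followed by ':' and the subtree's value) mapped to the
 * subtree's start and end positions.
 */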
public static Map<String, long[]> encodeGeneralized(final Trees trees, final CompilationUnitTree cut) {
final SourcePositions sp = trees.getSourcePositions();
final Map<String, Collection<Long>> positions = new HashMap<String, Collection<Long>>();
new TreePathScanner<Void, Void>() {
@Override
public Void scan(Tree tree, Void p) {
if (tree == null) return null;
if (getCurrentPath() != null) {
DigestOutputStream baos = null;
PrintWriter out = null;
try {
baos = new DigestOutputStream(new ByteArrayOutputStream(), MessageDigest.getInstance("MD5"));
out = new PrintWriter(new OutputStreamWriter(baos, "UTF-8"));
GeneralizePattern gen = new GeneralizePattern(out, trees);
gen.scan(new TreePath(getCurrentPath(), tree), null);
out.close();
if (gen.value >= MINIMAL_VALUE) {
StringBuilder text = new StringBuilder();
byte[] bytes = baos.getMessageDigest().digest();
for (int cntr = 0; cntr < 4; cntr++) {
text.append(String.format("%02X", bytes[cntr]));
}
text.append(':').append(gen.value);
String enc = text.toString();
Collection<Long> spanSpecs = positions.get(enc);
if (spanSpecs == null) {
positions.put(enc, spanSpecs = new LinkedList<Long>());
// } else {
// spanSpecs.append(";");
}
long start = sp.getStartPosition(cut, tree);
// spanSpecs.append(start).append(":").append(sp.getEndPosition(cut, tree) - start);
spanSpecs.add(start);
spanSpecs.add(sp.getEndPosition(cut, tree));
}
} catch (UnsupportedEncodingException ex) {
Exceptions.printStackTrace(ex);
} catch (NoSuchAlgorithmException ex) {
Exceptions.printStackTrace(ex);
} finally {
//guard against NPEs when construction of the streams failed above
if (baos != null) {
try {
baos.close();
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
}
if (out != null) {
out.close();
}
}
}
return super.scan(tree, p);
}
}.scan(cut, null);
Map<String, long[]> result = new TreeMap<String, long[]>();
for (Entry<String, Collection<Long>> e : positions.entrySet()) {
long[] spans = new long[e.getValue().size()];
int idx = 0;
for (Long l : e.getValue()) {
spans[idx++] = l;
}
result.put(e.getKey(), spans);
}
return result;
}
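/**
 * Writes a normalized textual form of a tree: the kind of every node plus identifier
 * names, with private, local, parameter and exception-parameter elements replaced by
 * positional placeholders ($0, $1, ...). {@code value} approximates the number of
 * nodes covered and is used as a size threshold when indexing.
 */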
private static final class GeneralizePattern extends TreePathScanner<Void, Void> {
public final Map<Tree, Tree> tree2Variable = new HashMap<Tree, Tree>();
private final Map<Element, String> element2Variable = new HashMap<Element, String>();
private final PrintWriter to;
private final Trees javacTrees;
private long value;
private int currentVariableIndex = 0;
public GeneralizePattern(PrintWriter to, Trees javacTrees) {
this.to = to;
this.javacTrees = javacTrees;
}
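/** Returns a stable placeholder ("$0", "$1", ...) for the given element, assigned in order of first occurrence. */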
private @NonNull String getVariable(@NonNull Element el) {
String var = element2Variable.get(el);
if (var == null) {
element2Variable.put(el, var = "$" + currentVariableIndex++);
}
return var;
}
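/** Elements that are private, or are local variables, parameters or exception parameters, are replaced by placeholders. */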
private boolean shouldBeGeneralized(@NonNull Element el) {
if (el.getModifiers().contains(Modifier.PRIVATE)) {
return true;
}
switch (el.getKind()) {
case LOCAL_VARIABLE:
case EXCEPTION_PARAMETER:
case PARAMETER:
return true;
}
return false;
}
@Override
public Void scan(Tree tree, Void p) {
if (tree != null) {
to.append(tree.getKind().name());
value++;
}
return super.scan(tree, p);
}
@Override
public Void visitIdentifier(IdentifierTree node, Void p) {
Element e = javacTrees.getElement(getCurrentPath());
if (e != null && shouldBeGeneralized(e)) {
to.append(getVariable(e));
value--;
return null;
} else {
to.append(node.getName());
}
return super.visitIdentifier(node, p);
}
@Override
public Void visitVariable(VariableTree node, Void p) {
Element e = javacTrees.getElement(getCurrentPath());
if (e != null && shouldBeGeneralized(e)) {
to.append(getVariable(e));
} else {
to.append(node.getName());
}
return super.visitVariable(node, p);
}
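/** Object instantiations are not descended into: their arguments and anonymous class bodies do not contribute to the generalized text. */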
@Override
public Void visitNewClass(NewClassTree node, Void p) {
return null;
}
}
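/** Minimal generalized-tree value a subtree must reach to be hashed and indexed. */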
private static final int MINIMAL_VALUE = 10;
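/** One group of duplicated snippets: the spans where the snippet occurs, its value, and the hash it was found under. */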
public static final class DuplicateDescription {
public final List<Span> dupes;
public final long value;
public final String hash;
private DuplicateDescription(List<Span> dupes, long value, String hash) {
this.dupes = dupes;
this.value = value;
this.hash = hash;
}
public static DuplicateDescription of(List<Span> dupes, long value, String hash) {
return new DuplicateDescription(dupes, value, hash);
}
}
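/** A file plus start/end character offsets of one occurrence of a duplicated snippet. */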
public static final class Span {
public final FileObject file;
public final int startOff;
public final int endOff;
public Span(FileObject file, int startOff, int endOff) {
this.file = file;
this.startOff = startOff;
this.endOff = endOff;
}
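/** Parses a span specification of the form {@code "<start>:<length>"}; returns null for negative offsets. */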
public static @CheckForNull Span of(FileObject file, String spanSpec) {
String[] split = spanSpec.split(":");
int start = Integer.valueOf(split[0]);
int end = start + Integer.valueOf(split[1]);
if (start < 0 || end < 0) return null; //XXX
return new Span(file, start, end);
}
}
}