blob: 5f5010829f8593a8e8558908c92b74def757f7d6 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.asterix.fuzzyjoin;
/**
 * Length, prefix and position filters derived from a Jaccard similarity
 * threshold.
 * <p>
 * All bounds are computed from the threshold passed to the constructor
 * ({@code simThr}) and from the precomputed expression
 * {@code simThr / (1 + simThr)} ({@code simThrExpr}). Every method comes in
 * an {@code int} flavor and, except for {@link #getIndexPrefixLength(int)},
 * in a {@code long} flavor that applies the same formula.
 */
public class FuzzyFiltersJaccard {
    /**
     * Similarity threshold. The type is double because with float
     * .8 / (1 + .8) * (8 + 10) = 8.0...01, which would be rounded up to 9 by
     * the {@code Math.ceil} calls below.
     */
    protected final double simThr;

    /** Precomputed {@code simThr / (1 + simThr)}. */
    protected final double simThrExpr;

    /**
     * @param similarityThreshold
     *            the similarity threshold all bounds are derived from; the
     *            value is stored as given, no range check is performed
     */
    public FuzzyFiltersJaccard(double similarityThreshold) {
        simThr = similarityThreshold;
        simThrExpr = simThr / (1 + simThr);
    }

    /**
     * @param length
     *            number of tokens in the set
     * @return {@code length - ceil(2 * simThrExpr * length) + 1}
     */
    public int getIndexPrefixLength(int length) {
        return length - (int) Math.ceil(2 * simThrExpr * length) + 1;
    }

    /**
     * @param lengthX
     *            number of tokens in X
     * @param lengthY
     *            number of tokens in Y
     * @return {@code ceil(simThrExpr * (lengthX + lengthY))}
     */
    public int getIntersectLowerBound(int lengthX, int lengthY) {
        // Add in long arithmetic: an int sum wraps around to a negative value
        // once lengthX + lengthY exceeds Integer.MAX_VALUE.
        final long totalLength = (long) lengthX + lengthY;
        return (int) Math.ceil(simThrExpr * totalLength);
    }

    /**
     * Same formula as {@link #getIntersectLowerBound(int, int)} for
     * {@code long} lengths. The sum {@code lengthX + lengthY} is computed in
     * {@code long} arithmetic.
     *
     * @param lengthX
     *            number of tokens in X
     * @param lengthY
     *            number of tokens in Y
     * @return {@code ceil(simThrExpr * (lengthX + lengthY))}
     */
    public long getIntersectLowerBound(long lengthX, long lengthY) {
        return (long) Math.ceil(simThrExpr * (lengthX + lengthY));
    }

    /**
     * @param noGramsCommon
     *            number of grams in common
     * @param positionX
     *            position of the last gram in common on X
     * @param positionY
     *            position of the last gram in common on Y
     * @param lengthX
     *            number of grams in X
     * @param lengthY
     *            number of grams in Y
     * @return {@code noGramsCommon} plus the smaller of the two remainders
     *         {@code lengthX - positionX - 1} and
     *         {@code lengthY - positionY - 1}
     */
    public int getIntersectUpperBound(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
        return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
    }

    /**
     * Same formula as
     * {@link #getIntersectUpperBound(int, int, int, int, int)} for
     * {@code long} positions and lengths.
     *
     * @param noGramsCommon
     *            number of grams in common
     * @param positionX
     *            position of the last gram in common on X
     * @param positionY
     *            position of the last gram in common on Y
     * @param lengthX
     *            number of grams in X
     * @param lengthY
     *            number of grams in Y
     * @return {@code noGramsCommon} plus the smaller of the two remainders
     *         {@code lengthX - positionX - 1} and
     *         {@code lengthY - positionY - 1}
     */
    public long getIntersectUpperBound(int noGramsCommon, long positionX, long positionY, long lengthX, long lengthY) {
        return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
    }

    /**
     * @param length
     *            number of tokens in the set
     * @return {@code ceil(simThr * length)}
     */
    public int getLengthLowerBound(int length) {
        return (int) Math.ceil(simThr * length);
    }

    /**
     * @param length
     *            number of tokens in the set
     * @return {@code ceil(simThr * length)}
     */
    public long getLengthLowerBound(long length) {
        return (long) Math.ceil(simThr * length);
    }

    /**
     * @param length
     *            number of tokens in the set
     * @return {@code length - ceil(simThr * length) + 1}
     */
    public int getPrefixLength(int length) {
        return length - (int) Math.ceil(simThr * length) + 1;
    }

    /**
     * @param length
     *            number of tokens in the set
     * @return {@code length - ceil(simThr * length) + 1}
     */
    public long getPrefixLength(long length) {
        return length - (long) Math.ceil(simThr * length) + 1;
    }

    /**
     * @return the similarity threshold passed to the constructor
     */
    public double getSimilarityThreshold() {
        return simThr;
    }

    /**
     * Checks whether {@code lengthY} lies within the length window allowed
     * for {@code lengthX}.
     *
     * @param lengthX
     *            number of tokens in X
     * @param lengthY
     *            number of tokens in Y
     * @return {@code true} iff
     *         {@code ceil(simThr * lengthX) <= lengthY <= 1 / simThr * lengthX}
     */
    public boolean passLengthFilter(int lengthX, int lengthY) {
        return getLengthLowerBound(lengthX) <= lengthY && lengthY <= 1 / simThr * lengthX;
    }

    /**
     * Checks whether {@code lengthY} lies within the length window allowed
     * for {@code lengthX}.
     *
     * @param lengthX
     *            number of tokens in X
     * @param lengthY
     *            number of tokens in Y
     * @return {@code true} iff
     *         {@code ceil(simThr * lengthX) <= lengthY <= 1 / simThr * lengthX}
     */
    public boolean passLengthFilter(long lengthX, long lengthY) {
        return getLengthLowerBound(lengthX) <= lengthY && lengthY <= 1 / simThr * lengthX;
    }

    /**
     * Checks whether the intersection upper bound still reaches the
     * intersection lower bound.
     *
     * @param noGramsCommon
     *            number of grams in common
     * @param positionX
     *            position of the last gram in common on X
     * @param positionY
     *            position of the last gram in common on Y
     * @param lengthX
     *            number of grams in X
     * @param lengthY
     *            number of grams in Y
     * @return {@code true} iff
     *         {@link #getIntersectUpperBound(int, int, int, int, int)} is
     *         greater than or equal to
     *         {@link #getIntersectLowerBound(int, int)}
     */
    public boolean passPositionFilter(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
        return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX,
                lengthY) >= getIntersectLowerBound(lengthX, lengthY);
    }

    /**
     * Checks whether the intersection upper bound still reaches the
     * intersection lower bound.
     *
     * @param noGramsCommon
     *            number of grams in common
     * @param positionX
     *            position of the last gram in common on X
     * @param positionY
     *            position of the last gram in common on Y
     * @param lengthX
     *            number of grams in X
     * @param lengthY
     *            number of grams in Y
     * @return {@code true} iff
     *         {@link #getIntersectUpperBound(int, long, long, long, long)} is
     *         greater than or equal to
     *         {@link #getIntersectLowerBound(long, long)}
     */
    public boolean passPositionFilter(int noGramsCommon, long positionX, long positionY, long lengthX, long lengthY) {
        return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX,
                lengthY) >= getIntersectLowerBound(lengthX, lengthY);
    }
}