| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.sling.resourceresolver.impl.mapping; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertTrue; |
| import java.math.BigInteger; |
| import java.util.HashSet; |
| import java.util.Random; |
| import org.junit.Test; |
| |
| /** |
| * Test the bloom filter utility class. |
| */ |
| public class BloomFilterUtilsTest { |
| |
| /** |
| * Program to calculate the best shift and multiply constants. |
| */ |
| public static void main(String... args) { |
| Random random = new Random(1); |
| HashSet<String> inSet = new HashSet<String>(); |
| while (inSet.size() < 100) { |
| inSet.add(randomString(random)); |
| } |
| Object[] in = inSet.toArray(); |
| HashSet<String> notSet = new HashSet<String>(); |
| while (notSet.size() < 10000) { |
| String k = randomString(random); |
| if (!inSet.contains(k)) { |
| notSet.add(k); |
| } |
| } |
| Object[] not = notSet.toArray(); |
| int best = Integer.MAX_VALUE; |
| for (int mul = 1; mul < 100000; mul += 2) { |
| if (!BigInteger.valueOf(mul).isProbablePrime(10)) { |
| continue; |
| } |
| for (int shift = 0; shift < 32; shift++) { |
| byte[] bloom = BloomFilterUtils.createFilter(100, 64); |
| for (Object k : in) { |
| int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift); |
| add(bloom, h1, h2); |
| } |
| int falsePositives = 0; |
| for (Object k : not) { |
| int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift); |
| if (probablyContains(bloom, h1, h2)) { |
| falsePositives++; |
| // short false positives are bad |
| if (k.toString().length() < 4) { |
| falsePositives += 5; |
| } |
| if (falsePositives > best) { |
| break; |
| } |
| } |
| } |
| if (falsePositives < best) { |
| best = falsePositives; |
| System.out.println("mul: " + mul + " shift: " |
| + shift + " falsePositives: " + best); |
| } |
| } |
| } |
| } |
| |
| private static String randomString(Random r) { |
| if (r.nextInt(5) == 0) { |
| return randomName(r); |
| } |
| int length = 1 + Math.abs((int) r.nextGaussian() * 5); |
| if (r.nextBoolean()) { |
| length += r.nextInt(10); |
| } |
| char[] chars = new char[length]; |
| for (int i = 0; i < length; i++) { |
| chars[i] = randomChar(r); |
| } |
| return new String(chars); |
| } |
| |
| private static char randomChar(Random r) { |
| switch (r.nextInt(101) / 100) { |
| case 0: |
| case 1: |
| // 20% ascii |
| return (char) (32 + r.nextInt(127 - 32)); |
| case 2: |
| case 3: |
| case 4: |
| case 5: |
| // 40% a-z |
| return (char) ('a' + r.nextInt('z' - 'a')); |
| case 6: |
| // 10% A-Z |
| return (char) ('A' + r.nextInt('Z' - 'A')); |
| case 7: |
| case 8: |
| // 20% 0-9 |
| return (char) ('0' + r.nextInt('9' - '0')); |
| case 9: |
| // 10% aeiou |
| return "aeiou".charAt(r.nextInt("aeiou".length())); |
| } |
| // 1% unicode |
| return (char) r.nextInt(65535); |
| } |
| |
| private static String randomName(Random r) { |
| int i = r.nextInt(1000); |
| // like TPC-C lastName, but lowercase |
| String[] n = { |
| "bar", "ought", "able", "pri", "pres", "ese", "anti", |
| "cally", "ation", "eing" }; |
| StringBuilder buff = new StringBuilder(); |
| buff.append(n[i / 100]); |
| buff.append(n[(i / 10) % 10]); |
| buff.append(n[i % 10]); |
| return buff.toString(); |
| } |
| |
| |
| private static int hash(int oldHash, int mul, int shift) { |
| return oldHash ^ ((oldHash * mul) >> shift); |
| } |
| |
| private static void add(byte[] bloom, int h1, int h2) { |
| int len = bloom.length; |
| if (len > 0) { |
| bloom[(h1 >>> 3) % len] |= 1 << (h1 & 7); |
| bloom[(h2 >>> 3) % len] |= 1 << (h2 & 7); |
| } |
| } |
| |
| private static boolean probablyContains(byte[] bloom, int h1, int h2) { |
| int len = bloom.length; |
| if (len == 0) { |
| return true; |
| } |
| int x = bloom[(h1 >>> 3) % len] & (1 << (h1 & 7)); |
| if (x != 0) { |
| x = bloom[(h2 >>> 3) % len] & (1 << (h2 & 7)); |
| } |
| return x != 0; |
| } |
| |
| @Test |
| public void size() { |
| byte[] bloom = BloomFilterUtils.createFilter(100, 64); |
| assertEquals(64, bloom.length); |
| bloom = BloomFilterUtils.createFilter(10, 64); |
| assertEquals(11, bloom.length); |
| bloom = BloomFilterUtils.createFilter(0, 64); |
| assertEquals(0, bloom.length); |
| bloom = BloomFilterUtils.createFilter(1, 64); |
| assertEquals(1, bloom.length); |
| } |
| |
| @Test |
| public void probability() { |
| byte[] bloom = BloomFilterUtils.createFilter(20, 64); |
| System.out.println(bloom.length); |
| Random random = new Random(1); |
| random.setSeed(1); |
| for (int i = 0; i < 20; i++) { |
| BloomFilterUtils.add(bloom, random.nextInt()); |
| } |
| random.setSeed(1); |
| for (int i = 0; i < 20; i++) { |
| assertTrue(BloomFilterUtils.probablyContains(bloom, random.nextInt())); |
| } |
| int falsePositives = 0; |
| for (int i = 20; i < 100000; i++) { |
| if (BloomFilterUtils.probablyContains(bloom, random.nextInt())) { |
| falsePositives++; |
| } |
| } |
| assertEquals(4594, falsePositives); |
| } |
| @Test |
| public void negativeHashCode() { |
| BloomFilterUtils.add(new byte[0], new Object() { |
| @Override |
| public int hashCode() { |
| return -1; |
| } |
| }); |
| } |
| |
| } |