Merge pull request #4139 from Lettuceleaves/feat/bloom-multi-hash

Feat/bloom multi hash
This commit is contained in:
Golden Looly
2025-11-26 17:27:24 +08:00
committed by GitHub
3 changed files with 127 additions and 14 deletions

View File

@@ -30,7 +30,7 @@ public abstract class AbstractFilter implements BloomFilter {
@Serial
private static final long serialVersionUID = 1L;
private final BitSet bitSet;
protected final BitSet bitSet;
/**
* 容量
*/
@@ -50,12 +50,12 @@ public abstract class AbstractFilter implements BloomFilter {
@Override
public boolean contains(final String str) {
return bitSet.get(Math.abs(hash(str)));
return bitSet.get(hash(str));
}
@Override
public boolean add(final String str) {
final int hash = Math.abs(hash(str));
final int hash = hash(str);
if (bitSet.get(hash)) {
return false;
}

View File

@@ -16,7 +16,12 @@
package cn.hutool.v7.core.text.bloom;
import cn.hutool.v7.core.lang.Assert;
import java.io.Serial;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
/**
@@ -33,26 +38,69 @@ public class FuncFilter extends AbstractFilter {
* 创建FuncFilter
*
* @param size 最大值
* @param hashFunc Hash函数
* @param hashFuncs Hash函数
* @return FuncFilter
*/
public static FuncFilter of(final int size, final Function<String, Number> hashFunc) {
return new FuncFilter(size, hashFunc);
@SafeVarargs
public static FuncFilter of(final int size, final Function<String, Number>... hashFuncs) {
return new FuncFilter(size, hashFuncs);
}
private final Function<String, Number> hashFunc;
// 允许接收多个哈希函数
private final List<Function<String, Number>> hashFuncs;
/**
 * Creates a filter backed by one or more hash functions.
 * <p>
 * The varargs array is copied before it is stored, so later writes to the
 * caller's array cannot change which functions this filter uses.
 *
 * @param size      maximum value; passed to the parent constructor
 * @param hashFuncs hash functions; at least one is required
 */
@SafeVarargs
public FuncFilter(final int size, final Function<String, Number>... hashFuncs) {
    super(size);
    Assert.notEmpty(hashFuncs, "Hash functions must not be empty");
    // Arrays.asList returns a write-through view of its argument, so wrap a private
    // copy of the array instead of the array the caller may still hold.
    this.hashFuncs = Collections.unmodifiableList(Arrays.asList(hashFuncs.clone()));
}
/**
 * Single-hash entry point kept for compatibility with the parent class:
 * when several hash functions are configured, only the first one is used.
 *
 * @param str the string to hash
 * @return the bit index computed with the first configured hash function
 */
@Override
public int hash(final String str) {
    final Function<String, Number> primary = hashFuncs.get(0);
    return hash(str, primary);
}
/**
 * Computes the bit index of a string under one specific hash function.
 *
 * @param str      the string to hash
 * @param hashFunc the hash function to apply
 * @return the function's result as a non-negative int, reduced modulo {@code size}
 */
public int hash(final String str, final Function<String, Number> hashFunc) {
    final int raw = hashFunc.apply(str).intValue();
    // Clearing the sign bit gives a non-negative value before the modulo.
    final int nonNegative = raw & Integer.MAX_VALUE;
    return nonNegative % size;
}
@Override
public int hash(final String str) {
return hashFunc.apply(str).intValue() % size;
public boolean contains(final String str) {
    // The string counts as present only if the bit for every configured hash
    // function is set; allMatch stops at the first unset bit.
    return hashFuncs.stream()
        .allMatch(hashFunc -> bitSet.get(hash(str, hashFunc)));
}
@Override
public boolean add(final String str) {
    // Becomes true as soon as any hash function lands on a bit that was still clear.
    boolean changed = false;
    for (final Function<String, Number> fn : hashFuncs) {
        final int index = hash(str, fn);
        if (bitSet.get(index)) {
            // Bit already set by an earlier insertion; nothing to do for this function.
            continue;
        }
        bitSet.set(index);
        changed = true;
    }
    return changed;
}
}

View File

@@ -22,11 +22,12 @@ import org.junit.jupiter.api.Test;
public class BitMapBloomFilterTest {
private static final int SIZE = 2 * 1024 * 1024 * 8;
@Test
public void filterTest() {
final int size = 2 * 1024 * 1024 * 8;
final CombinedBloomFilter filter = new CombinedBloomFilter(FuncFilter.of(size, HashUtil::rsHash));
final CombinedBloomFilter filter = new CombinedBloomFilter(FuncFilter.of(SIZE, HashUtil::rsHash));
filter.add("123");
filter.add("abc");
filter.add("ddd");
@@ -35,4 +36,68 @@ public class BitMapBloomFilterTest {
Assertions.assertTrue(filter.contains("ddd"));
Assertions.assertTrue(filter.contains("123"));
}
@Test
public void multiHashFuncTest() {
    // A single filter driven by ten different hash functions.
    final FuncFilter filter = FuncFilter.of(SIZE,
        HashUtil::rsHash,
        HashUtil::jsHash,
        HashUtil::pjwHash,
        HashUtil::elfHash,
        HashUtil::bkdrHash,
        HashUtil::sdbmHash,
        HashUtil::djbHash,
        HashUtil::dekHash,
        HashUtil::apHash,
        HashUtil::javaDefaultHash
    );

    final String[] added = {"Hutool", "BloomFilter", "Java"};
    for (final String word : added) {
        filter.add(word);
    }
    // Everything that was added must be reported as present.
    for (final String word : added) {
        Assertions.assertTrue(filter.contains(word));
    }

    // Strings that were never added, including a case variant of an added one.
    final String[] absent = {"Python", "Go", "hutool"};
    for (final String word : absent) {
        Assertions.assertFalse(filter.contains(word));
    }
}
@Test
public void combinedMultiHashTest() {
    // Wrap a three-function FuncFilter inside a CombinedBloomFilter.
    final CombinedBloomFilter filter = new CombinedBloomFilter(
        FuncFilter.of(SIZE,
            HashUtil::bkdrHash,
            HashUtil::apHash,
            HashUtil::djbHash
        )
    );

    final String added = "123123WASD-WASD";
    filter.add(added);

    Assertions.assertTrue(filter.contains(added));
    // Same prefix with an extra suffix was never added.
    Assertions.assertFalse(filter.contains(added + "-false"));
}
@Test
public void chineseStringWithThreeHashesTest() {
    // Three hash functions applied to non-ASCII input.
    final FuncFilter filter = FuncFilter.of(SIZE,
        HashUtil::bkdrHash,
        HashUtil::apHash,
        HashUtil::djbHash
    );

    final String[] added = {"你好世界", "双亲委派", "测试工程师"};
    for (final String text : added) {
        filter.add(text);
    }
    for (final String text : added) {
        Assertions.assertTrue(filter.contains(text), "应包含: " + text);
    }

    // Each pair is {string that was never added, assertion message}.
    final String[][] absent = {
        {"我好世界", "多字"},
        {"父亲委派", "改字"},
        {"测试", "子串"},
        {"", "空串"},
        {"👍", "未添加的"}
    };
    for (final String[] entry : absent) {
        Assertions.assertFalse(filter.contains(entry[0]), entry[1]);
    }
}
}