Skip to content

Commit a554e9d

Browse files
committedSep 23, 2015
Fixed collision bug in HashFunction
1 parent 04e7b97 commit a554e9d

File tree

2 files changed

+77
-31
lines changed

2 files changed

+77
-31
lines changed
 

‎bloom/src/bloom/hash/HashFunction.java

+52-20
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@ public class HashFunction implements Hash {
1212

1313

1414
public HashFunction(){}
15-
15+
16+
@Override
1617
public void print() {}
17-
18+
19+
@Override
1820
public int getHashCode(String key){
1921
return 0;
2022
}
@@ -44,25 +46,55 @@ public static int fnv1Hash(String key, int m){
4446
}
4547

4648
/**
47-
* Returns an array of k hashes by computing linear
48-
* combinations of the fnv1 hash with the MurmurHash
49-
*
50-
* @param key Key in which the hash will be performed on
51-
* @param k Value specifying how many hash values to compute
52-
* @param m Modulus value
53-
* @return An array of size K with hash values [0, m-1]
49+
* Returns an array of k hashes by computing linear combinations of the fnv1 hash with the
50+
* MurmurHash
51+
*
52+
* @param key Key on which the hash will be performed
53+
* @param k Value specifying how many hash values to compute
54+
* @param m Modulus value; m should be much larger than k
55+
* @return An array of size K with hash values [0, m-1]
5456
*/
55-
public int[] hash(String key,int k, int m){
57+
@Override
58+
public int[] hash(String key, int k, int m) {
5659
int[] hashes = new int[k];
57-
for(int ii = 0; ii < k; ii++){
58-
long h1 = fnv1Hash(key);
59-
long h2 = murmurHash(key);
60-
long axb = h1+(ii*h2);
61-
hashes[ii] = (int)(axb % m);
60+
long h1 = fnv1Hash(key);
61+
long h2 = murmurHash(key);
62+
for (int ii = 0; ii < k; ii++ ) {
63+
long axb = ii + h1 + (ii * h2);
64+
int hashCandidate = (int) (axb % m);
65+
int iterations = 0;
66+
// Check that the generated hash candidate was not generated before.
67+
while (iterations < m && contains(hashes, hashCandidate, ii)) {
68+
// Try to find a new hash candidate that is not yet in the array.
69+
hashCandidate = (int) ((hashCandidate + murmurHash(Integer.toString(iterations + hashCandidate))) % m);
70+
iterations++ ;
71+
}
72+
if (iterations >= m) {
73+
throw new RuntimeException("Failed to generate distinct hashes for: " + key);
74+
}
75+
hashes[ii] = hashCandidate;
6276
}
6377
return hashes;
6478
}
65-
79+
80+
/**
81+
* Checks if a value is contained in an array.
82+
*
83+
* @param array The array to search in
84+
* @param valueToFind The value to find in the array
85+
* @param lastIndex The last index of the array that is checked
86+
* @return <code>true</code> if the value was found, <code>false</code> otherwise
87+
*/
88+
private static boolean contains(final int[] array, final int valueToFind, int lastIndex) {
89+
lastIndex = Math.min(array.length, lastIndex);
90+
for (int i = 0; i <= lastIndex; i++ ) {
91+
if (valueToFind == array[i]) {
92+
return true;
93+
}
94+
}
95+
return false;
96+
}
97+
6698
/**
6799
* Returns a single min wise independent hash value
68100
* using a linear transformation
@@ -136,13 +168,13 @@ public static long murmurHash(String string) {
136168

137169
if (left != 0) {
138170
if (left >= 3) {
139-
hash ^= (int) data[dataLength - 3] << 16;
171+
hash ^= data[dataLength - 3] << 16;
140172
}
141173
if (left >= 2) {
142-
hash ^= (int) data[dataLength - 2] << 8;
174+
hash ^= data[dataLength - 2] << 8;
143175
}
144176
if (left >= 1) {
145-
hash ^= (int) data[dataLength - 1];
177+
hash ^= data[dataLength - 1];
146178
}
147179

148180
hash *= m;
@@ -152,7 +184,7 @@ public static long murmurHash(String string) {
152184
hash *= m;
153185
hash ^= hash >>> 15;
154186

155-
return (long)(hash & 0x00000000ffffffffL);
187+
return (hash & 0x00000000ffffffffL);
156188
}
157189

158190
}

‎bloom/test/HashFunctionTest.java

+25-11
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11

22
import static org.junit.Assert.*;
33

4+
import java.util.Arrays;
45
import java.util.UUID;
56

67
import org.junit.Test;
@@ -10,13 +11,13 @@
1011

1112

1213
public class HashFunctionTest {
13-
//Constants
14+
//Constants
1415
private static final int M = 1000000;
1516
private static final int m = 100;
1617
private static final double ERROR = 0.05;
17-
18+
1819
//Tests
19-
20+
2021
@Test
2122
public void testIfFNV1HashIsDeterministic(){
2223
for(int ii= 0; ii < M; ii++){
@@ -26,7 +27,7 @@ public void testIfFNV1HashIsDeterministic(){
2627
assertEquals(hash1, hash2);
2728
}
2829
}
29-
30+
3031
@Test
3132
public void testIfMurmurHashIsDeterministic(){
3233
for(int ii = 0; ii < M; ii++){
@@ -36,7 +37,7 @@ public void testIfMurmurHashIsDeterministic(){
3637
assertEquals(hash1, hash2);
3738
}
3839
}
39-
40+
4041
@Test
4142
public void testUniformityOfMurmurHash(){
4243
// Initialize array of buckets
@@ -59,7 +60,7 @@ public void testUniformityOfMurmurHash(){
5960
}
6061
System.out.print("\n");
6162
}
62-
63+
6364
@Test
6465
public void testUniformityOfFnv1Hash(){
6566
int[] buckets = createFreqArray(m);
@@ -81,13 +82,13 @@ public void testUniformityOfFnv1Hash(){
8182
}
8283
System.out.print("\n");
8384
}
84-
85+
8586
@Test
8687
public void testUniformityOf3Hashes(){
8788
int k = 3;
8889
kHashFunctions(k);
8990
}
90-
91+
9192
@Test
9293
public void testUniformityOf4Hashes(){
9394
int k = 4;
@@ -101,8 +102,9 @@ private void kHashFunctions(int k) {
101102
HashFunction hash = new HashFunction();
102103
for (int ii = 0; ii < M; ii++){
103104
int[] hashes = hash.hash(UUID.randomUUID().toString().substring(0, 15), k, m);
104-
for(int jj = 0; jj < hashes.length; jj++)
105+
for(int jj = 0; jj < hashes.length; jj++) {
105106
buckets[hashes[jj]] += 1;
107+
}
106108
}
107109
float low = (float) ((M*k / buckets.length) * (1-ERROR));
108110
float high = (float) ((M*k / buckets.length) * (1+ERROR));
@@ -115,12 +117,24 @@ private void kHashFunctions(int k) {
115117
}
116118
System.out.print("\n");
117119
}
118-
120+
119121
private int[] createFreqArray(int m) {
120122
int[] buckets = new int[m];
121-
for (int ii = 0; ii < m; ii++)
123+
for (int ii = 0; ii < m; ii++) {
122124
buckets[ii] = 0;
125+
}
123126
return buckets;
124127
}
125128

129+
@Test
130+
public void testHashesAreDistinct() {
131+
int k = 4;
132+
HashFunction hash = new HashFunction();
133+
134+
for (int ii = 0; ii < M; ii++ ) {
135+
int[] hashes = hash.hash(UUID.randomUUID().toString().substring(0, 15), k, m);
136+
assertEquals(k, Arrays.stream(hashes).distinct().count());
137+
}
138+
}
139+
126140
}

0 commit comments

Comments
 (0)
Please sign in to comment.