Skip to content

Commit ddbff7c

Browse files
committed
HADOOP-14313. Replace/improve Hadoop's byte[] comparator. Contributed by Vikas Vishwakarma.
1 parent 2b2399d commit ddbff7c

File tree

2 files changed

+25
-27
lines changed

2 files changed

+25
-27
lines changed

NOTICE.txt

+8
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ by Google Inc, which can be obtained at:
196196
* HOMEPAGE:
197197
* http://code.google.com/p/snappy/
198198

199+
This product contains a modified portion of UnsignedBytes LexicographicalComparator
200+
from Guava v21 project by Google Inc, which can be obtained at:
201+
202+
* LICENSE:
203+
* license/COPYING (Apache License 2.0)
204+
* HOMEPAGE:
205+
* https://github.com/google/guava
206+
199207
This product optionally depends on 'JBoss Marshalling', an alternative Java
200208
serialization API, which can be obtained at:
201209

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java

+17-27
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.slf4j.LoggerFactory;
2727
import sun.misc.Unsafe;
2828

29-
import com.google.common.primitives.Longs;
3029
import com.google.common.primitives.UnsignedBytes;
3130

3231
/**
@@ -195,52 +194,43 @@ public int compareTo(byte[] buffer1, int offset1, int length1,
195194
length1 == length2) {
196195
return 0;
197196
}
197+
final int stride = 8;
198198
int minLength = Math.min(length1, length2);
199-
int minWords = minLength / Longs.BYTES;
199+
int strideLimit = minLength & ~(stride - 1);
200200
int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET;
201201
int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET;
202+
int i;
202203

203204
/*
204205
* Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
205206
* time is no slower than comparing 4 bytes at a time even on 32-bit.
206207
* On the other hand, it is substantially faster on 64-bit.
207208
*/
208-
for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) {
209+
for (i = 0; i < strideLimit; i += stride) {
209210
long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i);
210211
long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i);
211-
long diff = lw ^ rw;
212212

213-
if (diff != 0) {
213+
if (lw != rw) {
214214
if (!littleEndian) {
215215
return lessThanUnsigned(lw, rw) ? -1 : 1;
216216
}
217217

218-
// Use binary search
219-
int n = 0;
220-
int y;
221-
int x = (int) diff;
222-
if (x == 0) {
223-
x = (int) (diff >>> 32);
224-
n = 32;
225-
}
226-
227-
y = x << 16;
228-
if (y == 0) {
229-
n += 16;
230-
} else {
231-
x = y;
232-
}
233-
234-
y = x << 8;
235-
if (y == 0) {
236-
n += 8;
237-
}
238-
return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL));
218+
/*
219+
* We want to compare only the first index where left[index] !=
220+
* right[index]. This corresponds to the least significant nonzero
221+
* byte in lw ^ rw, since lw and rw are little-endian.
222+
Long.numberOfTrailingZeros(lw ^ rw) tells us the least significant
223+
nonzero bit, and zeroing out the low three bits of that result gives
224+
* us the shift to get that least significant nonzero byte. This
225+
comparison logic is based on UnsignedBytes from Guava v21.
226+
*/
227+
int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7;
228+
return ((int) ((lw >>> n) & 0xFF)) - ((int) ((rw >>> n) & 0xFF));
239229
}
240230
}
241231

242232
// The epilogue to cover the last (minLength % 8) elements.
243-
for (int i = minWords * Longs.BYTES; i < minLength; i++) {
233+
for (; i < minLength; i++) {
244234
int result = UnsignedBytes.compare(
245235
buffer1[offset1 + i],
246236
buffer2[offset2 + i]);

0 commit comments

Comments
 (0)