Skip to content

Commit c5d9dca

Browse files
committed
New APIs: ByteString.toIndex() and ByteString.toFraction()
The first one may be useful with hashing to put byte strings in partitioning buckets for scaling. For example, to divide a dataset into 32 partitions, hash the key, then use toIndex(32) to map the key to its partition. The second one may be useful with dynamic experiments and A/B tests. For example, to assign 5% of customers to a control group, hash the customer key, then check whether toFraction() is less than 0.05.
1 parent 3181015 commit c5d9dca

File tree

6 files changed

+196
-0
lines changed

6 files changed

+196
-0
lines changed

okio/src/commonMain/kotlin/okio/ByteString.kt

+93
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import kotlin.jvm.JvmField
2020
import kotlin.jvm.JvmName
2121
import kotlin.jvm.JvmOverloads
2222
import kotlin.jvm.JvmStatic
23+
import kotlin.math.min
2324

2425
/**
2526
* An immutable sequence of bytes.
@@ -137,6 +138,98 @@ internal constructor(data: ByteArray) : Comparable<ByteString> {
137138

138139
override fun compareTo(other: ByteString): Int
139140

141+
/**
142+
* Projects this value to the range `[0..size)` using linear interpolation. This is equivalent to
143+
* a sorted partitioning of all possible byte strings across [size] equally-sized buckets and
144+
* returning the index of the bucket that this byte string fits in.
145+
*
146+
* For example, the byte string `8000` is the median of all 2-element byte strings, and calling
147+
* `toIndex(100)` on it returns 50. Some other examples:
148+
*
149+
* | Byte String (hex) | `toIndex(100)` | `toIndex(256)` | `toIndex(Int.MAX_VALUE)` |
150+
* | :----------------- | -------------: | -------------: | -----------------------: |
151+
* | (empty) | 0 | 0 | 0 |
152+
* | 00 | 0 | 0 | 0 |
153+
* | 0000 | 0 | 0 | 0 |
154+
* | 000000 | 0 | 0 | 0 |
155+
* | 0000000001 | 0 | 0 | 0 |
156+
* | 00000001 | 0 | 0 | 0 |
157+
* | 00000002 | 0 | 0 | 0 |
158+
* | 00000003 | 0 | 0 | 1 |
159+
* | 01 | 0 | 1 | 8388607 |
160+
* | 02 | 0 | 2 | 16777215 |
161+
* | 03 | 1 | 3 | 25165823 |
162+
* | 80 | 50 | 128 | 1073741823 |
163+
* | 8000 | 50 | 128 | 1073741823 |
164+
* | 80000000 | 50 | 128 | 1073741823 |
165+
* | 81 | 50 | 129 | 1082130431 |
166+
* | 81ffffff | 50 | 129 | 1090519038 |
167+
* | 82 | 50 | 130 | 1090519039 |
168+
* | 83 | 51 | 131 | 1098907647 |
169+
* | ff | 99 | 255 | 2139095039 |
170+
* | ffff | 99 | 255 | 2147450879 |
171+
* | ffffffff | 99 | 255 | 2147483646 |
172+
* | ffffffffffff | 99 | 255 | 2147483646 |
173+
*
174+
* This interprets the bytes in this byte string as **unsigned**. This behavior is consistent with
175+
* [compareTo]. The returned value is also consistent with [compareTo] though the dynamic range
176+
* is compressed. For two byte strings `a` and `b`, if `a < b`, then
177+
* `a.toIndex(n) <= b.toIndex(n)` for all sizes `n`.
178+
*
179+
* This examines at most the first 4 bytes of this byte string. Data beyond the first 4 bytes is
180+
* not used to compute the result.
181+
*
182+
* @param size a positive integer; zero or negative values throw [IllegalArgumentException].
183+
* @return a value that is greater than or equal to `0` and less than [size].
184+
*/
185+
fun toIndex(size: Int): Int
186+
187+
/**
188+
* Projects this value to the range `[0.0..1.0)` using linear interpolation. This is equivalent to
189+
* sorting all possible byte strings and returning the fraction of them that precede this one.
190+
*
191+
* For example, the byte string `8000` is the median of all 2-element byte strings, and calling
192+
* `toFraction()` on it returns 0.5. Some other examples:
193+
*
194+
* | Byte String (hex) | `toFraction()` |
195+
* | :----------------- | :----------------- |
196+
* | (empty) | 0.0 |
197+
* | 00 | 0.0 |
198+
* | 0000 | 0.0 |
199+
* | 000000 | 0.0 |
200+
* | 00000000000001 | 0.0 |
201+
* | 00000000000007 | 0.0 |
202+
* | 00000000000008 | 0.0000000000000001 |
203+
* | 0000000001 | 0.0000000000009094 |
204+
* | 00000001 | 0.0000000002328306 |
205+
* | 01 | 0.00390625 |
206+
* | 02 | 0.0078125 |
207+
* | 03 | 0.01171875 |
208+
* | 80 | 0.5 |
209+
* | 8000 | 0.5 |
210+
* | 80000000000000 | 0.5 |
211+
* | 81 | 0.50390625 |
212+
* | 81ffffff | 0.5078124997671694 |
213+
* | 82 | 0.5078125 |
214+
* | 83 | 0.51171875 |
215+
* | ff | 0.99609375 |
216+
* | ffff | 0.9999847412109375 |
217+
* | ffffffff | 0.9999999997671694 |
218+
* | ffffffffffff | 0.9999999999999964 |
219+
* | ffffffffffffff | 0.9999999999999999 |
220+
*
221+
* This interprets the bytes in this byte string as **unsigned**. This behavior is consistent with
222+
* [compareTo]. The returned value is also consistent with [compareTo] though the dynamic range
223+
* is compressed. For two byte strings `a` and `b`, if `a < b`, then
224+
* `a.toFraction() <= b.toFraction()`.
225+
*
226+
* This examines at most the first 7 bytes of this byte string. Data beyond the first 7 bytes is
227+
* not used to compute the result.
228+
*
229+
* @return a value that is greater than or equal to `0.0` and less than `1.0`.
230+
*/
231+
fun toFraction(): Double
232+
140233
/**
141234
* Returns a human-readable string that describes the contents of this byte string. Typically this
142235
* is a string like `[text=Hello]` or `[hex=0000ffff]`.

okio/src/commonMain/kotlin/okio/internal/ByteString.kt

+32
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import okio.isIsoControl
3030
import okio.processUtf8CodePoints
3131
import okio.shr
3232
import okio.toUtf8String
33+
import kotlin.math.min
3334

3435
// TODO Kotlin's expect classes can't have default implementations, so platform implementations
3536
// have to call these functions. Remove all this nonsense when expect class allow actual code.
@@ -248,6 +249,37 @@ internal inline fun ByteString.commonCompareTo(other: ByteString): Int {
248249
return if (sizeA < sizeB) -1 else 1
249250
}
250251

252+
/**
 * Shared implementation of `ByteString.toIndex`: maps this byte string onto a bucket index in
 * `[0, size)`.
 *
 * Up to the first 4 bytes are read as an unsigned big-endian numerator over a denominator of
 * `256^(bytes read)`; that ratio is then scaled by [size] and truncated. Bytes beyond the first 4
 * are ignored.
 *
 * @param size the number of buckets; must be positive.
 * @return an index that is at least `0` and strictly less than [size].
 * @throws IllegalArgumentException if [size] is zero or negative.
 */
@Suppress("NOTHING_TO_INLINE")
internal inline fun ByteString.commonToIndex(size: Int): Int {
  require(size > 0) { "size <= 0: $size" }

  var numerator = 0L
  var denominator = 1L
  // `this.size` is the byte count; the unqualified name is shadowed by the parameter.
  for (i in 0 until min(4, this.size)) {
    // Mask with 0xff so each byte contributes its unsigned value.
    numerator = (numerator shl 8) + (get(i) and 0xff)
    denominator = denominator shl 8
  }

  // numerator < 2^32 and size < 2^31, so the Long product stays below 2^63 and cannot overflow.
  // numerator < denominator, so the quotient is always less than size.
  return (size * numerator / denominator).toInt()
}
263+
264+
/**
 * Shared implementation of `ByteString.toFraction`: maps this byte string onto `[0.0, 1.0)`.
 *
 * Up to the first 7 bytes are read as an unsigned big-endian dividend over a divisor of
 * `256^(bytes read)`. Bytes beyond the first 7 are ignored.
 *
 * @return a value that is at least `0.0` and strictly less than `1.0`.
 */
@Suppress("NOTHING_TO_INLINE")
internal inline fun ByteString.commonToFraction(): Double {
  val byteCount = min(7, size)

  var dividend = 0L
  var divisor = 1L
  var index = 0
  while (index < byteCount) {
    // Mask with 0xff so each byte contributes its unsigned value.
    dividend = (dividend shl 8) + (get(index) and 0xff)
    divisor = divisor shl 8
    index++
  }

  // Seven bytes carry 56 bits but a Double holds only 53 bits of precision. Drop the 3 low bits
  // from both terms; otherwise inputs like "ffffffffffffff" could round up to 1.0.
  if (byteCount == 7) {
    dividend = dividend shr 3
    divisor = divisor shr 3
  }

  return dividend.toDouble() / divisor
}
282+
251283
@Suppress("NOTHING_TO_INLINE")
252284
internal inline fun commonOf(data: ByteArray) = ByteString(data.copyOf())
253285

okio/src/commonTest/kotlin/okio/ByteStringTest.kt

+53
Original file line numberDiff line numberDiff line change
@@ -457,4 +457,57 @@ abstract class AbstractByteStringTest internal constructor(
457457
sortedByteStrings.sort()
458458
assertEquals(originalByteStrings, sortedByteStrings)
459459
}
460+
461+
@Test fun toIndex() {
  fun index(hex: String, size: Int): Int = factory.decodeHex(hex).toIndex(size)

  // With a single bucket every byte string lands at index 0.
  assertEquals(0, index("", 1))
  assertEquals(0, index("00", 1))
  assertEquals(0, index("ff", 1))
  assertEquals(0, index("ffffffff", 1))
  assertEquals(0, index("ffffffffffff", 1))

  // 100 buckets: the result tracks the leading byte as a percentage, rounded down.
  assertEquals(0, index("", 100))
  assertEquals(0, index("00", 100))
  assertEquals(10, index("1a", 100))
  assertEquals(25, index("40", 100))
  assertEquals(50, index("80", 100))
  assertEquals(75, index("c0", 100))
  assertEquals(99, index("ff", 100))
  assertEquals(99, index("ffff", 100))
  assertEquals(99, index("ffffff", 100))
  assertEquals(99, index("ffffffff", 100))

  // The largest permitted size: each additional byte refines the result.
  assertEquals(0, index("", Int.MAX_VALUE))
  assertEquals(0x7f7fffff, index("ff", Int.MAX_VALUE))
  assertEquals(0x7fff7fff, index("ffff", Int.MAX_VALUE))
  assertEquals(0x7fffff7f, index("ffffff", Int.MAX_VALUE))
  assertEquals(0x7ffffffe, index("ffffffff", Int.MAX_VALUE))
}
485+
486+
@Test fun toFraction() {
  fun fraction(hex: String): Double = factory.decodeHex(hex).toFraction()

  // Empty and all-zero inputs map to the bottom of the range regardless of length.
  assertEquals(0.0, fraction(""))
  assertEquals(0.0, fraction("00"))
  assertEquals(0.0, fraction("0000"))

  assertEquals(0.1015625, fraction("1a"))
  assertEquals(0.25, fraction("40"))
  assertEquals(0.5, fraction("80"))
  assertEquals(0.75, fraction("c0"))
  assertEquals(0.7929493631236255, fraction("cafebabe"))

  // All-ones inputs approach, but never reach, 1.0. Bytes past the 7th are ignored.
  assertEquals(0.99609375, fraction("ff"))
  assertEquals(0.9999847412109375, fraction("ffff"))
  assertEquals(0.9999999403953552, fraction("ffffff"))
  assertEquals(0.9999999997671694, fraction("ffffffff"))
  assertEquals(0.9999999999999964, fraction("ffffffffffff"))
  assertEquals(0.9999999999999999, fraction("ffffffffffffff"))
  assertEquals(0.9999999999999999, fraction("ffffffffffffffff"))
}
503+
504+
/**
 * Only the 5 high-order bits of the 7th byte contribute to the result; its 3 low-order bits are
 * discarded so that the value fits in the 53 bits of precision of an IEEE 754 double.
 */
@Test fun toFractionLast5BitsOf7thByte() {
  fun fraction(hex: String): Double = factory.decodeHex(hex).toFraction()

  // 0x07 has only the discarded low bits set, so it is indistinguishable from zero.
  assertEquals(0.0, fraction("00000000000007"))
  // 0x08 through 0x0f share the same lowest retained bit.
  assertEquals(1.1102230246251565E-16, fraction("00000000000008"))
  assertEquals(1.1102230246251565E-16, fraction("0000000000000f"))
  assertEquals(2.220446049250313E-16, fraction("00000000000010"))
  assertEquals(0.9999999999999998, fraction("fffffffffffff0"))
  assertEquals(0.9999999999999999, fraction("fffffffffffff8"))
}
460513
}

okio/src/jsMain/kotlin/okio/ByteString.kt

+6
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ import okio.internal.commonToAsciiLowercase
3838
import okio.internal.commonToAsciiUppercase
3939
import okio.internal.commonToByteArray
4040
import okio.internal.commonToByteString
41+
import okio.internal.commonToFraction
42+
import okio.internal.commonToIndex
4143
import okio.internal.commonToString
4244
import okio.internal.commonUtf8
4345
import okio.internal.commonWrite
@@ -119,6 +121,10 @@ internal actual constructor(
119121

120122
actual override fun compareTo(other: ByteString) = commonCompareTo(other)
121123

124+
// Delegates to the shared implementation imported from okio.internal.
actual fun toIndex(size: Int): Int = commonToIndex(size)
125+
126+
// Delegates to the shared implementation imported from okio.internal.
actual fun toFraction(): Double = commonToFraction()
127+
122128
/**
123129
* Returns a human-readable string that describes the contents of this byte string. Typically this
124130
* is a string like `[text=Hello]` or `[hex=0000ffff]`.

okio/src/jvmMain/kotlin/okio/ByteString.kt

+6
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ import okio.internal.commonToAsciiLowercase
3838
import okio.internal.commonToAsciiUppercase
3939
import okio.internal.commonToByteArray
4040
import okio.internal.commonToByteString
41+
import okio.internal.commonToFraction
42+
import okio.internal.commonToIndex
4143
import okio.internal.commonToString
4244
import okio.internal.commonUtf8
4345
import okio.internal.commonWrite
@@ -181,6 +183,10 @@ internal actual constructor(
181183

182184
actual override fun compareTo(other: ByteString) = commonCompareTo(other)
183185

186+
// Delegates to the shared implementation imported from okio.internal.
actual fun toIndex(size: Int): Int = commonToIndex(size)
187+
188+
// Delegates to the shared implementation imported from okio.internal.
actual fun toFraction(): Double = commonToFraction()
189+
184190
actual override fun toString() = commonToString()
185191

186192
@Throws(IOException::class)

okio/src/nativeMain/kotlin/okio/ByteString.kt

+6
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ import okio.internal.commonToAsciiLowercase
3939
import okio.internal.commonToAsciiUppercase
4040
import okio.internal.commonToByteArray
4141
import okio.internal.commonToByteString
42+
import okio.internal.commonToFraction
43+
import okio.internal.commonToIndex
4244
import okio.internal.commonToString
4345
import okio.internal.commonUtf8
4446
import okio.internal.commonWrite
@@ -125,6 +127,10 @@ internal actual constructor(
125127

126128
actual override fun compareTo(other: ByteString) = commonCompareTo(other)
127129

130+
// Delegates to the shared implementation imported from okio.internal.
actual fun toIndex(size: Int): Int = commonToIndex(size)
131+
132+
// Delegates to the shared implementation imported from okio.internal.
actual fun toFraction(): Double = commonToFraction()
133+
128134
/**
129135
* Returns a human-readable string that describes the contents of this byte string. Typically this
130136
* is a string like `[text=Hello]` or `[hex=0000ffff]`.

0 commit comments

Comments
 (0)