|
| 1 | +package org.jetbrains.kotlinx.spark.examples |
| 2 | + |
| 3 | +import org.apache.spark.sql.Dataset |
| 4 | +import org.jetbrains.kotlinx.spark.api.* |
| 5 | +import org.jetbrains.kotlinx.spark.api.tuples.X |
| 6 | +import org.jetbrains.kotlinx.spark.examples.GroupCalculation.getAllPossibleGroups |
| 7 | +import scala.Tuple2 |
| 8 | +import kotlin.math.pow |
| 9 | + |
/**
 * Example entry point: computes every unique, non repeating group of 4 indices for a list of 10 elements
 * and prints all of them.
 *
 * Example by Jolanrensen.
 */
fun main() = withSpark {
    // Build the groups and order them by their single "value" column.
    val sortedGroups = getAllPossibleGroups(listSize = 10, groupSize = 4).sort("value")

    // Ask for as many rows as the Dataset holds, so that every group is shown.
    val totalGroups = sortedGroups.count().toInt()

    sortedGroups.showDS(numRows = totalGroups)
}
| 22 | + |
object GroupCalculation {

    /**
     * Get all the possible, unique, non repeating groups (of size [groupSize]) of indices for a list of
     * size [listSize].
     *
     * Every group is given a unique key (see [getTupleValue]). Each member index of a group is emitted under
     * that key (see [addTuples]) and `aggregateByKey` then collects the members of a key into one [IntArray].
     * No ordering of the groups, or of the indices within a group, is imposed by this function.
     *
     * The workload is evenly distributed by [listSize] and [groupSize].
     *
     * If [groupSize] is larger than [listSize] no group exists and the result is empty.
     * If a key does not fit in a [Long], the key calculation fails with an [ArithmeticException]
     * (raised while the Spark job runs) instead of silently producing colliding keys.
     *
     * @param listSize the size of the list for which to calculate the indices
     * @param groupSize the size of a group of indices
     * @return all the possible, unique non repeating groups of indices
     */
    fun KSparkSession.getAllPossibleGroups(
        listSize: Int,
        groupSize: Int,
    ): Dataset<IntArray> {
        val indices = (0 until listSize).toList().toRDD() // Easy RDD creation!

        // for a groupSize of 1, no pairing up is needed, so just return the indices converted to IntArrays
        if (groupSize == 1) {
            return indices
                .mapPartitions { partition ->
                    partition.map { index -> intArrayOf(index) }
                }
                .toDS()
        }

        // this converts all indices to (number in table, index)
        val keys = indices.mapPartitions { partition ->

            // _1 is key (item in table), _2 is index in list
            partition.transformAsSequence {
                flatMap { listIndex ->

                    // for each dimension loop over the other dimensions using addTuples
                    (0 until groupSize).asSequence().flatMap { dimension ->
                        addTuples(
                            groupSize = groupSize,
                            value = listIndex,
                            listSize = listSize,
                            skipDimension = dimension,
                        )
                    }
                }
            }
        }

        // Since we have a JavaRDD<Tuple2> we can aggregateByKey!
        // Each number in the table occurs once per dimension as key, so every key receives exactly
        // [groupSize] values: the indices that make up that group.
        val allPossibleGroups = keys.aggregateByKey(
            // -1 marks a spot that has not been filled yet
            zeroValue = IntArray(groupSize) { -1 },
            seqFunc = { base: IntArray, listIndex: Int ->
                // put listIndex in the first empty spot in base
                val slot = base.indexOfFirst { it < 0 }
                check(slot >= 0) { "received more than ${base.size} indices for a single group" }
                base[slot] = listIndex

                base
            },

            // how to merge partially filled up int arrays
            combFunc = { a: IntArray, b: IntArray ->
                // copy the filled spots of b into the empty spots of a, in order
                var j = 0
                for (i in a.indices) {
                    if (a[i] >= 0) continue

                    // advance to the next filled spot of b; the bound is checked before b[j] is read
                    while (j < b.size && b[j] < 0) j++
                    if (j == b.size) break // b has nothing left to contribute

                    a[i] = b[j]
                    j++
                }
                a
            },
        )
            .values() // finally just take the values

        return allPossibleGroups.toDS()
    }

    /**
     * Simple method to give each index of x dimensions a unique number.
     *
     * The number is `indexTuple[0] + indexTuple[1] * listSize + indexTuple[2] * listSize^2 + ...`, calculated
     * with overflow-checked [Long] arithmetic so that two different tuples can never end up with the same number.
     *
     * @param indexTuple list (can be seen as Tuple) of size x with all values < listSize. The index for which to return the number
     * @param listSize The size of the list, aka the max width, height etc. of the table
     * @return the unique number for this [indexTuple]
     * @throws ArithmeticException if the number does not fit in a [Long]
     */
    private fun getTupleValue(indexTuple: List<Int>, listSize: Int): Long =
        // Horner's scheme, starting at the most significant (last) index
        indexTuple.foldRight(0L) { index, acc ->
            Math.addExact(Math.multiplyExact(acc, listSize.toLong()), index.toLong())
        }

    /**
     * To make sure that every tuple is only picked once, this method returns true only if the indices are in the right
     * corner of the matrix, i.e. if the indices are strictly decreasing.
     * This works for any number of dimensions > 1. Here is an example for 2-D:
     *
     * ```
     *  -   0  1  2  3  4  5  6  7  8  9
     *  --------------------------------
     *  0|  x  ✓  ✓  ✓  ✓  ✓  ✓  ✓  ✓  ✓
     *  1|  x  x  ✓  ✓  ✓  ✓  ✓  ✓  ✓  ✓
     *  2|  x  x  x  ✓  ✓  ✓  ✓  ✓  ✓  ✓
     *  3|  x  x  x  x  ✓  ✓  ✓  ✓  ✓  ✓
     *  4|  x  x  x  x  x  ✓  ✓  ✓  ✓  ✓
     *  5|  x  x  x  x  x  x  ✓  ✓  ✓  ✓
     *  6|  x  x  x  x  x  x  x  ✓  ✓  ✓
     *  7|  x  x  x  x  x  x  x  x  ✓  ✓
     *  8|  x  x  x  x  x  x  x  x  x  ✓
     *  9|  x  x  x  x  x  x  x  x  x  x
     * ```
     *
     * @param indexTuple a tuple of indices in the form of a list, must contain at least 2 indices
     * @return true if this tuple is in the right corner and should be included
     * @throws IllegalArgumentException if [indexTuple] contains fewer than 2 indices
     */
    private fun isValidIndexTuple(indexTuple: List<Int>): Boolean {
        // x > y; 2d
        // x > y && y > z; 3d
        // x > y && y > z && z > a; 4d
        // Comparing neighbours is enough: if each index is larger than the next one,
        // it is also larger than all the ones after that.
        require(indexTuple.size >= 2) { "not a tuple" }
        return (1 until indexTuple.size).all { indexTuple[it - 1] > indexTuple[it] }
    }

    /**
     * Recursive method that for [skipDimension] loops over all the other dimensions and returns all results from
     * [getTupleValue] as key and [value] as value.
     * In the end, the return value will have, for each key in the table below, a value for the key's column, row etc.
     *
     * This is an example for 2D. The letters will be int indices as well (a = 0, b = 1, ..., [listSize]), but help for clarification.
     * The numbers we don't want are filtered out using [isValidIndexTuple]; branches that can only lead to such
     * numbers are not visited at all.
     * The actual value of the number in the table comes from [getTupleValue].
     *
     * ```
     *  -   a   b   c   d   e   f   g   h   i   j
     *  ------------------------------------------
     *  a|  -   1   2   3   4   5   6   7   8   9
     *  b|  -   -  12  13  14  15  16  17  18  19
     *  c|  -   -   -  23  24  25  26  27  28  29
     *  d|  -   -   -   -  34  35  36  37  38  39
     *  e|  -   -   -   -   -  45  46  47  48  49
     *  f|  -   -   -   -   -   -  56  57  58  59
     *  g|  -   -   -   -   -   -   -  67  68  69
     *  h|  -   -   -   -   -   -   -   -  78  79
     *  i|  -   -   -   -   -   -   -   -   -  89
     *  j|  -   -   -   -   -   -   -   -   -   -
     * ```
     *
     * @param groupSize the size of index tuples to form
     * @param value the current index to work from (can be seen as a letter in the table above)
     * @param listSize the size of the list to make
     * @param skipDimension the current dimension that will have a set value [value] while looping over the other dimensions
     * @return a (key, [value]) pair for every valid index tuple that has [value] at position [skipDimension]
     */
    private fun addTuples(
        groupSize: Int,
        value: Int,
        listSize: Int,
        skipDimension: Int,
    ): List<Tuple2<Long, Int>> {
        val result = mutableListOf<Tuple2<Long, Int>>()

        /**
         * @param currentDimension the indicator for which dimension we're currently calculating for (and how deep in the recursion we are)
         * @param indexTuple the list (or tuple) in which the indices chosen so far are stored
         */
        fun recursiveCall(currentDimension: Int, indexTuple: List<Int>) {
            // A valid tuple is strictly decreasing, so the next index has to be smaller than the previous one.
            val previous = indexTuple.lastOrNull()

            when {
                // base case
                currentDimension >= groupSize -> {
                    if (isValidIndexTuple(indexTuple)) {
                        result.add(getTupleValue(indexTuple, listSize) X value)
                    }
                }

                // this dimension is fixed to [value]; give up on this branch if that cannot be valid anymore
                currentDimension == skipDimension -> {
                    if (previous == null || value < previous) {
                        recursiveCall(currentDimension + 1, indexTuple + value)
                    }
                }

                // try only the indices that keep the tuple strictly decreasing
                else -> {
                    val upperBound = if (previous == null) listSize else minOf(listSize, previous)
                    for (i in 0 until upperBound) {
                        recursiveCall(currentDimension + 1, indexTuple + i)
                    }
                }
            }
        }

        recursiveCall(currentDimension = 0, indexTuple = emptyList())
        return result
    }
}
0 commit comments