Skip to content

Commit 6ec521f

Browse files
authored
Merge pull request #16 from bakwc/speedOptimize
Improved performance
2 parents c9ba256 + e32222f commit 6ec521f

18 files changed

+2832
-15
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ JamSpell is a spell checking library with following features:
4242
<td>79.53%</td>
4343
<td>84.10%</td>
4444
<td>0.64%</td>
45-
<td>1833</td>
45+
<td>4854</td>
4646
</tr>
4747
<tr>
4848
<td>Norvig</td>
@@ -103,7 +103,7 @@ To ensure that our model is not too overfitted for wikipedia+news we checked it
103103
<td>72.03%</td>
104104
<td>79.73%</td>
105105
<td>0.50%</td>
106-
<td>1764</td>
106+
<td>5524</td>
107107
</tr>
108108
<tr>
109109
<td>Norvig</td>

contrib/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ add_subdirectory(handypack)
33
add_subdirectory(phf)
44
add_subdirectory(cityhash)
55
add_subdirectory(bloom)
6-
6+
add_subdirectory(tsl)

contrib/tsl/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
add_library(tsl robin_map.cpp robin_map.h robin_hash.h robin_set.h)

contrib/tsl/LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2017 Tessil
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

contrib/tsl/robin_growth_policy.h

+270
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
/**
2+
* MIT License
3+
*
4+
* Copyright (c) 2017 Tessil
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
#ifndef TSL_ROBIN_GROWTH_POLICY_H
25+
#define TSL_ROBIN_GROWTH_POLICY_H
26+
27+
28+
#include <algorithm>
29+
#include <array>
30+
#include <climits>
31+
#include <cmath>
32+
#include <cstddef>
33+
#include <iterator>
34+
#include <limits>
35+
#include <ratio>
36+
#include <stdexcept>
37+
38+
39+
namespace tsl {
40+
namespace rh {
41+
42+
/**
 * Growth policy that keeps the bucket count a power of two and multiplies it by
 * GrowthFactor each time the table grows. With a power-of-two bucket count a hash
 * is mapped to its bucket with a bit mask instead of a modulo operation.
 *
 * GrowthFactor must be a power of two >= 2 (checked by a static_assert below).
 */
template<std::size_t GrowthFactor>
class power_of_two_growth_policy {
public:
    /**
     * Called on hash table creation and on rehash.
     *
     * @param min_bucket_count_in_out In: the minimum number of buckets wanted.
     *        Out: the bucket count actually selected, i.e. the request raised to at
     *        least MIN_BUCKETS_SIZE and then rounded up to a power of two. The value
     *        written back is never lower than the value passed in.
     * @throws std::length_error if the request is greater than max_bucket_count().
     */
    power_of_two_growth_policy(std::size_t& min_bucket_count_in_out) {
        if(min_bucket_count_in_out > max_bucket_count()) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        static_assert(MIN_BUCKETS_SIZE > 0, "MIN_BUCKETS_SIZE must be > 0.");
        // std::max takes its arguments by reference; a local copy is passed rather
        // than the static member itself (presumably so the in-class initialized
        // constant needs no out-of-class definition).
        const std::size_t min_bucket_count = MIN_BUCKETS_SIZE;

        min_bucket_count_in_out = std::max(min_bucket_count, min_bucket_count_in_out);
        min_bucket_count_in_out = round_up_to_power_of_two(min_bucket_count_in_out);
        // bucket_count - 1 has every bit below the single set bit of bucket_count set.
        m_mask = min_bucket_count_in_out - 1;
    }

    /**
     * Return the bucket in [0, bucket_count()) to which the hash belongs.
     */
    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
        return hash & m_mask;
    }

    /**
     * Return the bucket count to use when the bucket array grows on rehash:
     * the current bucket count (m_mask + 1) multiplied by GrowthFactor.
     *
     * @throws std::length_error if the result would exceed max_bucket_count().
     */
    std::size_t next_bucket_count() const {
        // Divide the limit instead of multiplying the count so the check itself
        // cannot overflow.
        if((m_mask + 1) > max_bucket_count() / GrowthFactor) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        return (m_mask + 1) * GrowthFactor;
    }

    /**
     * Return the maximum number of buckets supported by the policy.
     */
    std::size_t max_bucket_count() const {
        // Largest power of two representable in std::size_t.
        return (std::numeric_limits<std::size_t>::max() / 2) + 1;
    }

private:
    /**
     * Return the smallest power of two >= value; 0 maps to 1 and a value that is
     * already a power of two is returned unchanged.
     */
    static std::size_t round_up_to_power_of_two(std::size_t value) {
        if(is_power_of_two(value)) {
            return value;
        }

        if(value == 0) {
            return 1;
        }

        // Copy the highest set bit of (value - 1) into every lower bit, then add one.
        --value;
        for(std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) {
            value |= value >> i;
        }

        return value + 1;
    }

    /**
     * Return true if value is non-zero and has exactly one bit set.
     */
    static constexpr bool is_power_of_two(std::size_t value) {
        return value != 0 && (value & (value - 1)) == 0;
    }

protected:
    // Lower bound applied to every bucket count request in the constructor.
    static const std::size_t MIN_BUCKETS_SIZE = 2;
    static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2, "GrowthFactor must be a power of two >= 2.");

    // Current bucket count minus one; ANDed with a hash in bucket_for_hash().
    std::size_t m_mask;
};
122+
123+
124+
/**
 * Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo to map
 * a hash to a bucket. Slower than a mask, but it can be useful if you want a slower growth.
 *
 * The growth factor must be >= 1.1 (checked by a static_assert below).
 */
template<class GrowthFactor = std::ratio<3, 2>>
class mod_growth_policy {
public:
    /**
     * Called on hash table creation and on rehash.
     *
     * @param min_bucket_count_in_out In: the minimum number of buckets wanted.
     *        Out: the bucket count actually selected, i.e. the request raised to at
     *        least MIN_BUCKETS_SIZE.
     * @throws std::length_error if the request is greater than max_bucket_count().
     */
    mod_growth_policy(std::size_t& min_bucket_count_in_out) {
        if(min_bucket_count_in_out > max_bucket_count()) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        static_assert(MIN_BUCKETS_SIZE > 0, "MIN_BUCKETS_SIZE must be > 0.");
        // std::max takes its arguments by reference; a local copy is passed rather
        // than the static member itself (presumably so the in-class initialized
        // constant needs no out-of-class definition).
        const std::size_t min_bucket_count = MIN_BUCKETS_SIZE;

        min_bucket_count_in_out = std::max(min_bucket_count, min_bucket_count_in_out);
        m_bucket_count = min_bucket_count_in_out;
    }

    /**
     * Return the bucket in [0, bucket_count()) to which the hash belongs.
     */
    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
        return hash % m_bucket_count;
    }

    /**
     * Return the bucket count to use when the bucket array grows on rehash:
     * ceil(current bucket count * growth factor), clamped to max_bucket_count().
     *
     * @throws std::length_error if the table is already at max_bucket_count() or
     *         if the computed count is not a normal floating-point value.
     */
    std::size_t next_bucket_count() const {
        if(m_bucket_count == max_bucket_count()) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        const double next_bucket_count = std::ceil(double(m_bucket_count) * REHASH_SIZE_MULTIPLICATION_FACTOR);
        if(!std::isnormal(next_bucket_count)) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        if(next_bucket_count > double(max_bucket_count())) {
            return max_bucket_count();
        }
        else {
            return std::size_t(next_bucket_count);
        }
    }

    /**
     * Return the maximum number of buckets supported by the policy.
     */
    std::size_t max_bucket_count() const {
        return MAX_BUCKET_COUNT;
    }

private:
    // Lower bound applied to every bucket count request in the constructor.
    static const std::size_t MIN_BUCKETS_SIZE = 2;
    // GrowthFactor expressed as a floating-point multiplier.
    static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR = 1.0 * GrowthFactor::num / GrowthFactor::den;
    // Largest std::size_t value divided by the growth factor.
    static const std::size_t MAX_BUCKET_COUNT =
            std::size_t(double(
                    std::numeric_limits<std::size_t>::max() / REHASH_SIZE_MULTIPLICATION_FACTOR
            ));

    static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1, "Growth factor should be >= 1.1.");

    // Current bucket count; the divisor used by bucket_for_hash().
    std::size_t m_bucket_count;
};
181+
182+
183+
184+
namespace detail {

// Bucket counts available to prime_growth_policy, in increasing order.
static constexpr const std::array<std::size_t, 39> PRIMES = {{
    5ul, 17ul, 29ul, 37ul, 53ul, 67ul, 79ul, 97ul, 131ul, 193ul, 257ul, 389ul, 521ul, 769ul, 1031ul, 1543ul, 2053ul,
    3079ul, 6151ul, 12289ul, 24593ul, 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, 3145739ul,
    6291469ul, 12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
    1610612741ul, 3221225473ul, 4294967291ul
}};

// Return hash % PRIMES[IPrime], with the prime selected at compile time.
template<unsigned int IPrime>
static constexpr std::size_t mod(std::size_t hash) { return hash % PRIMES[IPrime]; }

// MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows for faster modulo as the
// compiler can optimize the modulo code better with a constant known at the compilation.
static constexpr const std::array<std::size_t(*)(std::size_t), 39> MOD_PRIME = {{
    &mod<0>, &mod<1>, &mod<2>, &mod<3>, &mod<4>, &mod<5>, &mod<6>, &mod<7>, &mod<8>, &mod<9>, &mod<10>,
    &mod<11>, &mod<12>, &mod<13>, &mod<14>, &mod<15>, &mod<16>, &mod<17>, &mod<18>, &mod<19>, &mod<20>,
    &mod<21>, &mod<22>, &mod<23>, &mod<24>, &mod<25>, &mod<26>, &mod<27>, &mod<28>, &mod<29>, &mod<30>,
    &mod<31>, &mod<32>, &mod<33>, &mod<34>, &mod<35>, &mod<36>, &mod<37>, &mod<38>
}};

}

/**
 * Grow the hash table by using prime numbers as bucket count. Slower than tsl::rh::power_of_two_growth_policy in
 * general but will probably distribute the values around better in the buckets with a poor hash function.
 *
 * To allow the compiler to optimize the modulo operation, a lookup table is used with constant prime numbers.
 *
 * With a switch the code would look like:
 * \code
 * switch(iprime) { // iprime is the current prime of the hash table
 *     case 0: hash % 5ul;
 *             break;
 *     case 1: hash % 17ul;
 *             break;
 *     case 2: hash % 29ul;
 *             break;
 *     ...
 * }
 * \endcode
 *
 * Due to the constant variable in the modulo the compiler is able to optimize the operation
 * by a series of multiplications, subtractions and shifts.
 *
 * The 'hash % 5' could become something like 'hash - ((hash * 0xCCCCCCCD) >> 34) * 5' in a 64 bits environment.
 */
class prime_growth_policy {
public:
    /**
     * Called on hash table creation and on rehash.
     *
     * @param min_bucket_count_in_out In: the minimum number of buckets wanted.
     *        Out: the first entry of detail::PRIMES that is >= the request.
     * @throws std::length_error if the request is greater than every entry of detail::PRIMES.
     */
    prime_growth_policy(std::size_t& min_bucket_count_in_out) {
        // PRIMES is sorted, so a binary search finds the smallest prime >= the request.
        auto it_prime = std::lower_bound(detail::PRIMES.begin(),
                                         detail::PRIMES.end(), min_bucket_count_in_out);
        if(it_prime == detail::PRIMES.end()) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        m_iprime = static_cast<unsigned int>(std::distance(detail::PRIMES.begin(), it_prime));
        min_bucket_count_in_out = *it_prime;
    }

    /**
     * Return the bucket in [0, bucket_count()) to which the hash belongs.
     */
    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
        // Dispatch to the modulo function specialized for the current prime.
        return detail::MOD_PRIME[m_iprime](hash);
    }

    /**
     * Return the bucket count to use when the bucket array grows on rehash:
     * the entry of detail::PRIMES following the current one.
     *
     * @throws std::length_error if the current prime is already the last entry.
     */
    std::size_t next_bucket_count() const {
        if(m_iprime + 1 >= detail::PRIMES.size()) {
            throw std::length_error("The hash table exceeds its maximum size.");
        }

        return detail::PRIMES[m_iprime + 1];
    }

    /**
     * Return the maximum number of buckets supported by the policy.
     */
    std::size_t max_bucket_count() const {
        return detail::PRIMES.back();
    }

private:
    // Index into detail::PRIMES / detail::MOD_PRIME of the current bucket count.
    unsigned int m_iprime;

    static_assert(std::numeric_limits<decltype(m_iprime)>::max() >= detail::PRIMES.size(),
                  "The type of m_iprime is not big enough.");
};
266+
267+
}
268+
}
269+
270+
#endif

0 commit comments

Comments
 (0)