-
Notifications
You must be signed in to change notification settings - Fork 0
/
strhash.c
286 lines (248 loc) · 6.96 KB
/
strhash.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/*
* Copyright (C) 1986, 1988, 1989, 1991-2012 the Free Software Foundation, Inc.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
/*
* The decision is made to grow the array if the average chain length is
* ``too big''. This is defined as the total number of entries in the table
* divided by the size of the array being greater than some constant.
*
* 11/2002: We make the constant a variable, so that it can be tweaked
* via environment variable.
*/
static int AVG_CHAIN_MAX = 2; /* 11/2002: Modern machines are bigger, cut this down from 10. */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#if HAVE_STDLIB_H
#include <stdlib.h>
#endif
#if HAVE_STRING_H
#include <string.h>
#endif
#include <sys/stat.h>
#include <stdio.h>
#include <errno.h>
#include "gawkextlib.h"
/* Are 2 counted strings equal? Note that STREQNN will match two empty ""
strings, whereas STREQN does not! */
/* Optimized version: check first char before trying memcmp. Is this
really faster with a modern compiler that may inline memcmp? */
#define STREQNN(S1,L1,S2,L2) \
(((L1) == (L2)) && \
(((L1) == 0) || ((*(S1) == *(S2)) && \
(memcmp((S1)+1,(S2)+1,(L1)-1) == 0))))
/* nuke definition in gawkapi.h */
#undef emalloc
#define emalloc(var,ty,x,str) \
do { \
if (!(var=(ty)malloc(x))) { \
fprintf(stderr, \
"%s: %s: can't allocate %ld bytes of memory (%s)", \
(str), #var, (long) (x), strerror(errno)); \
exit(1); \
} \
} while (0)
#define hash my_hash
/* my_hash --- calculate the hash function of the string in subs (copied
from gawk str_array.c, but not masking HASHC to 32 bits) */
static unsigned long
my_hash(const char *s, size_t len, unsigned long hsize, size_t *code)
{
unsigned long h = 0;
unsigned long htmp;
/*
* Ozan Yigit's original sdbm hash, copied from Margo Seltzers
* db package.
*
* This is INCREDIBLY ugly, but fast. We break the string up into
* 8 byte units. On the first time through the loop we get the
* "leftover bytes" (strlen % 8). On every other iteration, we
* perform 8 HASHC's so we handle all 8 bytes. Essentially, this
* saves us 7 cmp & branch instructions. If this routine is
* heavily used enough, it's worth the ugly coding.
*/
/*
* Even more speed:
* #define HASHC h = *s++ + 65599 * h
* Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts
*/
#define HASHC htmp = (h << 6); \
h = *s++ + htmp + (htmp << 10) - h
h = 0;
/* "Duff's Device" */
if (len > 0) {
size_t loop = (len + 8 - 1) >> 3;
switch (len & (8 - 1)) {
case 0:
do { /* All fall throughs */
HASHC;
case 7: HASHC;
case 6: HASHC;
case 5: HASHC;
case 4: HASHC;
case 3: HASHC;
case 2: HASHC;
case 1: HASHC;
} while (--loop);
}
}
if (code != NULL)
*code = h;
if (h >= hsize)
h %= hsize;
return h;
}
static unsigned long
get_table_size(unsigned long oldsize)
{
size_t i, j;
/*
* This is an array of primes. We grow the table by an order of
* magnitude each time (not just doubling) so that growing is a
* rare operation. We expect, on average, that it won't happen
* more than twice. The final size is also chosen to be small
* enough so that MS-DOG mallocs can handle it. When things are
* very large (> 8K), we just double more or less, instead of
* just jumping from 8K to 64K.
*/
static const unsigned long sizes[] = {
13, 127, 1021, 8191, 16381, 32749, 65497,
131101, 262147, 524309, 1048583, 2097169,
4194319, 8388617, 16777259, 33554467,
67108879, 134217757, 268435459, 536870923,
1073741827
};
/* find next biggest hash size */
for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) {
if (oldsize < sizes[i])
return sizes[i];
}
return sizes[j-1];
}
/* Generic string hash routines: */
struct _strhash {
strhash_entry **ht_index;
size_t index_size;
size_t entries;
int size_maxed;
};
strhash *
strhash_create(size_t min_table_size)
{
strhash *ht;
emalloc(ht, strhash *, sizeof(*ht), "strhash_create");
ht->index_size = get_table_size(min_table_size);
/* We need an ecalloc macro. */
emalloc(ht->ht_index, strhash_entry **,
ht->index_size*sizeof(*ht->ht_index), "strhash_create");
memset(ht->ht_index, 0, ht->index_size*sizeof(*ht->ht_index));
ht->entries = 0;
ht->size_maxed = 0;
return ht;
}
static void
strhash_grow(strhash *ht)
{
unsigned long newsize;
strhash_entry **new_index;
strhash_entry **bptr;
size_t i;
if ((newsize = get_table_size(ht->index_size)) <= ht->index_size) {
ht->size_maxed = 1;
return;
}
/* We need an ecalloc macro. */
emalloc(new_index, strhash_entry **,
newsize*sizeof(*new_index), "strhash_grow");
memset(new_index, 0, newsize*sizeof(*new_index));
for (bptr = ht->ht_index, i = 0; i < ht->index_size; i++, bptr++) {
strhash_entry *ent;
strhash_entry *nent;
for (ent = *bptr; ent; ent = nent) {
strhash_entry **nbptr;
nent = ent->next;
nbptr = &new_index[hash(ent->s, ent->len, newsize, NULL)];
ent->next = *nbptr;
*nbptr = ent;
}
}
free(ht->ht_index);
ht->ht_index = new_index;
ht->index_size = newsize;
}
strhash_entry *
strhash_get(strhash *ht, const char *s, size_t len, int insert_if_missing)
{
strhash_entry **bucket = &ht->ht_index[hash(s, len, ht->index_size, NULL)];
strhash_entry *ent;
/* Check if already present. */
for (ent = *bucket; ent; ent = ent->next) {
if (STREQNN(s, len, ent->s, ent->len))
return ent;
}
if (!insert_if_missing)
return NULL;
ht->entries++;
if (!ht->size_maxed && (ht->entries > AVG_CHAIN_MAX*ht->index_size))
strhash_grow(ht);
emalloc(ent, strhash_entry *, sizeof(*ent)+len, "strhash_insert");
ent->next = *bucket;
*bucket = ent;
ent->data = NULL;
ent->len = len;
memcpy(ent->s, s, len);
/* Add terminating NUL char just to be safe. */
ent->s[len] = '\0';
return ent;
}
int
strhash_delete(strhash *ht, const char *s, size_t len,
strhash_delete_func func, void *opaque)
{
strhash_entry **bucket = &ht->ht_index[hash(s, len, ht->index_size, NULL)];
strhash_entry *ent;
strhash_entry *prev = NULL;
/* Check if already present. */
for (ent = *bucket; ent; prev = ent, ent = ent->next) {
if (STREQNN(s, len, ent->s, ent->len)) {
/* Remove entry first before calling delete_func
to make sure the hashtable is in a consistent
state. */
if (prev)
prev->next = ent->next;
else
*bucket = ent->next;
ht->entries--;
if (func)
(*func)(ent->data, opaque, ht, ent);
free(ent);
return 0;
}
}
return -1;
}
void
strhash_destroy(strhash *ht, strhash_delete_func func, void *opaque)
{
size_t i;
strhash_entry **bptr;
for (bptr = ht->ht_index, i = 0; i < ht->index_size; i++, bptr++) {
while (*bptr != NULL) {
strhash_entry *ent;
ent = *bptr;
*bptr = ent->next;
if (func)
(*func)(ent->data, opaque, ht, ent);
free(ent);
}
}
free(ht->ht_index);
free(ht);
}