Lines 24-98
Link Here
|
24 |
* |
24 |
* |
25 |
*/ |
25 |
*/ |
26 |
|
26 |
|
|
|
27 |
#define _GNU_SOURCE |
28 |
|
27 |
#include <stdio.h> |
29 |
#include <stdio.h> |
28 |
#include <stdlib.h> |
30 |
#include <stdlib.h> |
29 |
#include <string.h> |
31 |
#include <string.h> |
30 |
#include <ctype.h> |
32 |
#include <ctype.h> |
31 |
|
33 |
|
32 |
/* maximum token length used. It doesn't pay to increase it a lot, because |
|
|
33 |
* very long substrings probably don't repeat themselves too often. */ |
34 |
#define MAX_TOK_SIZE 11 |
35 |
#define KSYM_NAME_LEN 127 |
34 |
#define KSYM_NAME_LEN 127 |
36 |
|
35 |
|
37 |
/* we use only a subset of the complete symbol table to gather the token count, |
|
|
38 |
* to speed up compression, at the expense of a little compression ratio */ |
39 |
#define WORKING_SET 1024 |
40 |
|
41 |
/* first find the best token only on the list of tokens that would profit more |
42 |
* than GOOD_BAD_THRESHOLD. Only if this list is empty go to the "bad" list. |
43 |
* Increasing this value will put less tokens on the "good" list, so the search |
44 |
* is faster. However, if the good list runs out of tokens, we must painfully |
45 |
* search the bad list. */ |
46 |
#define GOOD_BAD_THRESHOLD 10 |
47 |
|
48 |
/* token hash parameters */ |
49 |
#define HASH_BITS 18 |
50 |
#define HASH_TABLE_SIZE (1 << HASH_BITS) |
51 |
#define HASH_MASK (HASH_TABLE_SIZE - 1) |
52 |
#define HASH_BASE_OFFSET 2166136261U |
53 |
#define HASH_FOLD(a) ((a)&(HASH_MASK)) |
54 |
|
55 |
/* flags to mark symbols */ |
56 |
#define SYM_FLAG_VALID 1 |
57 |
#define SYM_FLAG_SAMPLED 2 |
58 |
|
36 |
|
59 |
struct sym_entry { |
37 |
struct sym_entry { |
60 |
unsigned long long addr; |
38 |
unsigned long long addr; |
61 |
char type; |
39 |
unsigned int len; |
62 |
unsigned char flags; |
|
|
63 |
unsigned char len; |
64 |
unsigned char *sym; |
40 |
unsigned char *sym; |
65 |
}; |
41 |
}; |
66 |
|
42 |
|
67 |
|
43 |
|
68 |
static struct sym_entry *table; |
44 |
static struct sym_entry *table; |
69 |
static int size, cnt; |
45 |
static unsigned int table_size, table_cnt; |
70 |
static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext; |
46 |
static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext; |
71 |
static int all_symbols = 0; |
47 |
static int all_symbols = 0; |
72 |
static char symbol_prefix_char = '\0'; |
48 |
static char symbol_prefix_char = '\0'; |
73 |
|
49 |
|
74 |
struct token { |
50 |
int token_profit[0x10000]; |
75 |
unsigned char data[MAX_TOK_SIZE]; |
|
|
76 |
unsigned char len; |
77 |
/* profit: the number of bytes that could be saved by inserting this |
78 |
* token into the table */ |
79 |
int profit; |
80 |
struct token *next; /* next token on the hash list */ |
81 |
struct token *right; /* next token on the good/bad list */ |
82 |
struct token *left; /* previous token on the good/bad list */ |
83 |
struct token *smaller; /* token that is less one letter than this one */ |
84 |
}; |
85 |
|
86 |
struct token bad_head, good_head; |
87 |
struct token *hash_table[HASH_TABLE_SIZE]; |
88 |
|
51 |
|
89 |
/* the table that holds the result of the compression */ |
52 |
/* the table that holds the result of the compression */ |
90 |
unsigned char best_table[256][MAX_TOK_SIZE+1]; |
53 |
unsigned char best_table[256][2]; |
91 |
unsigned char best_table_len[256]; |
54 |
unsigned char best_table_len[256]; |
92 |
|
55 |
|
93 |
|
56 |
|
94 |
static void |
57 |
static void usage(void) |
95 |
usage(void) |
|
|
96 |
{ |
58 |
{ |
97 |
fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n"); |
59 |
fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n"); |
98 |
exit(1); |
60 |
exit(1); |
Lines 102-122
usage(void)
Link Here
|
102 |
* This ignores the intensely annoying "mapping symbols" found |
64 |
* This ignores the intensely annoying "mapping symbols" found |
103 |
* in ARM ELF files: $a, $t and $d. |
65 |
* in ARM ELF files: $a, $t and $d. |
104 |
*/ |
66 |
*/ |
105 |
static inline int |
67 |
static inline int is_arm_mapping_symbol(const char *str) |
106 |
is_arm_mapping_symbol(const char *str) |
|
|
107 |
{ |
68 |
{ |
108 |
return str[0] == '$' && strchr("atd", str[1]) |
69 |
return str[0] == '$' && strchr("atd", str[1]) |
109 |
&& (str[2] == '\0' || str[2] == '.'); |
70 |
&& (str[2] == '\0' || str[2] == '.'); |
110 |
} |
71 |
} |
111 |
|
72 |
|
112 |
static int |
73 |
static int read_symbol(FILE *in, struct sym_entry *s) |
113 |
read_symbol(FILE *in, struct sym_entry *s) |
|
|
114 |
{ |
74 |
{ |
115 |
char str[500]; |
75 |
char str[500]; |
116 |
char *sym; |
76 |
char *sym, stype; |
117 |
int rc; |
77 |
int rc; |
118 |
|
78 |
|
119 |
rc = fscanf(in, "%llx %c %499s\n", &s->addr, &s->type, str); |
79 |
rc = fscanf(in, "%llx %c %499s\n", &s->addr, &stype, str); |
120 |
if (rc != 3) { |
80 |
if (rc != 3) { |
121 |
if (rc != EOF) { |
81 |
if (rc != EOF) { |
122 |
/* skip line */ |
82 |
/* skip line */ |
Lines 143-149
read_symbol(FILE *in, struct sym_entry *
Link Here
|
143 |
_sextratext = s->addr; |
103 |
_sextratext = s->addr; |
144 |
else if (strcmp(sym, "_eextratext") == 0) |
104 |
else if (strcmp(sym, "_eextratext") == 0) |
145 |
_eextratext = s->addr; |
105 |
_eextratext = s->addr; |
146 |
else if (toupper(s->type) == 'A') |
106 |
else if (toupper(stype) == 'A') |
147 |
{ |
107 |
{ |
148 |
/* Keep these useful absolute symbols */ |
108 |
/* Keep these useful absolute symbols */ |
149 |
if (strcmp(sym, "__kernel_syscall_via_break") && |
109 |
if (strcmp(sym, "__kernel_syscall_via_break") && |
Lines 153-174
read_symbol(FILE *in, struct sym_entry *
Link Here
|
153 |
return -1; |
113 |
return -1; |
154 |
|
114 |
|
155 |
} |
115 |
} |
156 |
else if (toupper(s->type) == 'U' || |
116 |
else if (toupper(stype) == 'U' || |
157 |
is_arm_mapping_symbol(sym)) |
117 |
is_arm_mapping_symbol(sym)) |
158 |
return -1; |
118 |
return -1; |
159 |
|
119 |
|
160 |
/* include the type field in the symbol name, so that it gets |
120 |
/* include the type field in the symbol name, so that it gets |
161 |
* compressed together */ |
121 |
* compressed together */ |
162 |
s->len = strlen(str) + 1; |
122 |
s->len = strlen(str) + 1; |
163 |
s->sym = (char *) malloc(s->len + 1); |
123 |
s->sym = malloc(s->len + 1); |
164 |
strcpy(s->sym + 1, str); |
124 |
strcpy((char *)s->sym + 1, str); |
165 |
s->sym[0] = s->type; |
125 |
s->sym[0] = stype; |
166 |
|
126 |
|
167 |
return 0; |
127 |
return 0; |
168 |
} |
128 |
} |
169 |
|
129 |
|
170 |
static int |
130 |
static int symbol_valid(struct sym_entry *s) |
171 |
symbol_valid(struct sym_entry *s) |
|
|
172 |
{ |
131 |
{ |
173 |
/* Symbols which vary between passes. Passes 1 and 2 must have |
132 |
/* Symbols which vary between passes. Passes 1 and 2 must have |
174 |
* identical symbol lists. The kallsyms_* symbols below are only added |
133 |
* identical symbol lists. The kallsyms_* symbols below are only added |
Lines 214-243
symbol_valid(struct sym_entry *s)
Link Here
|
214 |
} |
173 |
} |
215 |
|
174 |
|
216 |
/* Exclude symbols which vary between passes. */ |
175 |
/* Exclude symbols which vary between passes. */ |
217 |
if (strstr(s->sym + offset, "_compiled.")) |
176 |
if (strstr((char *)s->sym + offset, "_compiled.")) |
218 |
return 0; |
177 |
return 0; |
219 |
|
178 |
|
220 |
for (i = 0; special_symbols[i]; i++) |
179 |
for (i = 0; special_symbols[i]; i++) |
221 |
if( strcmp(s->sym + offset, special_symbols[i]) == 0 ) |
180 |
if( strcmp((char *)s->sym + offset, special_symbols[i]) == 0 ) |
222 |
return 0; |
181 |
return 0; |
223 |
|
182 |
|
224 |
return 1; |
183 |
return 1; |
225 |
} |
184 |
} |
226 |
|
185 |
|
227 |
static void |
186 |
static void read_map(FILE *in) |
228 |
read_map(FILE *in) |
|
|
229 |
{ |
187 |
{ |
230 |
while (!feof(in)) { |
188 |
while (!feof(in)) { |
231 |
if (cnt >= size) { |
189 |
if (table_cnt >= table_size) { |
232 |
size += 10000; |
190 |
table_size += 10000; |
233 |
table = realloc(table, sizeof(*table) * size); |
191 |
table = realloc(table, sizeof(*table) * table_size); |
234 |
if (!table) { |
192 |
if (!table) { |
235 |
fprintf(stderr, "out of memory\n"); |
193 |
fprintf(stderr, "out of memory\n"); |
236 |
exit (1); |
194 |
exit (1); |
237 |
} |
195 |
} |
238 |
} |
196 |
} |
239 |
if (read_symbol(in, &table[cnt]) == 0) |
197 |
if (read_symbol(in, &table[table_cnt]) == 0) |
240 |
cnt++; |
198 |
table_cnt++; |
241 |
} |
199 |
} |
242 |
} |
200 |
} |
243 |
|
201 |
|
Lines 281-290
static int expand_symbol(unsigned char *
Link Here
|
281 |
return total; |
239 |
return total; |
282 |
} |
240 |
} |
283 |
|
241 |
|
284 |
static void |
242 |
static void write_src(void) |
285 |
write_src(void) |
|
|
286 |
{ |
243 |
{ |
287 |
int i, k, off, valid; |
244 |
unsigned int i, k, off; |
288 |
unsigned int best_idx[256]; |
245 |
unsigned int best_idx[256]; |
289 |
unsigned int *markers; |
246 |
unsigned int *markers; |
290 |
char buf[KSYM_NAME_LEN+1]; |
247 |
char buf[KSYM_NAME_LEN+1]; |
Lines 301-333
write_src(void)
Link Here
|
301 |
printf(".data\n"); |
258 |
printf(".data\n"); |
302 |
|
259 |
|
303 |
output_label("kallsyms_addresses"); |
260 |
output_label("kallsyms_addresses"); |
304 |
valid = 0; |
261 |
for (i = 0; i < table_cnt; i++) { |
305 |
for (i = 0; i < cnt; i++) { |
262 |
printf("\tPTR\t%#llx\n", table[i].addr); |
306 |
if (table[i].flags & SYM_FLAG_VALID) { |
|
|
307 |
printf("\tPTR\t%#llx\n", table[i].addr); |
308 |
valid++; |
309 |
} |
310 |
} |
263 |
} |
311 |
printf("\n"); |
264 |
printf("\n"); |
312 |
|
265 |
|
313 |
output_label("kallsyms_num_syms"); |
266 |
output_label("kallsyms_num_syms"); |
314 |
printf("\tPTR\t%d\n", valid); |
267 |
printf("\tPTR\t%d\n", table_cnt); |
315 |
printf("\n"); |
268 |
printf("\n"); |
316 |
|
269 |
|
317 |
/* table of offset markers, that give the offset in the compressed stream |
270 |
/* table of offset markers, that give the offset in the compressed stream |
318 |
* every 256 symbols */ |
271 |
* every 256 symbols */ |
319 |
markers = (unsigned int *) malloc(sizeof(unsigned int)*((valid + 255) / 256)); |
272 |
markers = (unsigned int *) malloc(sizeof(unsigned int) * ((table_cnt + 255) / 256)); |
320 |
|
273 |
|
321 |
output_label("kallsyms_names"); |
274 |
output_label("kallsyms_names"); |
322 |
valid = 0; |
|
|
323 |
off = 0; |
275 |
off = 0; |
324 |
for (i = 0; i < cnt; i++) { |
276 |
for (i = 0; i < table_cnt; i++) { |
325 |
|
277 |
if ((i & 0xFF) == 0) |
326 |
if (!table[i].flags & SYM_FLAG_VALID) |
278 |
markers[i >> 8] = off; |
327 |
continue; |
|
|
328 |
|
329 |
if ((valid & 0xFF) == 0) |
330 |
markers[valid >> 8] = off; |
331 |
|
279 |
|
332 |
printf("\t.byte 0x%02x", table[i].len); |
280 |
printf("\t.byte 0x%02x", table[i].len); |
333 |
for (k = 0; k < table[i].len; k++) |
281 |
for (k = 0; k < table[i].len; k++) |
Lines 335-346
write_src(void)
Link Here
|
335 |
printf("\n"); |
283 |
printf("\n"); |
336 |
|
284 |
|
337 |
off += table[i].len + 1; |
285 |
off += table[i].len + 1; |
338 |
valid++; |
|
|
339 |
} |
286 |
} |
340 |
printf("\n"); |
287 |
printf("\n"); |
341 |
|
288 |
|
342 |
output_label("kallsyms_markers"); |
289 |
output_label("kallsyms_markers"); |
343 |
for (i = 0; i < ((valid + 255) >> 8); i++) |
290 |
for (i = 0; i < ((table_cnt + 255) >> 8); i++) |
344 |
printf("\tPTR\t%d\n", markers[i]); |
291 |
printf("\tPTR\t%d\n", markers[i]); |
345 |
printf("\n"); |
292 |
printf("\n"); |
346 |
|
293 |
|
Lines 350-356
write_src(void)
Link Here
|
350 |
off = 0; |
297 |
off = 0; |
351 |
for (i = 0; i < 256; i++) { |
298 |
for (i = 0; i < 256; i++) { |
352 |
best_idx[i] = off; |
299 |
best_idx[i] = off; |
353 |
expand_symbol(best_table[i],best_table_len[i],buf); |
300 |
expand_symbol(best_table[i], best_table_len[i], buf); |
354 |
printf("\t.asciz\t\"%s\"\n", buf); |
301 |
printf("\t.asciz\t\"%s\"\n", buf); |
355 |
off += strlen(buf) + 1; |
302 |
off += strlen(buf) + 1; |
356 |
} |
303 |
} |
Lines 365-517
write_src(void)
Link Here
|
365 |
|
312 |
|
366 |
/* table lookup compression functions */ |
313 |
/* table lookup compression functions */ |
367 |
|
314 |
|
368 |
static inline unsigned int rehash_token(unsigned int hash, unsigned char data) |
|
|
369 |
{ |
370 |
return ((hash * 16777619) ^ data); |
371 |
} |
372 |
|
373 |
static unsigned int hash_token(unsigned char *data, int len) |
374 |
{ |
375 |
unsigned int hash=HASH_BASE_OFFSET; |
376 |
int i; |
377 |
|
378 |
for (i = 0; i < len; i++) |
379 |
hash = rehash_token(hash, data[i]); |
380 |
|
381 |
return HASH_FOLD(hash); |
382 |
} |
383 |
|
384 |
/* find a token given its data and hash value */ |
385 |
static struct token *find_token_hash(unsigned char *data, int len, unsigned int hash) |
386 |
{ |
387 |
struct token *ptr; |
388 |
|
389 |
ptr = hash_table[hash]; |
390 |
|
391 |
while (ptr) { |
392 |
if ((ptr->len == len) && (memcmp(ptr->data, data, len) == 0)) |
393 |
return ptr; |
394 |
ptr=ptr->next; |
395 |
} |
396 |
|
397 |
return NULL; |
398 |
} |
399 |
|
400 |
static inline void insert_token_in_group(struct token *head, struct token *ptr) |
401 |
{ |
402 |
ptr->right = head->right; |
403 |
ptr->right->left = ptr; |
404 |
head->right = ptr; |
405 |
ptr->left = head; |
406 |
} |
407 |
|
408 |
static inline void remove_token_from_group(struct token *ptr) |
409 |
{ |
410 |
ptr->left->right = ptr->right; |
411 |
ptr->right->left = ptr->left; |
412 |
} |
413 |
|
414 |
|
415 |
/* build the counts for all the tokens that start with "data", and have lenghts |
416 |
* from 2 to "len" */ |
417 |
static void learn_token(unsigned char *data, int len) |
418 |
{ |
419 |
struct token *ptr,*last_ptr; |
420 |
int i, newprofit; |
421 |
unsigned int hash = HASH_BASE_OFFSET; |
422 |
unsigned int hashes[MAX_TOK_SIZE + 1]; |
423 |
|
424 |
if (len > MAX_TOK_SIZE) |
425 |
len = MAX_TOK_SIZE; |
426 |
|
427 |
/* calculate and store the hash values for all the sub-tokens */ |
428 |
hash = rehash_token(hash, data[0]); |
429 |
for (i = 2; i <= len; i++) { |
430 |
hash = rehash_token(hash, data[i-1]); |
431 |
hashes[i] = HASH_FOLD(hash); |
432 |
} |
433 |
|
434 |
last_ptr = NULL; |
435 |
ptr = NULL; |
436 |
|
437 |
for (i = len; i >= 2; i--) { |
438 |
hash = hashes[i]; |
439 |
|
440 |
if (!ptr) ptr = find_token_hash(data, i, hash); |
441 |
|
442 |
if (!ptr) { |
443 |
/* create a new token entry */ |
444 |
ptr = (struct token *) malloc(sizeof(*ptr)); |
445 |
|
446 |
memcpy(ptr->data, data, i); |
447 |
ptr->len = i; |
448 |
|
449 |
/* when we create an entry, it's profit is 0 because |
450 |
* we also take into account the size of the token on |
451 |
* the compressed table. We then subtract GOOD_BAD_THRESHOLD |
452 |
* so that the test to see if this token belongs to |
453 |
* the good or bad list, is a comparison to zero */ |
454 |
ptr->profit = -GOOD_BAD_THRESHOLD; |
455 |
|
456 |
ptr->next = hash_table[hash]; |
457 |
hash_table[hash] = ptr; |
458 |
|
459 |
insert_token_in_group(&bad_head, ptr); |
460 |
|
461 |
ptr->smaller = NULL; |
462 |
} else { |
463 |
newprofit = ptr->profit + (ptr->len - 1); |
464 |
/* check to see if this token needs to be moved to a |
465 |
* different list */ |
466 |
if((ptr->profit < 0) && (newprofit >= 0)) { |
467 |
remove_token_from_group(ptr); |
468 |
insert_token_in_group(&good_head,ptr); |
469 |
} |
470 |
ptr->profit = newprofit; |
471 |
} |
472 |
|
473 |
if (last_ptr) last_ptr->smaller = ptr; |
474 |
last_ptr = ptr; |
475 |
|
476 |
ptr = ptr->smaller; |
477 |
} |
478 |
} |
479 |
|
480 |
/* decrease the counts for all the tokens that start with "data", and have lenghts |
481 |
* from 2 to "len". This function is much simpler than learn_token because we have |
482 |
* more guarantees (tho tokens exist, the ->smaller pointer is set, etc.) |
483 |
* The two separate functions exist only because of compression performance */ |
484 |
static void forget_token(unsigned char *data, int len) |
485 |
{ |
486 |
struct token *ptr; |
487 |
int i, newprofit; |
488 |
unsigned int hash=0; |
489 |
|
490 |
if (len > MAX_TOK_SIZE) len = MAX_TOK_SIZE; |
491 |
|
492 |
hash = hash_token(data, len); |
493 |
ptr = find_token_hash(data, len, hash); |
494 |
|
495 |
for (i = len; i >= 2; i--) { |
496 |
|
497 |
newprofit = ptr->profit - (ptr->len - 1); |
498 |
if ((ptr->profit >= 0) && (newprofit < 0)) { |
499 |
remove_token_from_group(ptr); |
500 |
insert_token_in_group(&bad_head, ptr); |
501 |
} |
502 |
ptr->profit=newprofit; |
503 |
|
504 |
ptr=ptr->smaller; |
505 |
} |
506 |
} |
507 |
|
508 |
/* count all the possible tokens in a symbol */ |
315 |
/* count all the possible tokens in a symbol */ |
509 |
static void learn_symbol(unsigned char *symbol, int len) |
316 |
static void learn_symbol(unsigned char *symbol, int len) |
510 |
{ |
317 |
{ |
511 |
int i; |
318 |
int i; |
512 |
|
319 |
|
513 |
for (i = 0; i < len - 1; i++) |
320 |
for (i = 0; i < len - 1; i++) |
514 |
learn_token(symbol + i, len - i); |
321 |
token_profit[ symbol[i] + (symbol[i + 1] << 8) ]++; |
515 |
} |
322 |
} |
516 |
|
323 |
|
517 |
/* decrease the count for all the possible tokens in a symbol */ |
324 |
/* decrease the count for all the possible tokens in a symbol */ |
Lines 520-636
static void forget_symbol(unsigned char
Link Here
|
520 |
int i; |
327 |
int i; |
521 |
|
328 |
|
522 |
for (i = 0; i < len - 1; i++) |
329 |
for (i = 0; i < len - 1; i++) |
523 |
forget_token(symbol + i, len - i); |
330 |
token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--; |
524 |
} |
331 |
} |
525 |
|
332 |
|
526 |
/* set all the symbol flags and do the initial token count */ |
333 |
/* remove all the invalid symbols from the table and do the initial token count */ |
527 |
static void build_initial_tok_table(void) |
334 |
static void build_initial_tok_table(void) |
528 |
{ |
335 |
{ |
529 |
int i, use_it, valid; |
336 |
unsigned int i, pos; |
530 |
|
337 |
|
531 |
valid = 0; |
338 |
pos = 0; |
532 |
for (i = 0; i < cnt; i++) { |
339 |
for (i = 0; i < table_cnt; i++) { |
533 |
table[i].flags = 0; |
|
|
534 |
if ( symbol_valid(&table[i]) ) { |
340 |
if ( symbol_valid(&table[i]) ) { |
535 |
table[i].flags |= SYM_FLAG_VALID; |
341 |
if (pos != i) |
536 |
valid++; |
342 |
table[pos] = table[i]; |
|
|
343 |
learn_symbol(table[pos].sym, table[pos].len); |
344 |
pos++; |
537 |
} |
345 |
} |
538 |
} |
346 |
} |
539 |
|
347 |
table_cnt = pos; |
540 |
use_it = 0; |
|
|
541 |
for (i = 0; i < cnt; i++) { |
542 |
|
543 |
/* subsample the available symbols. This method is almost like |
544 |
* a Bresenham's algorithm to get uniformly distributed samples |
545 |
* across the symbol table */ |
546 |
if (table[i].flags & SYM_FLAG_VALID) { |
547 |
|
548 |
use_it += WORKING_SET; |
549 |
|
550 |
if (use_it >= valid) { |
551 |
table[i].flags |= SYM_FLAG_SAMPLED; |
552 |
use_it -= valid; |
553 |
} |
554 |
} |
555 |
if (table[i].flags & SYM_FLAG_SAMPLED) |
556 |
learn_symbol(table[i].sym, table[i].len); |
557 |
} |
558 |
} |
348 |
} |
559 |
|
349 |
|
560 |
/* replace a given token in all the valid symbols. Use the sampled symbols |
350 |
/* replace a given token in all the valid symbols. Use the sampled symbols |
561 |
* to update the counts */ |
351 |
* to update the counts */ |
562 |
static void compress_symbols(unsigned char *str, int tlen, int idx) |
352 |
static void compress_symbols(unsigned char *str, int idx) |
563 |
{ |
353 |
{ |
564 |
int i, len, learn, size; |
354 |
unsigned int i, len, size; |
565 |
unsigned char *p; |
355 |
unsigned char *p1, *p2; |
566 |
|
|
|
567 |
for (i = 0; i < cnt; i++) { |
568 |
|
356 |
|
569 |
if (!(table[i].flags & SYM_FLAG_VALID)) continue; |
357 |
for (i = 0; i < table_cnt; i++) { |
570 |
|
358 |
|
571 |
len = table[i].len; |
359 |
len = table[i].len; |
572 |
learn = 0; |
360 |
p1 = table[i].sym; |
573 |
p = table[i].sym; |
361 |
|
|
|
362 |
/* find the token on the symbol */ |
363 |
p2 = memmem(p1, len, str, 2); |
364 |
if (!p2) continue; |
365 |
|
366 |
/* decrease the counts for this symbol's tokens */ |
367 |
forget_symbol(table[i].sym, len); |
368 |
|
369 |
size = len; |
574 |
|
370 |
|
575 |
do { |
371 |
do { |
|
|
372 |
*p2 = idx; |
373 |
p2++; |
374 |
size -= (p2 - p1); |
375 |
memmove(p2, p2 + 1, size); |
376 |
p1 = p2; |
377 |
len--; |
378 |
|
379 |
if (size < 2) break; |
380 |
|
576 |
/* find the token on the symbol */ |
381 |
/* find the token on the symbol */ |
577 |
p = (unsigned char *) strstr((char *) p, (char *) str); |
382 |
p2 = memmem(p1, size, str, 2); |
578 |
if (!p) break; |
|
|
579 |
|
383 |
|
580 |
if (!learn) { |
384 |
} while (p2); |
581 |
/* if this symbol was used to count, decrease it */ |
|
|
582 |
if (table[i].flags & SYM_FLAG_SAMPLED) |
583 |
forget_symbol(table[i].sym, len); |
584 |
learn = 1; |
585 |
} |
586 |
|
385 |
|
587 |
*p = idx; |
386 |
table[i].len = len; |
588 |
size = (len - (p - table[i].sym)) - tlen + 1; |
387 |
|
589 |
memmove(p + 1, p + tlen, size); |
388 |
/* increase the counts for this symbol's new tokens */ |
590 |
p++; |
389 |
learn_symbol(table[i].sym, len); |
591 |
len -= tlen - 1; |
|
|
592 |
|
593 |
} while (size >= tlen); |
594 |
|
595 |
if(learn) { |
596 |
table[i].len = len; |
597 |
/* if this symbol was used to count, learn it again */ |
598 |
if(table[i].flags & SYM_FLAG_SAMPLED) |
599 |
learn_symbol(table[i].sym, len); |
600 |
} |
601 |
} |
390 |
} |
602 |
} |
391 |
} |
603 |
|
392 |
|
604 |
/* search the token with the maximum profit */ |
393 |
/* search the token with the maximum profit */ |
605 |
static struct token *find_best_token(void) |
394 |
static int find_best_token(void) |
606 |
{ |
395 |
{ |
607 |
struct token *ptr,*best,*head; |
396 |
int i, best, bestprofit; |
608 |
int bestprofit; |
|
|
609 |
|
397 |
|
610 |
bestprofit=-10000; |
398 |
bestprofit=-10000; |
|
|
399 |
best = 0; |
611 |
|
400 |
|
612 |
/* failsafe: if the "good" list is empty search from the "bad" list */ |
401 |
for (i = 0; i < 0x10000; i++) { |
613 |
if(good_head.right == &good_head) head = &bad_head; |
402 |
if (token_profit[i] > bestprofit) { |
614 |
else head = &good_head; |
403 |
best = i; |
615 |
|
404 |
bestprofit = token_profit[i]; |
616 |
ptr = head->right; |
|
|
617 |
best = NULL; |
618 |
while (ptr != head) { |
619 |
if (ptr->profit > bestprofit) { |
620 |
bestprofit = ptr->profit; |
621 |
best = ptr; |
622 |
} |
405 |
} |
623 |
ptr = ptr->right; |
|
|
624 |
} |
406 |
} |
625 |
|
|
|
626 |
return best; |
407 |
return best; |
627 |
} |
408 |
} |
628 |
|
409 |
|
629 |
/* this is the core of the algorithm: calculate the "best" table */ |
410 |
/* this is the core of the algorithm: calculate the "best" table */ |
630 |
static void optimize_result(void) |
411 |
static void optimize_result(void) |
631 |
{ |
412 |
{ |
632 |
struct token *best; |
413 |
int i, best; |
633 |
int i; |
|
|
634 |
|
414 |
|
635 |
/* using the '\0' symbol last allows compress_symbols to use standard |
415 |
/* using the '\0' symbol last allows compress_symbols to use standard |
636 |
* fast string functions */ |
416 |
* fast string functions */ |
Lines 644-657
static void optimize_result(void)
Link Here
|
644 |
best = find_best_token(); |
424 |
best = find_best_token(); |
645 |
|
425 |
|
646 |
/* place it in the "best" table */ |
426 |
/* place it in the "best" table */ |
647 |
best_table_len[i] = best->len; |
427 |
best_table_len[i] = 2; |
648 |
memcpy(best_table[i], best->data, best_table_len[i]); |
428 |
best_table[i][0] = best & 0xFF; |
649 |
/* zero terminate the token so that we can use strstr |
429 |
best_table[i][1] = (best >> 8) & 0xFF; |
650 |
in compress_symbols */ |
|
|
651 |
best_table[i][best_table_len[i]]='\0'; |
652 |
|
430 |
|
653 |
/* replace this token in all the valid symbols */ |
431 |
/* replace this token in all the valid symbols */ |
654 |
compress_symbols(best_table[i], best_table_len[i], i); |
432 |
compress_symbols(best_table[i], i); |
655 |
} |
433 |
} |
656 |
} |
434 |
} |
657 |
} |
435 |
} |
Lines 659-697
static void optimize_result(void)
Link Here
|
659 |
/* start by placing the symbols that are actually used on the table */ |
437 |
/* start by placing the symbols that are actually used on the table */ |
660 |
static void insert_real_symbols_in_table(void) |
438 |
static void insert_real_symbols_in_table(void) |
661 |
{ |
439 |
{ |
662 |
int i, j, c; |
440 |
unsigned int i, j, c; |
663 |
|
441 |
|
664 |
memset(best_table, 0, sizeof(best_table)); |
442 |
memset(best_table, 0, sizeof(best_table)); |
665 |
memset(best_table_len, 0, sizeof(best_table_len)); |
443 |
memset(best_table_len, 0, sizeof(best_table_len)); |
666 |
|
444 |
|
667 |
for (i = 0; i < cnt; i++) { |
445 |
for (i = 0; i < table_cnt; i++) { |
668 |
if (table[i].flags & SYM_FLAG_VALID) { |
446 |
for (j = 0; j < table[i].len; j++) { |
669 |
for (j = 0; j < table[i].len; j++) { |
447 |
c = table[i].sym[j]; |
670 |
c = table[i].sym[j]; |
448 |
best_table[c][0]=c; |
671 |
best_table[c][0]=c; |
449 |
best_table_len[c]=1; |
672 |
best_table_len[c]=1; |
|
|
673 |
} |
674 |
} |
450 |
} |
675 |
} |
451 |
} |
676 |
} |
452 |
} |
677 |
|
453 |
|
678 |
static void optimize_token_table(void) |
454 |
static void optimize_token_table(void) |
679 |
{ |
455 |
{ |
680 |
memset(hash_table, 0, sizeof(hash_table)); |
|
|
681 |
|
682 |
good_head.left = &good_head; |
683 |
good_head.right = &good_head; |
684 |
|
685 |
bad_head.left = &bad_head; |
686 |
bad_head.right = &bad_head; |
687 |
|
688 |
build_initial_tok_table(); |
456 |
build_initial_tok_table(); |
689 |
|
457 |
|
690 |
insert_real_symbols_in_table(); |
458 |
insert_real_symbols_in_table(); |
691 |
|
459 |
|
692 |
/* When valid symbol is not registered, exit to error */ |
460 |
/* When valid symbol is not registered, exit to error */ |
693 |
if (good_head.left == good_head.right && |
461 |
if (!table_cnt) { |
694 |
bad_head.left == bad_head.right) { |
|
|
695 |
fprintf(stderr, "No valid symbol.\n"); |
462 |
fprintf(stderr, "No valid symbol.\n"); |
696 |
exit(1); |
463 |
exit(1); |
697 |
} |
464 |
} |
Lines 700-707
static void optimize_token_table(void)
Link Here
|
700 |
} |
467 |
} |
701 |
|
468 |
|
702 |
|
469 |
|
703 |
int |
470 |
int main(int argc, char **argv) |
704 |
main(int argc, char **argv) |
|
|
705 |
{ |
471 |
{ |
706 |
if (argc >= 2) { |
472 |
if (argc >= 2) { |
707 |
int i; |
473 |
int i; |