1 /* Further modified for PHP */
2 /* $Id$ */
3
4 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.36.2.4 2007/01/02 21:43:51 kurt Exp $ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6 *
7 * Copyright 1998-2007 The OpenLDAP Foundation.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted only as authorized by the OpenLDAP
12 * Public License.
13 *
14 * A copy of this license is available at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17
18 /* Copyright 2001 Computing Research Labs, New Mexico State University
19 *
20 * Permission is hereby granted, free of charge, to any person obtaining a
21 * copy of this software and associated documentation files (the "Software"),
22 * to deal in the Software without restriction, including without limitation
23 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 * and/or sell copies of the Software, and to permit persons to whom the
25 * Software is furnished to do so, subject to the following conditions:
26 *
27 * The above copyright notice and this permission notice shall be included in
28 * all copies or substantial portions of the Software.
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 */
38 /* orig Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
39
40 #include <stdio.h>
41 #include <ctype.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45
/*
 * Integer aliases used throughout this generator. They are plain macros,
 * so they expand textually. The names suggest 2- and 4-byte quantities;
 * nothing here enforces those widths -- TODO confirm for the target ABI.
 */
#define ac_uint2 unsigned short
#define ac_uint4 unsigned int
/* Separator spliced between the output directory and the file name. */
#define LDAP_DIRSEP "/"
/* Copy primitive used when entries are moved or filled in. */
#define AC_MEMCPY memcpy

/*
 * Output format switch: non-zero makes write_cdata() open "uctable.h" in
 * text mode, zero makes it open "ctype.dat" in binary mode. Defaults to 1
 * unless the build already defined it.
 */
#ifndef HARDCODE_DATA
#define HARDCODE_DATA 1
#endif
54
/*
 * ishdigit(cc) yields 1 when cc is one of 0-9, a-f or A-F and 0 otherwise.
 * The argument is expanded several times, so pass an expression without
 * side effects.
 */
#undef ishdigit
#define ishdigit(cc) \
    (('0' <= (cc) && (cc) <= '9') || \
     ('A' <= (cc) && (cc) <= 'F') || \
     ('a' <= (cc) && (cc) <= 'f'))
59
/*
 * A header written to the output file with the byte-order-mark and the number
 * of property nodes. hdr[1] is filled in with NUMPROPS by write_cdata().
 */
static ac_uint2 hdr[2] = {0xfeff, 0};

/*
 * NUMPROPS is the number of entries in props[] and proptbl[].
 * NEEDPROPS rounds that up to the next multiple of 4 strictly above it, so
 * propcnt[] always has room for the sentinel stored at index NUMPROPS.
 */
#define NUMPROPS 50
#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))

typedef struct {
    char *name;     /* property abbreviation as it appears in the data */
    int len;        /* number of characters of name that are compared */
} _prop_t;

/*
 * List of properties expected to be found in the Unicode Character Database
 * including some implementation specific properties.
 *
 * The implementation specific properties are:
 * Cm = Composed (can be decomposed)
 * Nb = Non-breaking
 * Sy = Symmetric (has left and right forms)
 * Hd = Hex digit
 * Qm = Quote marks
 * Mr = Mirroring
 * Ss = Space, other
 * Cp = Defined character
 *
 * The position of an entry is significant: the index found here is used
 * directly as the index into proptbl[] and propcnt[].
 */
static _prop_t props[NUMPROPS] = {
    {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
    {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
    {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
    {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
    {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1},
    {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1},
    {"S", 1}, {"WS", 2}, {"ON", 2},
    {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
    {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
};

/*
 * Range list for one property: a flat array of (start, end) pairs.
 */
typedef struct {
    ac_uint4 *ranges;   /* start/end values, two slots per range */
    ac_uint2 used;      /* slots in use (grows by 2 per range) */
    ac_uint2 size;      /* slots allocated (grown 8 at a time) */
} _ranges_t;

/* One range list per entry of props[], indexed identically. */
static _ranges_t proptbl[NUMPROPS];

/*
 * Make sure this array is sized to be on a 4-byte boundary at compile time.
 * write_cdata() stores each property's starting offset here (0xffff for an
 * empty list) followed by a sentinel holding the total.
 */
static ac_uint2 propcnt[NEEDPROPS];
112
/*
 * Array used to collect a decomposition before adding it to the decomposition
 * table. dectmp_size is the number of entries currently collected.
 */
static ac_uint4 dectmp[64];
static ac_uint4 dectmp_size;

/*
 * One decomposition entry.
 */
typedef struct {
    ac_uint4 code;      /* the character being decomposed */
    ac_uint2 size;      /* slots allocated in decomp */
    ac_uint2 used;      /* slots of decomp actually filled */
    ac_uint4 *decomp;   /* heap array holding the decomposition */
} _decomp_t;

/*
 * Lists of decompositions, kept sorted by code and expanded as characters
 * are encountered. The first list contains canonical mappings; the second
 * also includes compatibility mappings.
 */
static _decomp_t *decomps;
static ac_uint4 decomps_used;
static ac_uint4 decomps_size;

static _decomp_t *kdecomps;
static ac_uint4 kdecomps_used;
static ac_uint4 kdecomps_size;
139
140 /*
141 * Composition exclusion table stuff.
142 */
143 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
144 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
145 static ac_uint4 compexs[8192];
146
/*
 * Struct for holding a composition pair, and array of composition pairs
 */
typedef struct {
    ac_uint4 comp;      /* the composed character */
    ac_uint4 count;     /* number of codes in the pair (create_comps stores 2) */
    ac_uint4 code1;     /* first code of the decomposition */
    ac_uint4 code2;     /* second code of the decomposition */
} _comp_t;

/* The array itself is compiled out together with create_comps(). */
#if 0
static _comp_t *comps;
#endif
/* Still maintained: add_decomp() counts two-code canonical decompositions. */
static ac_uint4 comps_used;
/*
 * Types and lists for handling lists of case mappings.
 *
 * The meaning of other1/other2 depends on the table the entry lives in:
 *   upper: other1 = lower, other2 = title
 *   lower: other1 = upper, other2 = title
 *   title: other1 = upper, other2 = lower
 */
typedef struct {
    ac_uint4 key;       /* the character the entry is sorted and looked up by */
    ac_uint4 other1;
    ac_uint4 other2;
} _case_t;

/* Key-sorted tables; *_used counts entries, *_size counts allocated entries. */
static _case_t *upper;
static _case_t *lower;
static _case_t *title;
static ac_uint4 upper_used;
static ac_uint4 upper_size;
static ac_uint4 lower_used;
static ac_uint4 lower_size;
static ac_uint4 title_used;
static ac_uint4 title_size;

/*
 * Array used to collect case mappings before adding them to a list:
 * cases[0] = upper, cases[1] = lower, cases[2] = title.
 */
static ac_uint4 cases[3];

/*
 * An array to hold ranges for combining classes, stored as flat triples
 * (first code, last code, combining class). ccl_used and ccl_size count
 * array slots, not triples.
 */
static ac_uint4 *ccl;
static ac_uint4 ccl_used;
static ac_uint4 ccl_size;
192
193 /*
194 * Structures for handling numbers.
195 */
196 typedef struct {
197 ac_uint4 code;
198 ac_uint4 idx;
199 } _codeidx_t;
200
201 typedef struct {
202 short numerator;
203 short denominator;
204 } _num_t;
205
206 /*
207 * Arrays to hold the mapping of codes to numbers.
208 */
209 static _codeidx_t *ncodes;
210 static ac_uint4 ncodes_used;
211 static ac_uint4 ncodes_size;
212
213 static _num_t *nums;
214 static ac_uint4 nums_used;
215 static ac_uint4 nums_size;
216
217 /*
218 * Array for holding numbers.
219 */
220 static _num_t *nums;
221 static ac_uint4 nums_used;
222 static ac_uint4 nums_size;
223
224 static void
add_range(ac_uint4 start,ac_uint4 end,char * p1,char * p2)225 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
226 {
227 int i, j, k, len;
228 _ranges_t *rlp;
229 char *name;
230
231 for (k = 0; k < 2; k++) {
232 if (k == 0) {
233 name = p1;
234 len = 2;
235 } else {
236 if (p2 == 0)
237 break;
238
239 name = p2;
240 len = 1;
241 }
242
243 for (i = 0; i < NUMPROPS; i++) {
244 if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
245 break;
246 }
247
248 if (i == NUMPROPS)
249 continue;
250
251 rlp = &proptbl[i];
252
253 /*
254 * Resize the range list if necessary.
255 */
256 if (rlp->used == rlp->size) {
257 if (rlp->size == 0)
258 rlp->ranges = (ac_uint4 *)
259 malloc(sizeof(ac_uint4) << 3);
260 else
261 rlp->ranges = (ac_uint4 *)
262 realloc((char *) rlp->ranges,
263 sizeof(ac_uint4) * (rlp->size + 8));
264 rlp->size += 8;
265 }
266
267 /*
268 * If this is the first code for this property list, just add it
269 * and return.
270 */
271 if (rlp->used == 0) {
272 rlp->ranges[0] = start;
273 rlp->ranges[1] = end;
274 rlp->used += 2;
275 continue;
276 }
277
278 /*
279 * Optimize the case of adding the range to the end.
280 */
281 j = rlp->used - 1;
282 if (start > rlp->ranges[j]) {
283 j = rlp->used;
284 rlp->ranges[j++] = start;
285 rlp->ranges[j++] = end;
286 rlp->used = j;
287 continue;
288 }
289
290 /*
291 * Need to locate the insertion point.
292 */
293 for (i = 0;
294 i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
295
296 /*
297 * If the start value lies in the current range, then simply set the
298 * new end point of the range to the end value passed as a parameter.
299 */
300 if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
301 rlp->ranges[i + 1] = end;
302 return;
303 }
304
305 /*
306 * Shift following values up by two.
307 */
308 for (j = rlp->used; j > i; j -= 2) {
309 rlp->ranges[j] = rlp->ranges[j - 2];
310 rlp->ranges[j + 1] = rlp->ranges[j - 1];
311 }
312
313 /*
314 * Add the new range at the insertion point.
315 */
316 rlp->ranges[i] = start;
317 rlp->ranges[i + 1] = end;
318 rlp->used += 2;
319 }
320 }
321
322 static void
ordered_range_insert(ac_uint4 c,char * name,int len)323 ordered_range_insert(ac_uint4 c, char *name, int len)
324 {
325 int i, j;
326 ac_uint4 s, e;
327 _ranges_t *rlp;
328
329 if (len == 0)
330 return;
331
332 /*
333 * Deal with directionality codes introduced in Unicode 3.0.
334 */
335 if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
336 (len == 3 &&
337 (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
338 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
339 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0 ||
340 memcmp(name, "LRI", 3) == 0 || memcmp(name, "RLI", 3) == 0 ||
341 memcmp(name, "FSI", 3) == 0 || memcmp(name, "PDI", 3) == 0))) {
342 /*
343 * Mark all of these as Other Neutral to preserve compatibility with
344 * older versions.
345 */
346 len = 2;
347 name = "ON";
348 }
349
350 for (i = 0; i < NUMPROPS; i++) {
351 if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
352 break;
353 }
354
355 if (i == NUMPROPS) {
356 printf("Unknown property %s\n", name);
357 return;
358 }
359
360 /*
361 * Have a match, so insert the code in order.
362 */
363 rlp = &proptbl[i];
364
365 /*
366 * Resize the range list if necessary.
367 */
368 if (rlp->used == rlp->size) {
369 if (rlp->size == 0)
370 rlp->ranges = (ac_uint4 *)
371 malloc(sizeof(ac_uint4) << 3);
372 else
373 rlp->ranges = (ac_uint4 *)
374 realloc((char *) rlp->ranges,
375 sizeof(ac_uint4) * (rlp->size + 8));
376 rlp->size += 8;
377 }
378
379 /*
380 * If this is the first code for this property list, just add it
381 * and return.
382 */
383 if (rlp->used == 0) {
384 rlp->ranges[0] = rlp->ranges[1] = c;
385 rlp->used += 2;
386 return;
387 }
388
389 /*
390 * Optimize the cases of extending the last range and adding new ranges to
391 * the end.
392 */
393 j = rlp->used - 1;
394 e = rlp->ranges[j];
395 s = rlp->ranges[j - 1];
396
397 if (c == e + 1) {
398 /*
399 * Extend the last range.
400 */
401 rlp->ranges[j] = c;
402 return;
403 }
404
405 if (c > e + 1) {
406 /*
407 * Start another range on the end.
408 */
409 j = rlp->used;
410 rlp->ranges[j] = rlp->ranges[j + 1] = c;
411 rlp->used += 2;
412 return;
413 }
414
415 if (c >= s)
416 /*
417 * The code is a duplicate of a code in the last range, so just return.
418 */
419 return;
420
421 /*
422 * The code should be inserted somewhere before the last range in the
423 * list. Locate the insertion point.
424 */
425 for (i = 0;
426 i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
427
428 s = rlp->ranges[i];
429 e = rlp->ranges[i + 1];
430
431 if (c == e + 1)
432 /*
433 * Simply extend the current range.
434 */
435 rlp->ranges[i + 1] = c;
436 else if (c < s) {
437 /*
438 * Add a new entry before the current location. Shift all entries
439 * before the current one up by one to make room.
440 */
441 for (j = rlp->used; j > i; j -= 2) {
442 rlp->ranges[j] = rlp->ranges[j - 2];
443 rlp->ranges[j + 1] = rlp->ranges[j - 1];
444 }
445 rlp->ranges[i] = rlp->ranges[i + 1] = c;
446
447 rlp->used += 2;
448 }
449 }
450
451 static void
add_decomp(ac_uint4 code,short compat)452 add_decomp(ac_uint4 code, short compat)
453 {
454 ac_uint4 i, j, size;
455 _decomp_t **pdecomps;
456 ac_uint4 *pdecomps_used;
457 ac_uint4 *pdecomps_size;
458
459 if (compat) {
460 pdecomps = &kdecomps;
461 pdecomps_used = &kdecomps_used;
462 pdecomps_size = &kdecomps_size;
463 } else {
464 pdecomps = &decomps;
465 pdecomps_used = &decomps_used;
466 pdecomps_size = &decomps_size;
467 }
468
469 /*
470 * Add the code to the composite property.
471 */
472 if (!compat) {
473 ordered_range_insert(code, "Cm", 2);
474 }
475
476 /*
477 * Locate the insertion point for the code.
478 */
479 for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
480
481 /*
482 * Allocate space for a new decomposition.
483 */
484 if (*pdecomps_used == *pdecomps_size) {
485 if (*pdecomps_size == 0)
486 *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
487 else
488 *pdecomps = (_decomp_t *)
489 realloc((char *) *pdecomps,
490 sizeof(_decomp_t) * (*pdecomps_size + 8));
491 (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
492 sizeof(_decomp_t) << 3);
493 *pdecomps_size += 8;
494 }
495
496 if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
497 /*
498 * Shift the decomps up by one if the codes don't match.
499 */
500 for (j = *pdecomps_used; j > i; j--)
501 (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
502 sizeof(_decomp_t));
503 }
504
505 /*
506 * Insert or replace a decomposition.
507 */
508 size = dectmp_size + (4 - (dectmp_size & 3));
509 if ((*pdecomps)[i].size < size) {
510 if ((*pdecomps)[i].size == 0)
511 (*pdecomps)[i].decomp = (ac_uint4 *)
512 malloc(sizeof(ac_uint4) * size);
513 else
514 (*pdecomps)[i].decomp = (ac_uint4 *)
515 realloc((char *) (*pdecomps)[i].decomp,
516 sizeof(ac_uint4) * size);
517 (*pdecomps)[i].size = size;
518 }
519
520 if ((*pdecomps)[i].code != code)
521 (*pdecomps_used)++;
522
523 (*pdecomps)[i].code = code;
524 (*pdecomps)[i].used = dectmp_size;
525 (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
526 sizeof(ac_uint4) * dectmp_size);
527
528 /*
529 * NOTICE: This needs changing later so it is more general than simply
530 * pairs. This calculation is done here to simplify allocation elsewhere.
531 */
532 if (!compat && dectmp_size == 2)
533 comps_used++;
534 }
535
536 static void
add_title(ac_uint4 code)537 add_title(ac_uint4 code)
538 {
539 ac_uint4 i, j;
540
541 /*
542 * Always map the code to itself.
543 */
544 cases[2] = code;
545
546 /* If lower/upper case does not exist, stay the same */
547 if (!cases[0]) cases[0] = code;
548 if (!cases[1]) cases[1] = code;
549
550 if (title_used == title_size) {
551 if (title_size == 0)
552 title = (_case_t *) malloc(sizeof(_case_t) << 3);
553 else
554 title = (_case_t *) realloc((char *) title,
555 sizeof(_case_t) * (title_size + 8));
556 title_size += 8;
557 }
558
559 /*
560 * Locate the insertion point.
561 */
562 for (i = 0; i < title_used && code > title[i].key; i++) ;
563
564 if (i < title_used) {
565 /*
566 * Shift the array up by one.
567 */
568 for (j = title_used; j > i; j--)
569 (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
570 sizeof(_case_t));
571 }
572
573 title[i].key = cases[2]; /* Title */
574 title[i].other1 = cases[0]; /* Upper */
575 title[i].other2 = cases[1]; /* Lower */
576
577 title_used++;
578 }
579
580 static void
add_upper(ac_uint4 code)581 add_upper(ac_uint4 code)
582 {
583 ac_uint4 i, j;
584
585 /*
586 * Always map the code to itself.
587 */
588 cases[0] = code;
589
590 /*
591 * If the title case character is not present, then make it the same as
592 * the upper case.
593 */
594 if (cases[2] == 0)
595 cases[2] = code;
596
597 if (upper_used == upper_size) {
598 if (upper_size == 0)
599 upper = (_case_t *) malloc(sizeof(_case_t) << 3);
600 else
601 upper = (_case_t *) realloc((char *) upper,
602 sizeof(_case_t) * (upper_size + 8));
603 upper_size += 8;
604 }
605
606 /*
607 * Locate the insertion point.
608 */
609 for (i = 0; i < upper_used && code > upper[i].key; i++) ;
610
611 if (i < upper_used) {
612 /*
613 * Shift the array up by one.
614 */
615 for (j = upper_used; j > i; j--)
616 (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
617 sizeof(_case_t));
618 }
619
620 upper[i].key = cases[0]; /* Upper */
621 upper[i].other1 = cases[1]; /* Lower */
622 upper[i].other2 = cases[2]; /* Title */
623
624 upper_used++;
625 }
626
627 static void
add_lower(ac_uint4 code)628 add_lower(ac_uint4 code)
629 {
630 ac_uint4 i, j;
631
632 /*
633 * Always map the code to itself.
634 */
635 cases[1] = code;
636
637 /*
638 * If the title case character is empty, then make it the same as the
639 * upper case.
640 */
641 if (cases[2] == 0)
642 cases[2] = cases[0];
643
644 if (lower_used == lower_size) {
645 if (lower_size == 0)
646 lower = (_case_t *) malloc(sizeof(_case_t) << 3);
647 else
648 lower = (_case_t *) realloc((char *) lower,
649 sizeof(_case_t) * (lower_size + 8));
650 lower_size += 8;
651 }
652
653 /*
654 * Locate the insertion point.
655 */
656 for (i = 0; i < lower_used && code > lower[i].key; i++) ;
657
658 if (i < lower_used) {
659 /*
660 * Shift the array up by one.
661 */
662 for (j = lower_used; j > i; j--)
663 (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
664 sizeof(_case_t));
665 }
666
667 lower[i].key = cases[1]; /* Lower */
668 lower[i].other1 = cases[0]; /* Upper */
669 lower[i].other2 = cases[2]; /* Title */
670
671 lower_used++;
672 }
673
674 static void
ordered_ccl_insert(ac_uint4 c,ac_uint4 ccl_code)675 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
676 {
677 ac_uint4 i, j;
678
679 if (ccl_used == ccl_size) {
680 if (ccl_size == 0)
681 ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
682 else
683 ccl = (ac_uint4 *)
684 realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
685 ccl_size += 24;
686 }
687
688 /*
689 * Optimize adding the first item.
690 */
691 if (ccl_used == 0) {
692 ccl[0] = ccl[1] = c;
693 ccl[2] = ccl_code;
694 ccl_used += 3;
695 return;
696 }
697
698 /*
699 * Handle the special case of extending the range on the end. This
700 * requires that the combining class codes are the same.
701 */
702 if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
703 ccl[ccl_used - 2] = c;
704 return;
705 }
706
707 /*
708 * Handle the special case of adding another range on the end.
709 */
710 if (c > ccl[ccl_used - 2] + 1 ||
711 (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
712 ccl[ccl_used++] = c;
713 ccl[ccl_used++] = c;
714 ccl[ccl_used++] = ccl_code;
715 return;
716 }
717
718 /*
719 * Locate either the insertion point or range for the code.
720 */
721 for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
722
723 if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
724 /*
725 * Extend an existing range.
726 */
727 ccl[i + 1] = c;
728 return;
729 } else if (c < ccl[i]) {
730 /*
731 * Start a new range before the current location.
732 */
733 for (j = ccl_used; j > i; j -= 3) {
734 ccl[j] = ccl[j - 3];
735 ccl[j - 1] = ccl[j - 4];
736 ccl[j - 2] = ccl[j - 5];
737 }
738 ccl[i] = ccl[i + 1] = c;
739 ccl[i + 2] = ccl_code;
740 }
741 }
742
743 /*
744 * Adds a number if it does not already exist and returns an index value
745 * multiplied by 2.
746 */
747 static ac_uint4
make_number(short num,short denom)748 make_number(short num, short denom)
749 {
750 ac_uint4 n;
751
752 /*
753 * Determine if the number already exists.
754 */
755 for (n = 0; n < nums_used; n++) {
756 if (nums[n].numerator == num && nums[n].denominator == denom)
757 return n << 1;
758 }
759
760 if (nums_used == nums_size) {
761 if (nums_size == 0)
762 nums = (_num_t *) malloc(sizeof(_num_t) << 3);
763 else
764 nums = (_num_t *) realloc((char *) nums,
765 sizeof(_num_t) * (nums_size + 8));
766 nums_size += 8;
767 }
768
769 n = nums_used++;
770 nums[n].numerator = num;
771 nums[n].denominator = denom;
772
773 return n << 1;
774 }
775
776 static void
add_number(ac_uint4 code,short num,short denom)777 add_number(ac_uint4 code, short num, short denom)
778 {
779 ac_uint4 i, j;
780
781 /*
782 * Insert the code in order.
783 */
784 for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
785
786 /*
787 * Handle the case of the codes matching and simply replace the number
788 * that was there before.
789 */
790 if (i < ncodes_used && code == ncodes[i].code) {
791 ncodes[i].idx = make_number(num, denom);
792 return;
793 }
794
795 /*
796 * Resize the array if necessary.
797 */
798 if (ncodes_used == ncodes_size) {
799 if (ncodes_size == 0)
800 ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
801 else
802 ncodes = (_codeidx_t *)
803 realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
804
805 ncodes_size += 8;
806 }
807
808 /*
809 * Shift things around to insert the code if necessary.
810 */
811 if (i < ncodes_used) {
812 for (j = ncodes_used; j > i; j--) {
813 ncodes[j].code = ncodes[j - 1].code;
814 ncodes[j].idx = ncodes[j - 1].idx;
815 }
816 }
817 ncodes[i].code = code;
818 ncodes[i].idx = make_number(num, denom);
819
820 ncodes_used++;
821 }
822
823 /*
824 * This routine assumes that the line is a valid Unicode Character Database
825 * entry.
826 */
827 static void
read_cdata(FILE * in)828 read_cdata(FILE *in)
829 {
830 ac_uint4 i, lineno, skip, code, ccl_code;
831 short wnum, neg, number[2], compat;
832 char line[512], *s, *e;
833
834 lineno = skip = 0;
835 while (fgets(line, sizeof(line), in)) {
836 int is_title = 0;
837
838 if( (s=strchr(line, '\n')) ) *s = '\0';
839 lineno++;
840
841 /*
842 * Skip blank lines and lines that start with a '#'.
843 */
844 if (line[0] == 0 || line[0] == '#')
845 continue;
846
847 /*
848 * If lines need to be skipped, do it here.
849 */
850 if (skip) {
851 skip--;
852 continue;
853 }
854
855 /*
856 * Collect the code. The code can be up to 6 hex digits in length to
857 * allow surrogates to be specified.
858 */
859 for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
860 code <<= 4;
861 if (*s >= '0' && *s <= '9')
862 code += *s - '0';
863 else if (*s >= 'A' && *s <= 'F')
864 code += (*s - 'A') + 10;
865 else if (*s >= 'a' && *s <= 'f')
866 code += (*s - 'a') + 10;
867 }
868
869 /*
870 * Handle the following special cases:
871 * 1. 4E00-9FA5 CJK Ideographs.
872 * 2. AC00-D7A3 Hangul Syllables.
873 * 3. D800-DFFF Surrogates.
874 * 4. E000-F8FF Private Use Area.
875 * 5. F900-FA2D Han compatibility.
876 * ...Plus additional ranges in newer Unicode versions...
877 */
878 switch (code) {
879 case 0x3400:
880 /* CJK Ideograph Extension A */
881 add_range(0x3400, 0x4db5, "Lo", "L");
882
883 add_range(0x3400, 0x4db5, "Cp", 0);
884
885 skip = 1;
886 break;
887 case 0x4e00:
888 /*
889 * The Han ideographs.
890 */
891 add_range(0x4e00, 0x9fff, "Lo", "L");
892
893 /*
894 * Add the characters to the defined category.
895 */
896 add_range(0x4e00, 0x9fa5, "Cp", 0);
897
898 skip = 1;
899 break;
900 case 0xac00:
901 /*
902 * The Hangul syllables.
903 */
904 add_range(0xac00, 0xd7a3, "Lo", "L");
905
906 /*
907 * Add the characters to the defined category.
908 */
909 add_range(0xac00, 0xd7a3, "Cp", 0);
910
911 skip = 1;
912 break;
913 case 0xd800:
914 /*
915 * Make a range of all surrogates and assume some default
916 * properties.
917 */
918 add_range(0x010000, 0x10ffff, "Cs", "L");
919 skip = 5;
920 break;
921 case 0xe000:
922 /*
923 * The Private Use area. Add with a default set of properties.
924 */
925 add_range(0xe000, 0xf8ff, "Co", "L");
926 skip = 1;
927 break;
928 case 0xf900:
929 /*
930 * The CJK compatibility area.
931 */
932 add_range(0xf900, 0xfaff, "Lo", "L");
933
934 /*
935 * Add the characters to the defined category.
936 */
937 add_range(0xf900, 0xfaff, "Cp", 0);
938
939 skip = 1;
940 break;
941 case 0x20000:
942 /* CJK Ideograph Extension B */
943 add_range(0x20000, 0x2a6d6, "Lo", "L");
944
945 add_range(0x20000, 0x2a6d6, "Cp", 0);
946
947 skip = 1;
948 break;
949 case 0xf0000:
950 /* Plane 15 private use */
951 add_range(0xf0000, 0xffffd, "Co", "L");
952 skip = 1;
953 break;
954
955 case 0x100000:
956 /* Plane 16 private use */
957 add_range(0x100000, 0x10fffd, "Co", "L");
958 skip = 1;
959 break;
960 }
961
962 if (skip)
963 continue;
964
965 /*
966 * Add the code to the defined category.
967 */
968 ordered_range_insert(code, "Cp", 2);
969
970 /*
971 * Locate the first character property field.
972 */
973 for (i = 0; *s != 0 && i < 2; s++) {
974 if (*s == ';')
975 i++;
976 }
977 for (e = s; *e && *e != ';'; e++) ;
978
979 ordered_range_insert(code, s, e - s);
980
981 if (e - s == 2 && s[0] == 'L' && s[1] == 't') {
982 is_title = 1;
983 }
984
985 /*
986 * Locate the combining class code.
987 */
988 for (s = e; *s != 0 && i < 3; s++) {
989 if (*s == ';')
990 i++;
991 }
992
993 /*
994 * Convert the combining class code from decimal.
995 */
996 for (ccl_code = 0, e = s; *e && *e != ';'; e++)
997 ccl_code = (ccl_code * 10) + (*e - '0');
998
999 /*
1000 * Add the code if it not 0.
1001 */
1002 if (ccl_code != 0)
1003 ordered_ccl_insert(code, ccl_code);
1004
1005 /*
1006 * Locate the second character property field.
1007 */
1008 for (s = e; *s != 0 && i < 4; s++) {
1009 if (*s == ';')
1010 i++;
1011 }
1012 for (e = s; *e && *e != ';'; e++) ;
1013
1014 ordered_range_insert(code, s, e - s);
1015
1016 /*
1017 * Check for a decomposition.
1018 */
1019 s = ++e;
1020 if (*s != ';') {
1021 compat = *s == '<';
1022 if (compat) {
1023 /*
1024 * Skip compatibility formatting tag.
1025 */
1026 while (*s++ != '>');
1027 }
1028 /*
1029 * Collect the codes of the decomposition.
1030 */
1031 for (dectmp_size = 0; *s != ';'; ) {
1032 /*
1033 * Skip all leading non-hex digits.
1034 */
1035 while (!ishdigit(*s))
1036 s++;
1037
1038 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
1039 dectmp[dectmp_size] <<= 4;
1040 if (*s >= '0' && *s <= '9')
1041 dectmp[dectmp_size] += *s - '0';
1042 else if (*s >= 'A' && *s <= 'F')
1043 dectmp[dectmp_size] += (*s - 'A') + 10;
1044 else if (*s >= 'a' && *s <= 'f')
1045 dectmp[dectmp_size] += (*s - 'a') + 10;
1046 }
1047 dectmp_size++;
1048 }
1049
1050 /*
1051 * If there are any codes in the temporary decomposition array,
1052 * then add the character with its decomposition.
1053 */
1054 if (dectmp_size > 0) {
1055 if (!compat) {
1056 add_decomp(code, 0);
1057 }
1058 add_decomp(code, 1);
1059 }
1060 }
1061
1062 /*
1063 * Skip to the number field.
1064 */
1065 for (i = 0; i < 3 && *s; s++) {
1066 if (*s == ';')
1067 i++;
1068 }
1069
1070 /*
1071 * Scan the number in.
1072 */
1073 number[0] = number[1] = 0;
1074 for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
1075 if (*e == '-') {
1076 neg = 1;
1077 continue;
1078 }
1079
1080 if (*e == '/') {
1081 /*
1082 * Move the denominator of the fraction.
1083 */
1084 if (neg)
1085 number[wnum] *= -1;
1086 neg = 0;
1087 e++;
1088 wnum++;
1089 }
1090 number[wnum] = (number[wnum] * 10) + (*e - '0');
1091 }
1092
1093 if (e > s) {
1094 /*
1095 * Adjust the denominator in case of integers and add the number.
1096 */
1097 if (wnum == 0)
1098 number[1] = 1;
1099
1100 add_number(code, number[0], number[1]);
1101 }
1102
1103 /*
1104 * Skip to the start of the possible case mappings.
1105 */
1106 for (s = e, i = 0; i < 4 && *s; s++) {
1107 if (*s == ';')
1108 i++;
1109 }
1110
1111 /*
1112 * Collect the case mappings.
1113 */
1114 cases[0] = cases[1] = cases[2] = 0;
1115 for (i = 0; i < 3; i++) {
1116 while (ishdigit(*s)) {
1117 cases[i] <<= 4;
1118 if (*s >= '0' && *s <= '9')
1119 cases[i] += *s - '0';
1120 else if (*s >= 'A' && *s <= 'F')
1121 cases[i] += (*s - 'A') + 10;
1122 else if (*s >= 'a' && *s <= 'f')
1123 cases[i] += (*s - 'a') + 10;
1124 s++;
1125 }
1126 if (*s == ';')
1127 s++;
1128 }
1129 if (is_title)
1130 /*
1131 * Add the upper and lower mappings for a title case character.
1132 */
1133 add_title(code);
1134 else if (cases[1])
1135 /*
1136 * Add the lower and title case mappings for the upper case
1137 * character.
1138 */
1139 add_upper(code);
1140 else if (cases[0])
1141 /*
1142 * Add the upper and title case mappings for the lower case
1143 * character.
1144 */
1145 add_lower(code);
1146 }
1147 }
1148
1149 #if 0
1150
1151 static _decomp_t *
1152 find_decomp(ac_uint4 code, short compat)
1153 {
1154 long l, r, m;
1155 _decomp_t *decs;
1156
1157 l = 0;
1158 r = (compat ? kdecomps_used : decomps_used) - 1;
1159 decs = compat ? kdecomps : decomps;
1160 while (l <= r) {
1161 m = (l + r) >> 1;
1162 if (code > decs[m].code)
1163 l = m + 1;
1164 else if (code < decs[m].code)
1165 r = m - 1;
1166 else
1167 return &decs[m];
1168 }
1169 return 0;
1170 }
1171
1172 static void
1173 decomp_it(_decomp_t *d, short compat)
1174 {
1175 ac_uint4 i;
1176 _decomp_t *dp;
1177
1178 for (i = 0; i < d->used; i++) {
1179 if ((dp = find_decomp(d->decomp[i], compat)) != 0)
1180 decomp_it(dp, compat);
1181 else
1182 dectmp[dectmp_size++] = d->decomp[i];
1183 }
1184 }
1185
1186
1187 /*
1188 * Expand all decompositions by recursively decomposing each character
1189 * in the decomposition.
1190 */
1191 static void
1192 expand_decomp(void)
1193 {
1194 ac_uint4 i;
1195
1196 for (i = 0; i < decomps_used; i++) {
1197 dectmp_size = 0;
1198 decomp_it(&decomps[i], 0);
1199 if (dectmp_size > 0)
1200 add_decomp(decomps[i].code, 0);
1201 }
1202
1203 for (i = 0; i < kdecomps_used; i++) {
1204 dectmp_size = 0;
1205 decomp_it(&kdecomps[i], 1);
1206 if (dectmp_size > 0)
1207 add_decomp(kdecomps[i].code, 1);
1208 }
1209 }
1210
1211 static int
1212 cmpcomps(const void *v_comp1, const void *v_comp2)
1213 {
1214 const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
1215 long diff = comp1->code1 - comp2->code1;
1216
1217 if (!diff)
1218 diff = comp1->code2 - comp2->code2;
1219 return (int) diff;
1220 }
1221
1222 #endif
1223
1224 /*
1225 * Load composition exclusion data
1226 */
1227 static void
read_compexdata(FILE * in)1228 read_compexdata(FILE *in)
1229 {
1230 ac_uint2 i;
1231 ac_uint4 code;
1232 char line[512], *s;
1233
1234 (void) memset((char *) compexs, 0, sizeof(compexs));
1235
1236 while (fgets(line, sizeof(line), in)) {
1237 if( (s=strchr(line, '\n')) ) *s = '\0';
1238 /*
1239 * Skip blank lines and lines that start with a '#'.
1240 */
1241 if (line[0] == 0 || line[0] == '#')
1242 continue;
1243
1244 /*
1245 * Collect the code. Assume max 6 digits
1246 */
1247
1248 for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
1249 if (isspace((unsigned char)*s)) break;
1250 code <<= 4;
1251 if (*s >= '0' && *s <= '9')
1252 code += *s - '0';
1253 else if (*s >= 'A' && *s <= 'F')
1254 code += (*s - 'A') + 10;
1255 else if (*s >= 'a' && *s <= 'f')
1256 code += (*s - 'a') + 10;
1257 }
1258 COMPEX_SET(code);
1259 }
1260 }
1261
1262 #if 0
1263
1264 /*
1265 * Creates array of compositions from decomposition array
1266 */
1267 static void
1268 create_comps(void)
1269 {
1270 ac_uint4 i, cu;
1271
1272 comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1273
1274 for (i = cu = 0; i < decomps_used; i++) {
1275 if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1276 continue;
1277 comps[cu].comp = decomps[i].code;
1278 comps[cu].count = 2;
1279 comps[cu].code1 = decomps[i].decomp[0];
1280 comps[cu].code2 = decomps[i].decomp[1];
1281 cu++;
1282 }
1283 comps_used = cu;
1284 qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
1285 }
1286
1287 #endif
1288
#if HARDCODE_DATA
/*
 * Write num entries of the case table tab to out as C initializer text,
 * one "key, other1, other2" triple per line. A comma is written in front
 * of every triple except the very first one when first is non-zero, so
 * several tables can be emitted back to back into one initializer.
 */
static void
write_case(FILE *out, _case_t *tab, int num, int first)
{
    _case_t *entry = tab;
    int remaining;

    for (remaining = num; remaining > 0; remaining--, entry++) {
        if (!first)
            fprintf(out, ",");
        first = 0;
        fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
                (unsigned long) entry->key, (unsigned long) entry->other1,
                (unsigned long) entry->other2);
    }
}

/* Storage-class prefix placed in front of every generated declaration. */
#define PREF "static const "

#endif
1307
1308 static void
write_cdata(char * opath)1309 write_cdata(char *opath)
1310 {
1311 FILE *out;
1312 ac_uint4 bytes;
1313 ac_uint4 i, idx, nprops;
1314 #if !(HARDCODE_DATA)
1315 ac_uint2 casecnt[2];
1316 #endif
1317 char path[BUFSIZ];
1318 #if HARDCODE_DATA
1319 int j, k;
1320
1321 /*****************************************************************
1322 *
1323 * Generate the ctype data.
1324 *
1325 *****************************************************************/
1326
1327 /*
1328 * Open the output file.
1329 */
1330 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
1331 if ((out = fopen(path, "w")) == 0)
1332 return;
1333 #else
1334 /*
1335 * Open the ctype.dat file.
1336 */
1337 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
1338 if ((out = fopen(path, "wb")) == 0)
1339 return;
1340 #endif
1341
1342 /*
1343 * Collect the offsets for the properties. The offsets array is
1344 * on a 4-byte boundary to keep things efficient for architectures
1345 * that need such a thing.
1346 */
1347 for (i = idx = 0; i < NUMPROPS; i++) {
1348 propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1349 idx += proptbl[i].used;
1350 }
1351
1352 /*
1353 * Add the sentinel index which is used by the binary search as the upper
1354 * bound for a search.
1355 */
1356 propcnt[i] = idx;
1357
1358 /*
1359 * Record the actual number of property lists. This may be different than
1360 * the number of offsets actually written because of aligning on a 4-byte
1361 * boundary.
1362 */
1363 hdr[1] = NUMPROPS;
1364
1365 /*
1366 * Calculate the byte count needed and pad the property counts array to a
1367 * 4-byte boundary.
1368 */
1369 if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
1370 bytes += 4 - (bytes & 3);
1371 nprops = bytes / sizeof(ac_uint2);
1372 bytes += sizeof(ac_uint4) * idx;
1373
1374 #if HARDCODE_DATA
1375 fprintf(out,
1376 "/* This file was generated from a modified version UCData's ucgendat.\n"
1377 " *\n"
1378 " * DO NOT EDIT THIS FILE!\n"
1379 " * \n"
1380 " * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download\n"
1381 " * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt\n"
1382 " * files from http://www.unicode.org/Public/ and run this program.\n"
1383 " *\n"
1384 " * More information can be found in the UCData package. Unfortunately,\n"
1385 " * the project's page doesn't seem to be live anymore, so you can use\n"
1386 " * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */\n\n");
1387
1388 fprintf(out, PREF "unsigned short _ucprop_size = %d;\n\n", NUMPROPS);
1389
1390 fprintf(out, PREF "unsigned short _ucprop_offsets[] = {");
1391
1392 for (i = 0; i<nprops; i++) {
1393 if (i) fprintf(out, ",");
1394 if (!(i&7)) fprintf(out, "\n\t");
1395 else fprintf(out, " ");
1396 fprintf(out, "0x%04x", propcnt[i]);
1397 }
1398 fprintf(out, "\n};\n\n");
1399
1400 fprintf(out, PREF "unsigned int _ucprop_ranges[] = {");
1401
1402 k = 0;
1403 for (i = 0; i < NUMPROPS; i++) {
1404 if (proptbl[i].used > 0) {
1405 for (j=0; j<proptbl[i].used; j++) {
1406 if (k) fprintf(out, ",");
1407 if (!(k&3)) fprintf(out,"\n\t");
1408 else fprintf(out, " ");
1409 k++;
1410 fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
1411 }
1412 }
1413 }
1414 fprintf(out, "\n};\n\n");
1415 #else
1416 /*
1417 * Write the header.
1418 */
1419 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1420
1421 /*
1422 * Write the byte count.
1423 */
1424 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1425
1426 /*
1427 * Write the property list counts.
1428 */
1429 fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
1430
1431 /*
1432 * Write the property lists.
1433 */
1434 for (i = 0; i < NUMPROPS; i++) {
1435 if (proptbl[i].used > 0)
1436 fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
1437 proptbl[i].used, out);
1438 }
1439
1440 fclose(out);
1441 #endif
1442
1443 /*****************************************************************
1444 *
1445 * Generate the case mapping data.
1446 *
1447 *****************************************************************/
1448
1449 #if HARDCODE_DATA
1450 fprintf(out, PREF "unsigned int _uccase_size = %ld;\n\n",
1451 (long) (upper_used + lower_used + title_used));
1452
1453 fprintf(out,
1454 "/* Starting indexes of the case tables\n"
1455 " * UpperIndex = 0\n"
1456 " * LowerIndex = _uccase_len[0]\n"
1457 " * TitleIndex = LowerIndex + _uccase_len[1] */\n\n");
1458 fprintf(out, PREF "unsigned short _uccase_len[2] = {%ld, %ld};\n\n",
1459 (long) upper_used, (long) lower_used);
1460 fprintf(out, PREF "unsigned int _uccase_map[] = {");
1461
1462 if (upper_used > 0)
1463 /*
1464 * Write the upper case table.
1465 */
1466 write_case(out, upper, upper_used, 1);
1467
1468 if (lower_used > 0)
1469 /*
1470 * Write the lower case table.
1471 */
1472 write_case(out, lower, lower_used, !upper_used);
1473
1474 if (title_used > 0)
1475 /*
1476 * Write the title case table.
1477 */
1478 write_case(out, title, title_used, !(upper_used||lower_used));
1479
1480 if (!(upper_used || lower_used || title_used))
1481 fprintf(out, "\t0");
1482
1483 fprintf(out, "\n};\n\n");
1484 #else
1485 /*
1486 * Open the case.dat file.
1487 */
1488 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
1489 if ((out = fopen(path, "wb")) == 0)
1490 return;
1491
1492 /*
1493 * Write the case mapping tables.
1494 */
1495 hdr[1] = upper_used + lower_used + title_used;
1496 casecnt[0] = upper_used;
1497 casecnt[1] = lower_used;
1498
1499 /*
1500 * Write the header.
1501 */
1502 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1503
1504 /*
1505 * Write the upper and lower case table sizes.
1506 */
1507 fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
1508
1509 if (upper_used > 0)
1510 /*
1511 * Write the upper case table.
1512 */
1513 fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1514
1515 if (lower_used > 0)
1516 /*
1517 * Write the lower case table.
1518 */
1519 fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1520
1521 if (title_used > 0)
1522 /*
1523 * Write the title case table.
1524 */
1525 fwrite((char *) title, sizeof(_case_t), title_used, out);
1526
1527 fclose(out);
1528 #endif
1529
1530 #if 0
1531
1532 /*****************************************************************
1533 *
1534 * Generate the composition data.
1535 *
1536 *****************************************************************/
1537
1538 /*
1539 * Create compositions from decomposition data
1540 */
1541 create_comps();
1542
1543 #if HARDCODE_DATA
1544 fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
1545 comps_used * 4L);
1546
1547 fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
1548
1549 /*
1550 * Now, if comps exist, write them out.
1551 */
1552 if (comps_used > 0) {
1553 for (i=0; i<comps_used; i++) {
1554 if (i) fprintf(out, ",");
1555 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
1556 (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
1557 (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
1558 }
1559 } else {
1560 fprintf(out, "\t0");
1561 }
1562 fprintf(out, "\n};\n\n");
1563 #else
1564 /*
1565 * Open the comp.dat file.
1566 */
1567 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
1568 if ((out = fopen(path, "wb")) == 0)
1569 return;
1570
1571 /*
1572 * Write the header.
1573 */
1574 hdr[1] = (ac_uint2) comps_used * 4;
1575 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1576
1577 /*
1578 * Write out the byte count to maintain header size.
1579 */
1580 bytes = comps_used * sizeof(_comp_t);
1581 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1582
1583 /*
1584 * Now, if comps exist, write them out.
1585 */
1586 if (comps_used > 0)
1587 fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1588
1589 fclose(out);
1590 #endif
1591
1592 /*****************************************************************
1593 *
1594 * Generate the decomposition data.
1595 *
1596 *****************************************************************/
1597
1598 /*
1599 * Fully expand all decompositions before generating the output file.
1600 */
1601 expand_decomp();
1602
1603 #if HARDCODE_DATA
1604 fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
1605 decomps_used * 2L);
1606
1607 fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
1608
1609 if (decomps_used) {
1610 /*
1611 * Write the list of decomp nodes.
1612 */
1613 for (i = idx = 0; i < decomps_used; i++) {
1614 fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1615 (unsigned long) decomps[i].code, (unsigned long) idx);
1616 idx += decomps[i].used;
1617 }
1618
1619 /*
1620 * Write the sentinel index as the last decomp node.
1621 */
1622 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1623
1624 fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
1625 /*
1626 * Write the decompositions themselves.
1627 */
1628 k = 0;
1629 for (i = 0; i < decomps_used; i++)
1630 for (j=0; j<decomps[i].used; j++) {
1631 if (k) fprintf(out, ",");
1632 if (!(k&3)) fprintf(out,"\n\t");
1633 else fprintf(out, " ");
1634 k++;
1635 fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
1636 }
1637 fprintf(out, "\n};\n\n");
1638 }
1639 #else
1640 /*
1641 * Open the decomp.dat file.
1642 */
1643 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
1644 if ((out = fopen(path, "wb")) == 0)
1645 return;
1646
1647 hdr[1] = decomps_used;
1648
1649 /*
1650 * Write the header.
1651 */
1652 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1653
1654 /*
1655 * Write a temporary byte count which will be calculated as the
1656 * decompositions are written out.
1657 */
1658 bytes = 0;
1659 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1660
1661 if (decomps_used) {
1662 /*
1663 * Write the list of decomp nodes.
1664 */
1665 for (i = idx = 0; i < decomps_used; i++) {
1666 fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
1667 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1668 idx += decomps[i].used;
1669 }
1670
1671 /*
1672 * Write the sentinel index as the last decomp node.
1673 */
1674 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1675
1676 /*
1677 * Write the decompositions themselves.
1678 */
1679 for (i = 0; i < decomps_used; i++)
1680 fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
1681 decomps[i].used, out);
1682
1683 /*
1684 * Seek back to the beginning and write the byte count.
1685 */
1686 bytes = (sizeof(ac_uint4) * idx) +
1687 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1688 fseek(out, sizeof(ac_uint2) << 1, 0L);
1689 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1690
1691 fclose(out);
1692 }
1693 #endif
1694
1695 #ifdef HARDCODE_DATA
1696 fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
1697 kdecomps_used * 2L);
1698
1699 fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
1700
1701 if (kdecomps_used) {
1702 /*
1703 * Write the list of kdecomp nodes.
1704 */
1705 for (i = idx = 0; i < kdecomps_used; i++) {
1706 fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1707 (unsigned long) kdecomps[i].code, (unsigned long) idx);
1708 idx += kdecomps[i].used;
1709 }
1710
1711 /*
1712 * Write the sentinel index as the last decomp node.
1713 */
1714 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1715
1716 fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
1717
1718 /*
1719 * Write the decompositions themselves.
1720 */
1721 k = 0;
1722 for (i = 0; i < kdecomps_used; i++)
1723 for (j=0; j<kdecomps[i].used; j++) {
1724 if (k) fprintf(out, ",");
1725 if (!(k&3)) fprintf(out,"\n\t");
1726 else fprintf(out, " ");
1727 k++;
1728 fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
1729 }
1730 fprintf(out, "\n};\n\n");
1731 }
1732 #else
1733 /*
1734 * Open the kdecomp.dat file.
1735 */
1736 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
1737 if ((out = fopen(path, "wb")) == 0)
1738 return;
1739
1740 hdr[1] = kdecomps_used;
1741
1742 /*
1743 * Write the header.
1744 */
1745 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1746
1747 /*
1748 * Write a temporary byte count which will be calculated as the
1749 * decompositions are written out.
1750 */
1751 bytes = 0;
1752 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1753
1754 if (kdecomps_used) {
1755 /*
1756 * Write the list of kdecomp nodes.
1757 */
1758 for (i = idx = 0; i < kdecomps_used; i++) {
1759 fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
1760 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1761 idx += kdecomps[i].used;
1762 }
1763
1764 /*
1765 * Write the sentinel index as the last decomp node.
1766 */
1767 fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1768
1769 /*
1770 * Write the decompositions themselves.
1771 */
1772 for (i = 0; i < kdecomps_used; i++)
1773 fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
1774 kdecomps[i].used, out);
1775
1776 /*
1777 * Seek back to the beginning and write the byte count.
1778 */
1779 bytes = (sizeof(ac_uint4) * idx) +
1780 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1781 fseek(out, sizeof(ac_uint2) << 1, 0L);
1782 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1783
1784 fclose(out);
1785 }
1786 #endif
1787
1788 /*****************************************************************
1789 *
1790 * Generate the combining class data.
1791 *
1792 *****************************************************************/
1793 #ifdef HARDCODE_DATA
1794 fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
1795
1796 fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
1797
1798 if (ccl_used > 0) {
1799 /*
1800 * Write the combining class ranges out.
1801 */
1802 for (i = 0; i<ccl_used; i++) {
1803 if (i) fprintf(out, ",");
1804 if (!(i&3)) fprintf(out, "\n\t");
1805 else fprintf(out, " ");
1806 fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
1807 }
1808 } else {
1809 fprintf(out, "\t0");
1810 }
1811 fprintf(out, "\n};\n\n");
1812 #else
1813 /*
1814 * Open the cmbcl.dat file.
1815 */
1816 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
1817 if ((out = fopen(path, "wb")) == 0)
1818 return;
1819
1820 /*
1821 * Set the number of ranges used. Each range has a combining class which
1822 * means each entry is a 3-tuple.
1823 */
1824 hdr[1] = ccl_used / 3;
1825
1826 /*
1827 * Write the header.
1828 */
1829 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1830
1831 /*
1832 * Write out the byte count to maintain header size.
1833 */
1834 bytes = ccl_used * sizeof(ac_uint4);
1835 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1836
1837 if (ccl_used > 0)
1838 /*
1839 * Write the combining class ranges out.
1840 */
1841 fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
1842
1843 fclose(out);
1844 #endif
1845
1846 /*****************************************************************
1847 *
1848 * Generate the number data.
1849 *
1850 *****************************************************************/
1851
1852 #if HARDCODE_DATA
1853 fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
1854 (unsigned long)ncodes_used<<1);
1855
1856 fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
1857
1858 /*
1859 * Now, if number mappings exist, write them out.
1860 */
1861 if (ncodes_used > 0) {
1862 for (i = 0; i<ncodes_used; i++) {
1863 if (i) fprintf(out, ",");
1864 if (!(i&1)) fprintf(out, "\n\t");
1865 else fprintf(out, " ");
1866 fprintf(out, "0x%08lx, 0x%08lx",
1867 (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
1868 }
1869 fprintf(out, "\n};\n\n");
1870
1871 fprintf(out, PREF "short _ucnum_vals[] = {");
1872 for (i = 0; i<nums_used; i++) {
1873 if (i) fprintf(out, ",");
1874 if (!(i&3)) fprintf(out, "\n\t");
1875 else fprintf(out, " ");
1876 if (nums[i].numerator < 0) {
1877 fprintf(out, "%6d, 0x%04x",
1878 nums[i].numerator, nums[i].denominator);
1879 } else {
1880 fprintf(out, "0x%04x, 0x%04x",
1881 nums[i].numerator, nums[i].denominator);
1882 }
1883 }
1884 fprintf(out, "\n};\n\n");
1885 }
1886 #else
1887 /*
1888 * Open the num.dat file.
1889 */
1890 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
1891 if ((out = fopen(path, "wb")) == 0)
1892 return;
1893
1894 /*
1895 * The count part of the header will be the total number of codes that
1896 * have numbers.
1897 */
1898 hdr[1] = (ac_uint2) (ncodes_used << 1);
1899 bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1900
1901 /*
1902 * Write the header.
1903 */
1904 fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1905
1906 /*
1907 * Write out the byte count to maintain header size.
1908 */
1909 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1910
1911 /*
1912 * Now, if number mappings exist, write them out.
1913 */
1914 if (ncodes_used > 0) {
1915 fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1916 fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1917 }
1918 #endif
1919
1920 #endif
1921
1922 fclose(out);
1923 }
1924
/*
 * Print the command line synopsis and a description of the -o and -x
 * options to stderr, then terminate the process with exit status 1.
 *
 * `prog` is the name shown in the synopsis line.
 */
static void
usage(char *prog)
{
    /* Synopsis line, followed by an empty line. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]"
            " datafile1 datafile2 ...\n\n",
            prog);

    /* Option descriptions; no conversions needed, so write them verbatim. */
    fputs("-o output-directory\n\t\tWrite the output files to a different"
          " directory (default: .).\n"
          "-x composition-exclusion\n\t\tFile of composition codes"
          " that should be excluded.\n",
          stderr);

    exit(1);
}
1939
/*
 * Program entry point.
 *
 * Arguments are processed in order:
 *   -o DIR   set the output directory passed to write_cdata() (default ".")
 *   -x FILE  open FILE and feed it to read_compexdata()
 *   FILE     any other argument is opened and fed to read_cdata()
 *
 * An unknown option, or -o / -x without a following argument, prints the
 * usage text and exits.  A file that cannot be opened is reported on stderr
 * and skipped.  After all arguments are consumed the tables are written
 * with write_cdata().
 *
 * Returns 0.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    /*
     * The program name is argv[0], not argv[1] (which is the first
     * argument, or NULL when there is none).  Fall back to a fixed name
     * when the environment supplies no argv[0], so diagnostics never
     * pass NULL to "%s".
     */
    prog = (argc > 0 && argv[0] != 0) ? argv[0] : "ucgendat";

    opath = 0;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /* The option needs a value after it. */
                if (argc < 2)
                    usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /* Without this check fopen() would be handed NULL. */
                if (argc < 2)
                    usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                    fprintf(stderr,
                            "%s: unable to open composition exclusion file %s\n",
                            prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                }
                break;
              default:
                usage(prog);
            }
        } else {
            if ((in = fopen(argv[0], "r")) == 0)
                fprintf(stderr, "%s: unable to open ctype file %s\n",
                        prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
        opath = ".";
    write_cdata(opath);

    return 0;
}
2000