xref: /PHP-5.5/ext/mbstring/ucgendat/ucgendat.c (revision 99807e9a)
1 /* Further modified for PHP */
2 /* $Id$ */
3 
4 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.36.2.4 2007/01/02 21:43:51 kurt Exp $ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2007 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 
18 /* Copyright 2001 Computing Research Labs, New Mexico State University
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining a
21  * copy of this software and associated documentation files (the "Software"),
22  * to deal in the Software without restriction, including without limitation
23  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24  * and/or sell copies of the Software, and to permit persons to whom the
25  * Software is furnished to do so, subject to the following conditions:
26  *
27  * The above copyright notice and this permission notice shall be included in
28  * all copies or substantial portions of the Software.
29  *
30  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
33  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37  */
38 /* orig Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
39 
40 #include <stdio.h>
41 #include <ctype.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45 
/*
 * Integer type aliases used throughout this file.  The names suggest 2- and
 * 4-byte unsigned integers; that holds only where unsigned short is 16 bits
 * and unsigned int is 32 bits (NOTE(review): not checked at compile time).
 */
#define ac_uint2 unsigned short
#define ac_uint4 unsigned int
/* Directory separator string; its users are outside this section. */
#define LDAP_DIRSEP "/"
/* Memory copy primitive; used here for non-overlapping copies. */
#define AC_MEMCPY memcpy

/* Defaults to 1 unless the build already defined it. */
#ifndef HARDCODE_DATA
#define	HARDCODE_DATA	1
#endif

/*
 * True when cc is an ASCII hexadecimal digit (0-9, A-F, a-f).  The argument
 * is evaluated several times, so it must not have side effects.
 */
#undef ishdigit
#define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
                      ((cc) >= 'A' && (cc) <= 'F') ||\
                      ((cc) >= 'a' && (cc) <= 'f'))
59 
/*
 * A header written to the output file with the byte-order-mark and the number
 * of property nodes.  The first element is the byte-order-mark 0xfeff; the
 * second is initialised to 0 here.
 */
static ac_uint2 hdr[2] = {0xfeff, 0};

/* Number of entries in the props[] and proptbl[] tables. */
#define NUMPROPS 50
/*
 * NUMPROPS padded up to a multiple of 4 (50 -> 52).  Note that a NUMPROPS
 * that is already a multiple of 4 would still be increased by 4.
 */
#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
68 
/* A property name together with its length in bytes. */
typedef struct {
    char *name;     /* property abbreviation, e.g. "Lu" or "L" */
    int len;        /* number of significant bytes in name */
} _prop_t;

/*
 * List of properties expected to be found in the Unicode Character Database
 * including some implementation specific properties.
 *
 * The implementation specific properties are:
 * Cm = Composed (can be decomposed)
 * Nb = Non-breaking
 * Sy = Symmetric (has left and right forms)
 * Hd = Hex digit
 * Qm = Quote marks
 * Mr = Mirroring
 * Ss = Space, other
 * Cp = Defined character
 *
 * The position of a name in this array is also the index of its range list
 * in proptbl[], so the order of the entries matters.
 */
static _prop_t props[NUMPROPS] = {
    {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
    {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
    {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
    {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
    {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
    {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
    {"S",  1}, {"WS", 2}, {"ON", 2},
    {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
    {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
};
99 
/*
 * A growable list of code ranges, stored flat as (start, end) pairs with
 * inclusive bounds.
 */
typedef struct {
    ac_uint4 *ranges;   /* ranges[2k] = start, ranges[2k + 1] = end */
    ac_uint2 used;      /* number of array slots in use (2 per range) */
    ac_uint2 size;      /* number of array slots allocated (grown by 8) */
} _ranges_t;

/* One range list per property, indexed like props[]. */
static _ranges_t proptbl[NUMPROPS];

/*
 * Make sure this array is sized to be on a 4-byte boundary at compile time.
 */
static ac_uint2 propcnt[NEEDPROPS];
112 
/*
 * Array used to collect a decomposition before adding it to the decomposition
 * table.  dectmp_size is the number of codes currently held in dectmp.
 */
static ac_uint4 dectmp[64];
static ac_uint4 dectmp_size;

/* One character together with the codes it decomposes into. */
typedef struct {
    ac_uint4 code;      /* the character being decomposed */
    ac_uint2 size;      /* number of slots allocated in decomp */
    ac_uint2 used;      /* number of codes stored in decomp */
    ac_uint4 *decomp;   /* the decomposition codes */
} _decomp_t;

/*
 * List of decomposition.  Created and expanded in order as the characters are
 * encountered. First list contains canonical mappings, second also includes
 * compatibility mappings.
 *
 * For each list, *_used counts the entries in use and *_size the entries
 * allocated; both lists are kept sorted by code.
 */
static _decomp_t *decomps;
static ac_uint4 decomps_used;
static ac_uint4 decomps_size;

static _decomp_t *kdecomps;
static ac_uint4 kdecomps_used;
static ac_uint4 kdecomps_size;
139 
140 /*
141  * Composition exclusion table stuff.
142  */
143 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
144 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
145 static ac_uint4 compexs[8192];
146 
/*
 * Struct for holding a composition pair, and array of composition pairs
 *
 * NOTE(review): the code that fills these fields is outside this section;
 * presumably comp is the composed character and code1/code2 the pair it is
 * built from -- confirm against the code that uses _comp_t.
 */
typedef struct {
    ac_uint4 comp;
    ac_uint4 count;
    ac_uint4 code1;
    ac_uint4 code2;
} _comp_t;

#if 0
static _comp_t *comps;
#endif
/*
 * Incremented by add_decomp() each time a canonical decomposition of exactly
 * two codes is added.
 */
static ac_uint4 comps_used;
161 
/*
 * Types and lists for handling lists of case mappings.
 *
 * Each list is sorted by key.  The meaning of other1/other2 depends on the
 * list the entry lives in:
 *   upper: key = upper, other1 = lower, other2 = title
 *   lower: key = lower, other1 = upper, other2 = title
 *   title: key = title, other1 = upper, other2 = lower
 */
typedef struct {
    ac_uint4 key;
    ac_uint4 other1;
    ac_uint4 other2;
} _case_t;

static _case_t *upper;
static _case_t *lower;
static _case_t *title;
/* For each list: *_used = entries in use, *_size = entries allocated. */
static ac_uint4 upper_used;
static ac_uint4 upper_size;
static ac_uint4 lower_used;
static ac_uint4 lower_size;
static ac_uint4 title_used;
static ac_uint4 title_size;

/*
 * Array used to collect case mappings before adding them to a list.
 * cases[0] is the upper case, cases[1] the lower case and cases[2] the
 * title case code.
 */
static ac_uint4 cases[3];

/*
 * An array to hold ranges for combining classes.  Stored flat as triples
 * (first code, last code, combining class); ccl_used and ccl_size count
 * array slots, three per range.
 */
static ac_uint4 *ccl;
static ac_uint4 ccl_used;
static ac_uint4 ccl_size;
192 
193 /*
194  * Structures for handling numbers.
195  */
196 typedef struct {
197     ac_uint4 code;
198     ac_uint4 idx;
199 } _codeidx_t;
200 
201 typedef struct {
202     short numerator;
203     short denominator;
204 } _num_t;
205 
206 /*
207  * Arrays to hold the mapping of codes to numbers.
208  */
209 static _codeidx_t *ncodes;
210 static ac_uint4 ncodes_used;
211 static ac_uint4 ncodes_size;
212 
213 static _num_t *nums;
214 static ac_uint4 nums_used;
215 static ac_uint4 nums_size;
216 
217 /*
218  * Array for holding numbers.
219  */
220 static _num_t *nums;
221 static ac_uint4 nums_used;
222 static ac_uint4 nums_size;
223 
224 static void
add_range(ac_uint4 start,ac_uint4 end,char * p1,char * p2)225 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
226 {
227     int i, j, k, len;
228     _ranges_t *rlp;
229     char *name;
230 
231     for (k = 0; k < 2; k++) {
232         if (k == 0) {
233             name = p1;
234             len = 2;
235         } else {
236             if (p2 == 0)
237               break;
238 
239             name = p2;
240             len = 1;
241         }
242 
243         for (i = 0; i < NUMPROPS; i++) {
244             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
245               break;
246         }
247 
248         if (i == NUMPROPS)
249           continue;
250 
251         rlp = &proptbl[i];
252 
253         /*
254          * Resize the range list if necessary.
255          */
256         if (rlp->used == rlp->size) {
257             if (rlp->size == 0)
258               rlp->ranges = (ac_uint4 *)
259                   malloc(sizeof(ac_uint4) << 3);
260             else
261               rlp->ranges = (ac_uint4 *)
262                   realloc((char *) rlp->ranges,
263                           sizeof(ac_uint4) * (rlp->size + 8));
264             rlp->size += 8;
265         }
266 
267         /*
268          * If this is the first code for this property list, just add it
269          * and return.
270          */
271         if (rlp->used == 0) {
272             rlp->ranges[0] = start;
273             rlp->ranges[1] = end;
274             rlp->used += 2;
275             continue;
276         }
277 
278         /*
279          * Optimize the case of adding the range to the end.
280          */
281         j = rlp->used - 1;
282         if (start > rlp->ranges[j]) {
283             j = rlp->used;
284             rlp->ranges[j++] = start;
285             rlp->ranges[j++] = end;
286             rlp->used = j;
287             continue;
288         }
289 
290         /*
291          * Need to locate the insertion point.
292          */
293         for (i = 0;
294              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
295 
296         /*
297          * If the start value lies in the current range, then simply set the
298          * new end point of the range to the end value passed as a parameter.
299          */
300         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
301             rlp->ranges[i + 1] = end;
302             return;
303         }
304 
305         /*
306          * Shift following values up by two.
307          */
308         for (j = rlp->used; j > i; j -= 2) {
309             rlp->ranges[j] = rlp->ranges[j - 2];
310             rlp->ranges[j + 1] = rlp->ranges[j - 1];
311         }
312 
313         /*
314          * Add the new range at the insertion point.
315          */
316         rlp->ranges[i] = start;
317         rlp->ranges[i + 1] = end;
318         rlp->used += 2;
319     }
320 }
321 
322 static void
ordered_range_insert(ac_uint4 c,char * name,int len)323 ordered_range_insert(ac_uint4 c, char *name, int len)
324 {
325     int i, j;
326     ac_uint4 s, e;
327     _ranges_t *rlp;
328 
329     if (len == 0)
330       return;
331 
332     /*
333      * Deal with directionality codes introduced in Unicode 3.0.
334      */
335     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
336         (len == 3 &&
337          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
338           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
339           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
340         /*
341          * Mark all of these as Other Neutral to preserve compatibility with
342          * older versions.
343          */
344         len = 2;
345         name = "ON";
346     }
347 
348     for (i = 0; i < NUMPROPS; i++) {
349         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
350           break;
351     }
352 
353     if (i == NUMPROPS)
354       return;
355 
356     /*
357      * Have a match, so insert the code in order.
358      */
359     rlp = &proptbl[i];
360 
361     /*
362      * Resize the range list if necessary.
363      */
364     if (rlp->used == rlp->size) {
365         if (rlp->size == 0)
366           rlp->ranges = (ac_uint4 *)
367               malloc(sizeof(ac_uint4) << 3);
368         else
369           rlp->ranges = (ac_uint4 *)
370               realloc((char *) rlp->ranges,
371                       sizeof(ac_uint4) * (rlp->size + 8));
372         rlp->size += 8;
373     }
374 
375     /*
376      * If this is the first code for this property list, just add it
377      * and return.
378      */
379     if (rlp->used == 0) {
380         rlp->ranges[0] = rlp->ranges[1] = c;
381         rlp->used += 2;
382         return;
383     }
384 
385     /*
386      * Optimize the cases of extending the last range and adding new ranges to
387      * the end.
388      */
389     j = rlp->used - 1;
390     e = rlp->ranges[j];
391     s = rlp->ranges[j - 1];
392 
393     if (c == e + 1) {
394         /*
395          * Extend the last range.
396          */
397         rlp->ranges[j] = c;
398         return;
399     }
400 
401     if (c > e + 1) {
402         /*
403          * Start another range on the end.
404          */
405         j = rlp->used;
406         rlp->ranges[j] = rlp->ranges[j + 1] = c;
407         rlp->used += 2;
408         return;
409     }
410 
411     if (c >= s)
412       /*
413        * The code is a duplicate of a code in the last range, so just return.
414        */
415       return;
416 
417     /*
418      * The code should be inserted somewhere before the last range in the
419      * list.  Locate the insertion point.
420      */
421     for (i = 0;
422          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
423 
424     s = rlp->ranges[i];
425     e = rlp->ranges[i + 1];
426 
427     if (c == e + 1)
428       /*
429        * Simply extend the current range.
430        */
431       rlp->ranges[i + 1] = c;
432     else if (c < s) {
433         /*
434          * Add a new entry before the current location.  Shift all entries
435          * before the current one up by one to make room.
436          */
437         for (j = rlp->used; j > i; j -= 2) {
438             rlp->ranges[j] = rlp->ranges[j - 2];
439             rlp->ranges[j + 1] = rlp->ranges[j - 1];
440         }
441         rlp->ranges[i] = rlp->ranges[i + 1] = c;
442 
443         rlp->used += 2;
444     }
445 }
446 
447 static void
add_decomp(ac_uint4 code,short compat)448 add_decomp(ac_uint4 code, short compat)
449 {
450     ac_uint4 i, j, size;
451     _decomp_t **pdecomps;
452     ac_uint4 *pdecomps_used;
453     ac_uint4 *pdecomps_size;
454 
455     if (compat) {
456 	pdecomps = &kdecomps;
457 	pdecomps_used = &kdecomps_used;
458 	pdecomps_size = &kdecomps_size;
459     } else {
460 	pdecomps = &decomps;
461 	pdecomps_used = &decomps_used;
462 	pdecomps_size = &decomps_size;
463     }
464 
465     /*
466      * Add the code to the composite property.
467      */
468     if (!compat) {
469 	ordered_range_insert(code, "Cm", 2);
470     }
471 
472     /*
473      * Locate the insertion point for the code.
474      */
475     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
476 
477     /*
478      * Allocate space for a new decomposition.
479      */
480     if (*pdecomps_used == *pdecomps_size) {
481         if (*pdecomps_size == 0)
482           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
483         else
484           *pdecomps = (_decomp_t *)
485               realloc((char *) *pdecomps,
486                       sizeof(_decomp_t) * (*pdecomps_size + 8));
487         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
488                       sizeof(_decomp_t) << 3);
489         *pdecomps_size += 8;
490     }
491 
492     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
493         /*
494          * Shift the decomps up by one if the codes don't match.
495          */
496         for (j = *pdecomps_used; j > i; j--)
497           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
498                         sizeof(_decomp_t));
499     }
500 
501     /*
502      * Insert or replace a decomposition.
503      */
504     size = dectmp_size + (4 - (dectmp_size & 3));
505     if ((*pdecomps)[i].size < size) {
506         if ((*pdecomps)[i].size == 0)
507           (*pdecomps)[i].decomp = (ac_uint4 *)
508               malloc(sizeof(ac_uint4) * size);
509         else
510           (*pdecomps)[i].decomp = (ac_uint4 *)
511               realloc((char *) (*pdecomps)[i].decomp,
512                       sizeof(ac_uint4) * size);
513         (*pdecomps)[i].size = size;
514     }
515 
516     if ((*pdecomps)[i].code != code)
517       (*pdecomps_used)++;
518 
519     (*pdecomps)[i].code = code;
520     (*pdecomps)[i].used = dectmp_size;
521     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
522                   sizeof(ac_uint4) * dectmp_size);
523 
524     /*
525      * NOTICE: This needs changing later so it is more general than simply
526      * pairs.  This calculation is done here to simplify allocation elsewhere.
527      */
528     if (!compat && dectmp_size == 2)
529       comps_used++;
530 }
531 
532 static void
add_title(ac_uint4 code)533 add_title(ac_uint4 code)
534 {
535     ac_uint4 i, j;
536 
537     /*
538      * Always map the code to itself.
539      */
540     cases[2] = code;
541 
542     if (title_used == title_size) {
543         if (title_size == 0)
544           title = (_case_t *) malloc(sizeof(_case_t) << 3);
545         else
546           title = (_case_t *) realloc((char *) title,
547                                       sizeof(_case_t) * (title_size + 8));
548         title_size += 8;
549     }
550 
551     /*
552      * Locate the insertion point.
553      */
554     for (i = 0; i < title_used && code > title[i].key; i++) ;
555 
556     if (i < title_used) {
557         /*
558          * Shift the array up by one.
559          */
560         for (j = title_used; j > i; j--)
561           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
562                         sizeof(_case_t));
563     }
564 
565     title[i].key = cases[2];    /* Title */
566     title[i].other1 = cases[0]; /* Upper */
567     title[i].other2 = cases[1]; /* Lower */
568 
569     title_used++;
570 }
571 
572 static void
add_upper(ac_uint4 code)573 add_upper(ac_uint4 code)
574 {
575     ac_uint4 i, j;
576 
577     /*
578      * Always map the code to itself.
579      */
580     cases[0] = code;
581 
582     /*
583      * If the title case character is not present, then make it the same as
584      * the upper case.
585      */
586     if (cases[2] == 0)
587       cases[2] = code;
588 
589     if (upper_used == upper_size) {
590         if (upper_size == 0)
591           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
592         else
593           upper = (_case_t *) realloc((char *) upper,
594                                       sizeof(_case_t) * (upper_size + 8));
595         upper_size += 8;
596     }
597 
598     /*
599      * Locate the insertion point.
600      */
601     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
602 
603     if (i < upper_used) {
604         /*
605          * Shift the array up by one.
606          */
607         for (j = upper_used; j > i; j--)
608           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
609                         sizeof(_case_t));
610     }
611 
612     upper[i].key = cases[0];    /* Upper */
613     upper[i].other1 = cases[1]; /* Lower */
614     upper[i].other2 = cases[2]; /* Title */
615 
616     upper_used++;
617 }
618 
619 static void
add_lower(ac_uint4 code)620 add_lower(ac_uint4 code)
621 {
622     ac_uint4 i, j;
623 
624     /*
625      * Always map the code to itself.
626      */
627     cases[1] = code;
628 
629     /*
630      * If the title case character is empty, then make it the same as the
631      * upper case.
632      */
633     if (cases[2] == 0)
634       cases[2] = cases[0];
635 
636     if (lower_used == lower_size) {
637         if (lower_size == 0)
638           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
639         else
640           lower = (_case_t *) realloc((char *) lower,
641                                       sizeof(_case_t) * (lower_size + 8));
642         lower_size += 8;
643     }
644 
645     /*
646      * Locate the insertion point.
647      */
648     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
649 
650     if (i < lower_used) {
651         /*
652          * Shift the array up by one.
653          */
654         for (j = lower_used; j > i; j--)
655           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
656                         sizeof(_case_t));
657     }
658 
659     lower[i].key = cases[1];    /* Lower */
660     lower[i].other1 = cases[0]; /* Upper */
661     lower[i].other2 = cases[2]; /* Title */
662 
663     lower_used++;
664 }
665 
666 static void
ordered_ccl_insert(ac_uint4 c,ac_uint4 ccl_code)667 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
668 {
669     ac_uint4 i, j;
670 
671     if (ccl_used == ccl_size) {
672         if (ccl_size == 0)
673           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
674         else
675           ccl = (ac_uint4 *)
676               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
677         ccl_size += 24;
678     }
679 
680     /*
681      * Optimize adding the first item.
682      */
683     if (ccl_used == 0) {
684         ccl[0] = ccl[1] = c;
685         ccl[2] = ccl_code;
686         ccl_used += 3;
687         return;
688     }
689 
690     /*
691      * Handle the special case of extending the range on the end.  This
692      * requires that the combining class codes are the same.
693      */
694     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
695         ccl[ccl_used - 2] = c;
696         return;
697     }
698 
699     /*
700      * Handle the special case of adding another range on the end.
701      */
702     if (c > ccl[ccl_used - 2] + 1 ||
703         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
704         ccl[ccl_used++] = c;
705         ccl[ccl_used++] = c;
706         ccl[ccl_used++] = ccl_code;
707         return;
708     }
709 
710     /*
711      * Locate either the insertion point or range for the code.
712      */
713     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
714 
715     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
716         /*
717          * Extend an existing range.
718          */
719         ccl[i + 1] = c;
720         return;
721     } else if (c < ccl[i]) {
722         /*
723          * Start a new range before the current location.
724          */
725         for (j = ccl_used; j > i; j -= 3) {
726             ccl[j] = ccl[j - 3];
727             ccl[j - 1] = ccl[j - 4];
728             ccl[j - 2] = ccl[j - 5];
729         }
730         ccl[i] = ccl[i + 1] = c;
731         ccl[i + 2] = ccl_code;
732     }
733 }
734 
735 /*
736  * Adds a number if it does not already exist and returns an index value
737  * multiplied by 2.
738  */
739 static ac_uint4
make_number(short num,short denom)740 make_number(short num, short denom)
741 {
742     ac_uint4 n;
743 
744     /*
745      * Determine if the number already exists.
746      */
747     for (n = 0; n < nums_used; n++) {
748         if (nums[n].numerator == num && nums[n].denominator == denom)
749           return n << 1;
750     }
751 
752     if (nums_used == nums_size) {
753         if (nums_size == 0)
754           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
755         else
756           nums = (_num_t *) realloc((char *) nums,
757                                     sizeof(_num_t) * (nums_size + 8));
758         nums_size += 8;
759     }
760 
761     n = nums_used++;
762     nums[n].numerator = num;
763     nums[n].denominator = denom;
764 
765     return n << 1;
766 }
767 
768 static void
add_number(ac_uint4 code,short num,short denom)769 add_number(ac_uint4 code, short num, short denom)
770 {
771     ac_uint4 i, j;
772 
773     /*
774      * Insert the code in order.
775      */
776     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
777 
778     /*
779      * Handle the case of the codes matching and simply replace the number
780      * that was there before.
781      */
782     if (i < ncodes_used && code == ncodes[i].code) {
783         ncodes[i].idx = make_number(num, denom);
784         return;
785     }
786 
787     /*
788      * Resize the array if necessary.
789      */
790     if (ncodes_used == ncodes_size) {
791         if (ncodes_size == 0)
792           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
793         else
794           ncodes = (_codeidx_t *)
795               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
796 
797         ncodes_size += 8;
798     }
799 
800     /*
801      * Shift things around to insert the code if necessary.
802      */
803     if (i < ncodes_used) {
804         for (j = ncodes_used; j > i; j--) {
805             ncodes[j].code = ncodes[j - 1].code;
806             ncodes[j].idx = ncodes[j - 1].idx;
807         }
808     }
809     ncodes[i].code = code;
810     ncodes[i].idx = make_number(num, denom);
811 
812     ncodes_used++;
813 }
814 
815 /*
816  * This routine assumes that the line is a valid Unicode Character Database
817  * entry.
818  */
819 static void
read_cdata(FILE * in)820 read_cdata(FILE *in)
821 {
822     ac_uint4 i, lineno, skip, code, ccl_code;
823     short wnum, neg, number[2], compat;
824     char line[512], *s, *e;
825 
826     lineno = skip = 0;
827     while (fgets(line, sizeof(line), in)) {
828 	if( (s=strchr(line, '\n')) ) *s = '\0';
829         lineno++;
830 
831         /*
832          * Skip blank lines and lines that start with a '#'.
833          */
834         if (line[0] == 0 || line[0] == '#')
835           continue;
836 
837         /*
838          * If lines need to be skipped, do it here.
839          */
840         if (skip) {
841             skip--;
842             continue;
843         }
844 
845         /*
846          * Collect the code.  The code can be up to 6 hex digits in length to
847          * allow surrogates to be specified.
848          */
849         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
850             code <<= 4;
851             if (*s >= '0' && *s <= '9')
852               code += *s - '0';
853             else if (*s >= 'A' && *s <= 'F')
854               code += (*s - 'A') + 10;
855             else if (*s >= 'a' && *s <= 'f')
856               code += (*s - 'a') + 10;
857         }
858 
859         /*
860          * Handle the following special cases:
861          * 1. 4E00-9FA5 CJK Ideographs.
862          * 2. AC00-D7A3 Hangul Syllables.
863          * 3. D800-DFFF Surrogates.
864          * 4. E000-F8FF Private Use Area.
865          * 5. F900-FA2D Han compatibility.
866 	 * ...Plus additional ranges in newer Unicode versions...
867          */
868         switch (code) {
869 	  case 0x3400:
870 	    /* CJK Ideograph Extension A */
871             add_range(0x3400, 0x4db5, "Lo", "L");
872 
873             add_range(0x3400, 0x4db5, "Cp", 0);
874 
875 	    skip = 1;
876 	    break;
877           case 0x4e00:
878             /*
879              * The Han ideographs.
880              */
881             add_range(0x4e00, 0x9fff, "Lo", "L");
882 
883             /*
884              * Add the characters to the defined category.
885              */
886             add_range(0x4e00, 0x9fa5, "Cp", 0);
887 
888             skip = 1;
889             break;
890           case 0xac00:
891             /*
892              * The Hangul syllables.
893              */
894             add_range(0xac00, 0xd7a3, "Lo", "L");
895 
896             /*
897              * Add the characters to the defined category.
898              */
899             add_range(0xac00, 0xd7a3, "Cp", 0);
900 
901             skip = 1;
902             break;
903           case 0xd800:
904             /*
905              * Make a range of all surrogates and assume some default
906              * properties.
907              */
908             add_range(0x010000, 0x10ffff, "Cs", "L");
909             skip = 5;
910             break;
911           case 0xe000:
912             /*
913              * The Private Use area.  Add with a default set of properties.
914              */
915             add_range(0xe000, 0xf8ff, "Co", "L");
916             skip = 1;
917             break;
918           case 0xf900:
919             /*
920              * The CJK compatibility area.
921              */
922             add_range(0xf900, 0xfaff, "Lo", "L");
923 
924             /*
925              * Add the characters to the defined category.
926              */
927             add_range(0xf900, 0xfaff, "Cp", 0);
928 
929             skip = 1;
930 	    break;
931 	  case 0x20000:
932 	    /* CJK Ideograph Extension B */
933             add_range(0x20000, 0x2a6d6, "Lo", "L");
934 
935             add_range(0x20000, 0x2a6d6, "Cp", 0);
936 
937 	    skip = 1;
938 	    break;
939 	  case 0xf0000:
940 	    /* Plane 15 private use */
941 	    add_range(0xf0000, 0xffffd, "Co", "L");
942 	    skip = 1;
943 	    break;
944 
945 	  case 0x100000:
946 	    /* Plane 16 private use */
947 	    add_range(0x100000, 0x10fffd, "Co", "L");
948 	    skip = 1;
949 	    break;
950         }
951 
952         if (skip)
953           continue;
954 
955         /*
956          * Add the code to the defined category.
957          */
958         ordered_range_insert(code, "Cp", 2);
959 
960         /*
961          * Locate the first character property field.
962          */
963         for (i = 0; *s != 0 && i < 2; s++) {
964             if (*s == ';')
965               i++;
966         }
967         for (e = s; *e && *e != ';'; e++) ;
968 
969         ordered_range_insert(code, s, e - s);
970 
971         /*
972          * Locate the combining class code.
973          */
974         for (s = e; *s != 0 && i < 3; s++) {
975             if (*s == ';')
976               i++;
977         }
978 
979         /*
980          * Convert the combining class code from decimal.
981          */
982         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
983           ccl_code = (ccl_code * 10) + (*e - '0');
984 
985         /*
986          * Add the code if it not 0.
987          */
988         if (ccl_code != 0)
989           ordered_ccl_insert(code, ccl_code);
990 
991         /*
992          * Locate the second character property field.
993          */
994         for (s = e; *s != 0 && i < 4; s++) {
995             if (*s == ';')
996               i++;
997         }
998         for (e = s; *e && *e != ';'; e++) ;
999 
1000         ordered_range_insert(code, s, e - s);
1001 
1002         /*
1003          * Check for a decomposition.
1004          */
1005         s = ++e;
1006         if (*s != ';') {
1007 	    compat = *s == '<';
1008 	    if (compat) {
1009 		/*
1010 		 * Skip compatibility formatting tag.
1011 		 */
1012 		while (*s++ != '>');
1013 	    }
1014             /*
1015              * Collect the codes of the decomposition.
1016              */
1017             for (dectmp_size = 0; *s != ';'; ) {
1018                 /*
1019                  * Skip all leading non-hex digits.
1020                  */
1021                 while (!ishdigit(*s))
1022  		  s++;
1023 
1024                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
1025                     dectmp[dectmp_size] <<= 4;
1026                     if (*s >= '0' && *s <= '9')
1027                       dectmp[dectmp_size] += *s - '0';
1028                     else if (*s >= 'A' && *s <= 'F')
1029                       dectmp[dectmp_size] += (*s - 'A') + 10;
1030                     else if (*s >= 'a' && *s <= 'f')
1031                       dectmp[dectmp_size] += (*s - 'a') + 10;
1032                 }
1033                 dectmp_size++;
1034             }
1035 
1036             /*
1037              * If there are any codes in the temporary decomposition array,
1038              * then add the character with its decomposition.
1039              */
1040             if (dectmp_size > 0) {
1041 		if (!compat) {
1042 		    add_decomp(code, 0);
1043 		}
1044 		add_decomp(code, 1);
1045 	    }
1046         }
1047 
1048         /*
1049          * Skip to the number field.
1050          */
1051         for (i = 0; i < 3 && *s; s++) {
1052             if (*s == ';')
1053               i++;
1054         }
1055 
1056         /*
1057          * Scan the number in.
1058          */
1059         number[0] = number[1] = 0;
1060         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
1061             if (*e == '-') {
1062                 neg = 1;
1063                 continue;
1064             }
1065 
1066             if (*e == '/') {
1067                 /*
1068                  * Move the the denominator of the fraction.
1069                  */
1070                 if (neg)
1071                   number[wnum] *= -1;
1072                 neg = 0;
1073                 e++;
1074                 wnum++;
1075             }
1076             number[wnum] = (number[wnum] * 10) + (*e - '0');
1077         }
1078 
1079         if (e > s) {
1080             /*
1081              * Adjust the denominator in case of integers and add the number.
1082              */
1083             if (wnum == 0)
1084               number[1] = 1;
1085 
1086             add_number(code, number[0], number[1]);
1087         }
1088 
1089         /*
1090          * Skip to the start of the possible case mappings.
1091          */
1092         for (s = e, i = 0; i < 4 && *s; s++) {
1093             if (*s == ';')
1094               i++;
1095         }
1096 
1097         /*
1098          * Collect the case mappings.
1099          */
1100         cases[0] = cases[1] = cases[2] = 0;
1101         for (i = 0; i < 3; i++) {
1102             while (ishdigit(*s)) {
1103                 cases[i] <<= 4;
1104                 if (*s >= '0' && *s <= '9')
1105                   cases[i] += *s - '0';
1106                 else if (*s >= 'A' && *s <= 'F')
1107                   cases[i] += (*s - 'A') + 10;
1108                 else if (*s >= 'a' && *s <= 'f')
1109                   cases[i] += (*s - 'a') + 10;
1110                 s++;
1111             }
1112             if (*s == ';')
1113               s++;
1114         }
1115         if (cases[0] && cases[1])
1116           /*
1117            * Add the upper and lower mappings for a title case character.
1118            */
1119           add_title(code);
1120         else if (cases[1])
1121           /*
1122            * Add the lower and title case mappings for the upper case
1123            * character.
1124            */
1125           add_upper(code);
1126         else if (cases[0])
1127           /*
1128            * Add the upper and title case mappings for the lower case
1129            * character.
1130            */
1131           add_lower(code);
1132     }
1133 }
1134 
1135 #if 0
1136 
1137 static _decomp_t *
1138 find_decomp(ac_uint4 code, short compat)
1139 {
1140     long l, r, m;
1141     _decomp_t *decs;
1142 
1143     l = 0;
1144     r = (compat ? kdecomps_used : decomps_used) - 1;
1145     decs = compat ? kdecomps : decomps;
1146     while (l <= r) {
1147         m = (l + r) >> 1;
1148         if (code > decs[m].code)
1149           l = m + 1;
1150         else if (code < decs[m].code)
1151           r = m - 1;
1152         else
1153           return &decs[m];
1154     }
1155     return 0;
1156 }
1157 
1158 static void
1159 decomp_it(_decomp_t *d, short compat)
1160 {
1161     ac_uint4 i;
1162     _decomp_t *dp;
1163 
1164     for (i = 0; i < d->used; i++) {
1165         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
1166           decomp_it(dp, compat);
1167         else
1168           dectmp[dectmp_size++] = d->decomp[i];
1169     }
1170 }
1171 
1172 
1173 /*
1174  * Expand all decompositions by recursively decomposing each character
1175  * in the decomposition.
1176  */
1177 static void
1178 expand_decomp(void)
1179 {
1180     ac_uint4 i;
1181 
1182     for (i = 0; i < decomps_used; i++) {
1183         dectmp_size = 0;
1184         decomp_it(&decomps[i], 0);
1185         if (dectmp_size > 0)
1186           add_decomp(decomps[i].code, 0);
1187     }
1188 
1189     for (i = 0; i < kdecomps_used; i++) {
1190         dectmp_size = 0;
1191         decomp_it(&kdecomps[i], 1);
1192         if (dectmp_size > 0)
1193           add_decomp(kdecomps[i].code, 1);
1194     }
1195 }
1196 
1197 static int
1198 cmpcomps(const void *v_comp1, const void *v_comp2)
1199 {
1200 	const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
1201     long diff = comp1->code1 - comp2->code1;
1202 
1203     if (!diff)
1204 	diff = comp1->code2 - comp2->code2;
1205     return (int) diff;
1206 }
1207 
1208 #endif
1209 
1210 /*
1211  * Load composition exclusion data
1212  */
1213 static void
read_compexdata(FILE * in)1214 read_compexdata(FILE *in)
1215 {
1216     ac_uint2 i;
1217     ac_uint4 code;
1218     char line[512], *s;
1219 
1220     (void) memset((char *) compexs, 0, sizeof(compexs));
1221 
1222     while (fgets(line, sizeof(line), in)) {
1223 	if( (s=strchr(line, '\n')) ) *s = '\0';
1224         /*
1225          * Skip blank lines and lines that start with a '#'.
1226          */
1227         if (line[0] == 0 || line[0] == '#')
1228 	    continue;
1229 
1230 	/*
1231          * Collect the code.  Assume max 6 digits
1232          */
1233 
1234 	for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
1235 	    if (isspace((unsigned char)*s)) break;
1236             code <<= 4;
1237             if (*s >= '0' && *s <= '9')
1238 		code += *s - '0';
1239             else if (*s >= 'A' && *s <= 'F')
1240 		code += (*s - 'A') + 10;
1241             else if (*s >= 'a' && *s <= 'f')
1242 		code += (*s - 'a') + 10;
1243         }
1244         COMPEX_SET(code);
1245     }
1246 }
1247 
1248 #if 0
1249 
1250 /*
1251  * Creates array of compositions from decomposition array
1252  */
1253 static void
1254 create_comps(void)
1255 {
1256     ac_uint4 i, cu;
1257 
1258     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1259 
1260     for (i = cu = 0; i < decomps_used; i++) {
1261 	if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1262 	    continue;
1263 	comps[cu].comp = decomps[i].code;
1264 	comps[cu].count = 2;
1265 	comps[cu].code1 = decomps[i].decomp[0];
1266 	comps[cu].code2 = decomps[i].decomp[1];
1267 	cu++;
1268     }
1269     comps_used = cu;
1270     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
1271 }
1272 
1273 #endif
1274 
#if HARDCODE_DATA
/*
 * Emit `num' case-mapping entries from `tab' to `out' as initializer
 * text, one "key, other1, other2" triple per line.
 *
 * A comma is written before every entry except the very first one of the
 * whole list; pass first != 0 when nothing has been written to the list
 * yet, and 0 when this table continues a list already started.
 */
static void
write_case(FILE *out, _case_t *tab, int num, int first)
{
    int idx = 0;

    while (idx < num) {
        const _case_t *row = &tab[idx];

        if (!first)
            fprintf(out, ",");
        first = 0;

        fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
                (unsigned long) row->key, (unsigned long) row->other1,
                (unsigned long) row->other2);
        idx++;
    }
}

/* Storage-class prefix put in front of every generated table. */
#define PREF "static const "

#endif
1293 
1294 static void
write_cdata(char * opath)1295 write_cdata(char *opath)
1296 {
1297     FILE *out;
1298 	ac_uint4 bytes;
1299     ac_uint4 i, idx, nprops;
1300 #if !(HARDCODE_DATA)
1301     ac_uint2 casecnt[2];
1302 #endif
1303     char path[BUFSIZ];
1304 #if HARDCODE_DATA
1305     int j, k;
1306 
1307     /*****************************************************************
1308      *
1309      * Generate the ctype data.
1310      *
1311      *****************************************************************/
1312 
1313     /*
1314      * Open the output file.
1315      */
1316     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
1317     if ((out = fopen(path, "w")) == 0)
1318       return;
1319 #else
1320     /*
1321      * Open the ctype.dat file.
1322      */
1323     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
1324     if ((out = fopen(path, "wb")) == 0)
1325       return;
1326 #endif
1327 
1328     /*
1329      * Collect the offsets for the properties.  The offsets array is
1330      * on a 4-byte boundary to keep things efficient for architectures
1331      * that need such a thing.
1332      */
1333     for (i = idx = 0; i < NUMPROPS; i++) {
1334         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1335         idx += proptbl[i].used;
1336     }
1337 
1338     /*
1339      * Add the sentinel index which is used by the binary search as the upper
1340      * bound for a search.
1341      */
1342     propcnt[i] = idx;
1343 
1344     /*
1345      * Record the actual number of property lists.  This may be different than
1346      * the number of offsets actually written because of aligning on a 4-byte
1347      * boundary.
1348      */
1349     hdr[1] = NUMPROPS;
1350 
1351     /*
1352      * Calculate the byte count needed and pad the property counts array to a
1353      * 4-byte boundary.
1354      */
1355     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
1356       bytes += 4 - (bytes & 3);
1357     nprops = bytes / sizeof(ac_uint2);
1358     bytes += sizeof(ac_uint4) * idx;
1359 
1360 #if HARDCODE_DATA
1361     fprintf(out,
1362         "/* This file was generated from a modified version UCData's ucgendat.\n"
1363         " *\n"
1364         " *                     DO NOT EDIT THIS FILE!\n"
1365         " * \n"
1366         " * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download\n"
1367         " * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt\n"
1368         " * files from  http://www.unicode.org/Public/ and run this program.\n"
1369         " *\n"
1370         " * More information can be found in the UCData package. Unfortunately,\n"
1371         " * the project's page doesn't seem to be live anymore, so you can use\n"
1372         " * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */\n\n");
1373 
1374     fprintf(out, PREF "unsigned short _ucprop_size = %d;\n\n", NUMPROPS);
1375 
1376     fprintf(out, PREF "unsigned short  _ucprop_offsets[] = {");
1377 
1378     for (i = 0; i<nprops; i++) {
1379        if (i) fprintf(out, ",");
1380        if (!(i&7)) fprintf(out, "\n\t");
1381        else fprintf(out, " ");
1382        fprintf(out, "0x%04x", propcnt[i]);
1383     }
1384     fprintf(out, "\n};\n\n");
1385 
1386     fprintf(out, PREF "unsigned int _ucprop_ranges[] = {");
1387 
1388     k = 0;
1389     for (i = 0; i < NUMPROPS; i++) {
1390 	if (proptbl[i].used > 0) {
1391 	  for (j=0; j<proptbl[i].used; j++) {
1392 	    if (k) fprintf(out, ",");
1393 	    if (!(k&3)) fprintf(out,"\n\t");
1394 	    else fprintf(out, " ");
1395 	    k++;
1396 	    fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
1397 	  }
1398 	}
1399     }
1400     fprintf(out, "\n};\n\n");
1401 #else
1402     /*
1403      * Write the header.
1404      */
1405     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1406 
1407     /*
1408      * Write the byte count.
1409      */
1410     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1411 
1412     /*
1413      * Write the property list counts.
1414      */
1415     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
1416 
1417     /*
1418      * Write the property lists.
1419      */
1420     for (i = 0; i < NUMPROPS; i++) {
1421         if (proptbl[i].used > 0)
1422           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
1423                  proptbl[i].used, out);
1424     }
1425 
1426     fclose(out);
1427 #endif
1428 
1429     /*****************************************************************
1430      *
1431      * Generate the case mapping data.
1432      *
1433      *****************************************************************/
1434 
1435 #if HARDCODE_DATA
1436     fprintf(out, PREF "unsigned int _uccase_size = %ld;\n\n",
1437         (long) (upper_used + lower_used + title_used));
1438 
1439     fprintf(out,
1440         "/* Starting indexes of the case tables\n"
1441         " * UpperIndex = 0\n"
1442         " * LowerIndex = _uccase_len[0]\n"
1443         " * TitleIndex = LowerIndex + _uccase_len[1] */\n\n");
1444     fprintf(out, PREF "unsigned short _uccase_len[2] = {%ld, %ld};\n\n",
1445         (long) upper_used * 3, (long) lower_used * 3);
1446     fprintf(out, PREF "unsigned int _uccase_map[] = {");
1447 
1448     if (upper_used > 0)
1449       /*
1450        * Write the upper case table.
1451        */
1452       write_case(out, upper, upper_used, 1);
1453 
1454     if (lower_used > 0)
1455       /*
1456        * Write the lower case table.
1457        */
1458       write_case(out, lower, lower_used, !upper_used);
1459 
1460     if (title_used > 0)
1461       /*
1462        * Write the title case table.
1463        */
1464       write_case(out, title, title_used, !(upper_used||lower_used));
1465 
1466     if (!(upper_used || lower_used || title_used))
1467 	fprintf(out, "\t0");
1468 
1469     fprintf(out, "\n};\n\n");
1470 #else
1471     /*
1472      * Open the case.dat file.
1473      */
1474     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
1475     if ((out = fopen(path, "wb")) == 0)
1476       return;
1477 
1478     /*
1479      * Write the case mapping tables.
1480      */
1481     hdr[1] = upper_used + lower_used + title_used;
1482     casecnt[0] = upper_used;
1483     casecnt[1] = lower_used;
1484 
1485     /*
1486      * Write the header.
1487      */
1488     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1489 
1490     /*
1491      * Write the upper and lower case table sizes.
1492      */
1493     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
1494 
1495     if (upper_used > 0)
1496       /*
1497        * Write the upper case table.
1498        */
1499       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1500 
1501     if (lower_used > 0)
1502       /*
1503        * Write the lower case table.
1504        */
1505       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1506 
1507     if (title_used > 0)
1508       /*
1509        * Write the title case table.
1510        */
1511       fwrite((char *) title, sizeof(_case_t), title_used, out);
1512 
1513     fclose(out);
1514 #endif
1515 
1516 #if 0
1517 
1518     /*****************************************************************
1519      *
1520      * Generate the composition data.
1521      *
1522      *****************************************************************/
1523 
1524     /*
1525      * Create compositions from decomposition data
1526      */
1527     create_comps();
1528 
1529 #if HARDCODE_DATA
1530     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
1531         comps_used * 4L);
1532 
1533     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
1534 
1535      /*
1536       * Now, if comps exist, write them out.
1537       */
1538     if (comps_used > 0) {
1539 	for (i=0; i<comps_used; i++) {
1540 	    if (i) fprintf(out, ",");
1541 	    fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
1542 	        (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
1543 	        (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
1544 	}
1545     } else {
1546 	fprintf(out, "\t0");
1547     }
1548     fprintf(out, "\n};\n\n");
1549 #else
1550     /*
1551      * Open the comp.dat file.
1552      */
1553     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
1554     if ((out = fopen(path, "wb")) == 0)
1555 	return;
1556 
1557     /*
1558      * Write the header.
1559      */
1560     hdr[1] = (ac_uint2) comps_used * 4;
1561     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1562 
1563     /*
1564      * Write out the byte count to maintain header size.
1565      */
1566     bytes = comps_used * sizeof(_comp_t);
1567     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1568 
1569     /*
1570      * Now, if comps exist, write them out.
1571      */
1572     if (comps_used > 0)
1573         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1574 
1575     fclose(out);
1576 #endif
1577 
1578     /*****************************************************************
1579      *
1580      * Generate the decomposition data.
1581      *
1582      *****************************************************************/
1583 
1584     /*
1585      * Fully expand all decompositions before generating the output file.
1586      */
1587     expand_decomp();
1588 
1589 #if HARDCODE_DATA
1590     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
1591         decomps_used * 2L);
1592 
1593     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
1594 
1595     if (decomps_used) {
1596 	/*
1597 	 * Write the list of decomp nodes.
1598 	 */
1599 	for (i = idx = 0; i < decomps_used; i++) {
1600 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1601 	        (unsigned long) decomps[i].code, (unsigned long) idx);
1602 	    idx += decomps[i].used;
1603 	}
1604 
1605 	/*
1606 	 * Write the sentinel index as the last decomp node.
1607 	 */
1608 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1609 
1610 	fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
1611 	/*
1612 	 * Write the decompositions themselves.
1613 	 */
1614 	k = 0;
1615 	for (i = 0; i < decomps_used; i++)
1616 	  for (j=0; j<decomps[i].used; j++) {
1617 	    if (k) fprintf(out, ",");
1618 	    if (!(k&3)) fprintf(out,"\n\t");
1619 	    else fprintf(out, " ");
1620 	    k++;
1621 	    fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
1622 	  }
1623 	fprintf(out, "\n};\n\n");
1624     }
1625 #else
1626     /*
1627      * Open the decomp.dat file.
1628      */
1629     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
1630     if ((out = fopen(path, "wb")) == 0)
1631       return;
1632 
1633     hdr[1] = decomps_used;
1634 
1635     /*
1636      * Write the header.
1637      */
1638     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1639 
1640     /*
1641      * Write a temporary byte count which will be calculated as the
1642      * decompositions are written out.
1643      */
1644     bytes = 0;
1645     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1646 
1647     if (decomps_used) {
1648         /*
1649          * Write the list of decomp nodes.
1650          */
1651         for (i = idx = 0; i < decomps_used; i++) {
1652             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
1653             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1654             idx += decomps[i].used;
1655         }
1656 
1657         /*
1658          * Write the sentinel index as the last decomp node.
1659          */
1660         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1661 
1662         /*
1663          * Write the decompositions themselves.
1664          */
1665         for (i = 0; i < decomps_used; i++)
1666           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
1667                  decomps[i].used, out);
1668 
1669         /*
1670          * Seek back to the beginning and write the byte count.
1671          */
1672         bytes = (sizeof(ac_uint4) * idx) +
1673             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1674         fseek(out, sizeof(ac_uint2) << 1, 0L);
1675         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1676 
1677         fclose(out);
1678     }
1679 #endif
1680 
1681 #ifdef HARDCODE_DATA
1682     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
1683         kdecomps_used * 2L);
1684 
1685     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
1686 
1687     if (kdecomps_used) {
1688 	/*
1689 	 * Write the list of kdecomp nodes.
1690 	 */
1691 	for (i = idx = 0; i < kdecomps_used; i++) {
1692 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1693 	        (unsigned long) kdecomps[i].code, (unsigned long) idx);
1694 	    idx += kdecomps[i].used;
1695 	}
1696 
1697 	/*
1698 	 * Write the sentinel index as the last decomp node.
1699 	 */
1700 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1701 
1702 	fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
1703 
1704 	/*
1705 	 * Write the decompositions themselves.
1706 	 */
1707 	k = 0;
1708 	for (i = 0; i < kdecomps_used; i++)
1709 	  for (j=0; j<kdecomps[i].used; j++) {
1710 	    if (k) fprintf(out, ",");
1711 	    if (!(k&3)) fprintf(out,"\n\t");
1712 	    else fprintf(out, " ");
1713 	    k++;
1714 	    fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
1715 	  }
1716 	fprintf(out, "\n};\n\n");
1717     }
1718 #else
1719     /*
1720      * Open the kdecomp.dat file.
1721      */
1722     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
1723     if ((out = fopen(path, "wb")) == 0)
1724       return;
1725 
1726     hdr[1] = kdecomps_used;
1727 
1728     /*
1729      * Write the header.
1730      */
1731     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1732 
1733     /*
1734      * Write a temporary byte count which will be calculated as the
1735      * decompositions are written out.
1736      */
1737     bytes = 0;
1738     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1739 
1740     if (kdecomps_used) {
1741         /*
1742          * Write the list of kdecomp nodes.
1743          */
1744         for (i = idx = 0; i < kdecomps_used; i++) {
1745             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
1746             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1747             idx += kdecomps[i].used;
1748         }
1749 
1750         /*
1751          * Write the sentinel index as the last decomp node.
1752          */
1753         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1754 
1755         /*
1756          * Write the decompositions themselves.
1757          */
1758         for (i = 0; i < kdecomps_used; i++)
1759           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
1760                  kdecomps[i].used, out);
1761 
1762         /*
1763          * Seek back to the beginning and write the byte count.
1764          */
1765         bytes = (sizeof(ac_uint4) * idx) +
1766             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1767         fseek(out, sizeof(ac_uint2) << 1, 0L);
1768         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1769 
1770         fclose(out);
1771     }
1772 #endif
1773 
1774     /*****************************************************************
1775      *
1776      * Generate the combining class data.
1777      *
1778      *****************************************************************/
1779 #ifdef HARDCODE_DATA
1780     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
1781 
1782     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
1783 
1784     if (ccl_used > 0) {
1785 	/*
1786 	 * Write the combining class ranges out.
1787 	 */
1788 	for (i = 0; i<ccl_used; i++) {
1789 	    if (i) fprintf(out, ",");
1790 	    if (!(i&3)) fprintf(out, "\n\t");
1791 	    else fprintf(out, " ");
1792 	    fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
1793 	}
1794     } else {
1795 	fprintf(out, "\t0");
1796     }
1797     fprintf(out, "\n};\n\n");
1798 #else
1799     /*
1800      * Open the cmbcl.dat file.
1801      */
1802     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
1803     if ((out = fopen(path, "wb")) == 0)
1804       return;
1805 
1806     /*
1807      * Set the number of ranges used.  Each range has a combining class which
1808      * means each entry is a 3-tuple.
1809      */
1810     hdr[1] = ccl_used / 3;
1811 
1812     /*
1813      * Write the header.
1814      */
1815     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1816 
1817     /*
1818      * Write out the byte count to maintain header size.
1819      */
1820     bytes = ccl_used * sizeof(ac_uint4);
1821     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1822 
1823     if (ccl_used > 0)
1824       /*
1825        * Write the combining class ranges out.
1826        */
1827       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
1828 
1829     fclose(out);
1830 #endif
1831 
1832     /*****************************************************************
1833      *
1834      * Generate the number data.
1835      *
1836      *****************************************************************/
1837 
1838 #if HARDCODE_DATA
1839     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
1840         (unsigned long)ncodes_used<<1);
1841 
1842     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
1843 
1844     /*
1845      * Now, if number mappings exist, write them out.
1846      */
1847     if (ncodes_used > 0) {
1848 	for (i = 0; i<ncodes_used; i++) {
1849 	    if (i) fprintf(out, ",");
1850 	    if (!(i&1)) fprintf(out, "\n\t");
1851 	    else fprintf(out, " ");
1852 	    fprintf(out, "0x%08lx, 0x%08lx",
1853 	        (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
1854 	}
1855 	fprintf(out, "\n};\n\n");
1856 
1857 	fprintf(out, PREF "short _ucnum_vals[] = {");
1858 	for (i = 0; i<nums_used; i++) {
1859 	    if (i) fprintf(out, ",");
1860 	    if (!(i&3)) fprintf(out, "\n\t");
1861 	    else fprintf(out, " ");
1862 	    if (nums[i].numerator < 0) {
1863 		fprintf(out, "%6d, 0x%04x",
1864 		  nums[i].numerator, nums[i].denominator);
1865 	    } else {
1866 		fprintf(out, "0x%04x, 0x%04x",
1867 		  nums[i].numerator, nums[i].denominator);
1868 	    }
1869 	}
1870 	fprintf(out, "\n};\n\n");
1871     }
1872 #else
1873     /*
1874      * Open the num.dat file.
1875      */
1876     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
1877     if ((out = fopen(path, "wb")) == 0)
1878       return;
1879 
1880     /*
1881      * The count part of the header will be the total number of codes that
1882      * have numbers.
1883      */
1884     hdr[1] = (ac_uint2) (ncodes_used << 1);
1885     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1886 
1887     /*
1888      * Write the header.
1889      */
1890     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1891 
1892     /*
1893      * Write out the byte count to maintain header size.
1894      */
1895     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1896 
1897     /*
1898      * Now, if number mappings exist, write them out.
1899      */
1900     if (ncodes_used > 0) {
1901         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1902         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1903     }
1904 #endif
1905 
1906 #endif
1907 
1908     fclose(out);
1909 }
1910 
/*
 * Print the command line synopsis for `prog' on stderr and terminate the
 * program with exit status 1.  This function does not return.
 */
static void
usage(char *prog)
{
    FILE *err = stderr;

    /* Synopsis line. */
    fprintf(err,
            "Usage: %s [-o output-directory|-x composition-exclusions]", prog);
    fputs(" datafile1 datafile2 ...\n\n", err);

    /* Description of -o. */
    fputs("-o output-directory\n\t\tWrite the output files to a different",
          err);
    fputs(" directory (default: .).\n", err);

    /* Description of -x. */
    fputs("-x composition-exclusion\n\t\tFile of composition codes", err);
    fputs(" that should be excluded.\n", err);

    exit(1);
}
1925 
/*
 * Program entry point.
 *
 * Arguments are processed from left to right:
 *   -o DIR    write the output below DIR (default: ".")
 *   -x FILE   load FILE with read_compexdata()
 *   other     any argument not starting with '-' is opened and loaded
 *             with read_cdata()
 *
 * An unknown option, or -o / -x without its operand, prints the usage
 * text and exits with status 1.  A file that cannot be opened is reported
 * on stderr and skipped.  After all arguments have been handled the
 * tables are written with write_cdata().
 *
 * Returns 0.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    /*
     * The program name is argv[0].  Fall back to a fixed name when the
     * host environment provides none, so that diagnostics never print a
     * null string.
     */
    prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "ucgendat";

    opath = 0;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /* The option needs an operand after it. */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /* The option needs an operand after it. */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                }
                break;
              default:
                /* usage() terminates the program. */
                usage(prog);
            }
        } else {
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}
1985 }
1986