xref: /PHP-7.2/ext/mbstring/ucgendat/ucgendat.c (revision 24cfbfd5)
1 /* Further modified for PHP */
2 /* $Id$ */
3 
4 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.36.2.4 2007/01/02 21:43:51 kurt Exp $ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2007 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 
18 /* Copyright 2001 Computing Research Labs, New Mexico State University
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining a
21  * copy of this software and associated documentation files (the "Software"),
22  * to deal in the Software without restriction, including without limitation
23  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24  * and/or sell copies of the Software, and to permit persons to whom the
25  * Software is furnished to do so, subject to the following conditions:
26  *
27  * The above copyright notice and this permission notice shall be included in
28  * all copies or substantial portions of the Software.
29  *
30  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
33  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37  */
38 /* orig Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
39 
40 #include <stdio.h>
41 #include <ctype.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45 
/*
 * Shorthand names for the unsigned integer types used throughout this file,
 * plus small shims for the directory separator and memory copy routine.
 */
#define ac_uint2 unsigned short
#define ac_uint4 unsigned int
#define LDAP_DIRSEP "/"
#define AC_MEMCPY memcpy

/*
 * HARDCODE_DATA defaults to 1 unless the build has already defined it.
 */
#ifndef HARDCODE_DATA
#define HARDCODE_DATA 1
#endif

/*
 * Nonzero when cc is one of the characters 0-9, A-F or a-f.  The argument
 * is evaluated several times, so it must be free of side effects.
 */
#undef ishdigit
#define ishdigit(cc) \
    (((cc) >= '0' && (cc) <= '9') || \
     ((cc) >= 'A' && (cc) <= 'F') || \
     ((cc) >= 'a' && (cc) <= 'f'))

/*
 * Header for the output file: a byte-order-mark followed by the number of
 * property nodes (filled in later; zero here).
 */
static ac_uint2 hdr[2] = { 0xfeff, 0 };
65 
/*
 * NUMPROPS is the number of entries in props[].  NEEDPROPS is NUMPROPS
 * advanced to the next multiple of four (a full four is added when NUMPROPS
 * is already a multiple of four).
 */
#define NUMPROPS 50
#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))

/*
 * A property name together with the number of characters in it.
 */
typedef struct {
    char *name;
    int len;
} _prop_t;

/*
 * Properties expected to be found in the Unicode Character Database, plus a
 * few implementation specific ones.  The position of a name in this table is
 * the index of its range list in the property table, so the order matters.
 *
 * The implementation specific properties are:
 * Cm = Composed (can be decomposed)
 * Nb = Non-breaking
 * Sy = Symmetric (has left and right forms)
 * Hd = Hex digit
 * Qm = Quote marks
 * Mr = Mirroring
 * Ss = Space, other
 * Cp = Defined character
 */
static _prop_t props[NUMPROPS] = {
    {"Mn", 2}, {"Mc", 2}, {"Me", 2},
    {"Nd", 2}, {"Nl", 2}, {"No", 2},
    {"Zs", 2}, {"Zl", 2}, {"Zp", 2},
    {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, {"Co", 2}, {"Cn", 2},
    {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, {"Lo", 2},
    {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
    {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2},
    {"L",  1}, {"R",  1}, {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2},
    {"CS", 2}, {"B",  1}, {"S",  1}, {"WS", 2}, {"ON", 2},
    {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2},
    {"Qm", 2}, {"Mr", 2}, {"Ss", 2}, {"Cp", 2},
    {"Pi", 2}, {"Pf", 2}, {"AL", 2}
};
99 
100 typedef struct {
101     ac_uint4 *ranges;
102     ac_uint2 used;
103     ac_uint2 size;
104 } _ranges_t;
105 
106 static _ranges_t proptbl[NUMPROPS];
107 
108 /*
109  * Make sure this array is sized to be on a 4-byte boundary at compile time.
110  */
111 static ac_uint2 propcnt[NEEDPROPS];
112 
113 /*
114  * Array used to collect a decomposition before adding it to the decomposition
115  * table.
116  */
117 static ac_uint4 dectmp[64];
118 static ac_uint4 dectmp_size;
119 
120 typedef struct {
121     ac_uint4 code;
122     ac_uint2 size;
123     ac_uint2 used;
124     ac_uint4 *decomp;
125 } _decomp_t;
126 
127 /*
128  * List of decomposition.  Created and expanded in order as the characters are
129  * encountered. First list contains canonical mappings, second also includes
130  * compatibility mappings.
131  */
132 static _decomp_t *decomps;
133 static ac_uint4 decomps_used;
134 static ac_uint4 decomps_size;
135 
136 static _decomp_t *kdecomps;
137 static ac_uint4 kdecomps_used;
138 static ac_uint4 kdecomps_size;
139 
140 /*
141  * Composition exclusion table stuff.
142  */
143 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
144 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
145 static ac_uint4 compexs[8192];
146 
147 /*
148  * Struct for holding a composition pair, and array of composition pairs
149  */
150 typedef struct {
151     ac_uint4 comp;
152     ac_uint4 count;
153     ac_uint4 code1;
154     ac_uint4 code2;
155 } _comp_t;
156 
157 #if 0
158 static _comp_t *comps;
159 #endif
160 static ac_uint4 comps_used;
161 
162 /*
163  * Types and lists for handling lists of case mappings.
164  */
165 typedef struct {
166     ac_uint4 key;
167     ac_uint4 other1;
168     ac_uint4 other2;
169 } _case_t;
170 
171 static _case_t *upper;
172 static _case_t *lower;
173 static _case_t *title;
174 static ac_uint4 upper_used;
175 static ac_uint4 upper_size;
176 static ac_uint4 lower_used;
177 static ac_uint4 lower_size;
178 static ac_uint4 title_used;
179 static ac_uint4 title_size;
180 
181 /*
182  * Array used to collect case mappings before adding them to a list.
183  */
184 static ac_uint4 cases[3];
185 
186 /*
187  * An array to hold ranges for combining classes.
188  */
189 static ac_uint4 *ccl;
190 static ac_uint4 ccl_used;
191 static ac_uint4 ccl_size;
192 
193 /*
194  * Structures for handling numbers.
195  */
196 typedef struct {
197     ac_uint4 code;
198     ac_uint4 idx;
199 } _codeidx_t;
200 
201 typedef struct {
202     short numerator;
203     short denominator;
204 } _num_t;
205 
206 /*
207  * Arrays to hold the mapping of codes to numbers.
208  */
209 static _codeidx_t *ncodes;
210 static ac_uint4 ncodes_used;
211 static ac_uint4 ncodes_size;
212 
213 static _num_t *nums;
214 static ac_uint4 nums_used;
215 static ac_uint4 nums_size;
216 
217 /*
218  * Array for holding numbers.
219  */
220 static _num_t *nums;
221 static ac_uint4 nums_used;
222 static ac_uint4 nums_size;
223 
224 static void
add_range(ac_uint4 start,ac_uint4 end,char * p1,char * p2)225 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
226 {
227     int i, j, k, len;
228     _ranges_t *rlp;
229     char *name;
230 
231     for (k = 0; k < 2; k++) {
232         if (k == 0) {
233             name = p1;
234             len = 2;
235         } else {
236             if (p2 == 0)
237               break;
238 
239             name = p2;
240             len = 1;
241         }
242 
243         for (i = 0; i < NUMPROPS; i++) {
244             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
245               break;
246         }
247 
248         if (i == NUMPROPS)
249           continue;
250 
251         rlp = &proptbl[i];
252 
253         /*
254          * Resize the range list if necessary.
255          */
256         if (rlp->used == rlp->size) {
257             if (rlp->size == 0)
258               rlp->ranges = (ac_uint4 *)
259                   malloc(sizeof(ac_uint4) << 3);
260             else
261               rlp->ranges = (ac_uint4 *)
262                   realloc((char *) rlp->ranges,
263                           sizeof(ac_uint4) * (rlp->size + 8));
264             rlp->size += 8;
265         }
266 
267         /*
268          * If this is the first code for this property list, just add it
269          * and return.
270          */
271         if (rlp->used == 0) {
272             rlp->ranges[0] = start;
273             rlp->ranges[1] = end;
274             rlp->used += 2;
275             continue;
276         }
277 
278         /*
279          * Optimize the case of adding the range to the end.
280          */
281         j = rlp->used - 1;
282         if (start > rlp->ranges[j]) {
283             j = rlp->used;
284             rlp->ranges[j++] = start;
285             rlp->ranges[j++] = end;
286             rlp->used = j;
287             continue;
288         }
289 
290         /*
291          * Need to locate the insertion point.
292          */
293         for (i = 0;
294              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
295 
296         /*
297          * If the start value lies in the current range, then simply set the
298          * new end point of the range to the end value passed as a parameter.
299          */
300         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
301             rlp->ranges[i + 1] = end;
302             return;
303         }
304 
305         /*
306          * Shift following values up by two.
307          */
308         for (j = rlp->used; j > i; j -= 2) {
309             rlp->ranges[j] = rlp->ranges[j - 2];
310             rlp->ranges[j + 1] = rlp->ranges[j - 1];
311         }
312 
313         /*
314          * Add the new range at the insertion point.
315          */
316         rlp->ranges[i] = start;
317         rlp->ranges[i + 1] = end;
318         rlp->used += 2;
319     }
320 }
321 
322 static void
ordered_range_insert(ac_uint4 c,char * name,int len)323 ordered_range_insert(ac_uint4 c, char *name, int len)
324 {
325     int i, j;
326     ac_uint4 s, e;
327     _ranges_t *rlp;
328 
329     if (len == 0)
330       return;
331 
332     /*
333      * Deal with directionality codes introduced in Unicode 3.0.
334      */
335     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
336         (len == 3 &&
337          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
338           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
339           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0 ||
340           memcmp(name, "LRI", 3) == 0 || memcmp(name, "RLI", 3) == 0 ||
341           memcmp(name, "FSI", 3) == 0 || memcmp(name, "PDI", 3) == 0))) {
342         /*
343          * Mark all of these as Other Neutral to preserve compatibility with
344          * older versions.
345          */
346         len = 2;
347         name = "ON";
348     }
349 
350     for (i = 0; i < NUMPROPS; i++) {
351         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
352           break;
353     }
354 
355     if (i == NUMPROPS) {
356         printf("Unknown property %s\n", name);
357         return;
358     }
359 
360     /*
361      * Have a match, so insert the code in order.
362      */
363     rlp = &proptbl[i];
364 
365     /*
366      * Resize the range list if necessary.
367      */
368     if (rlp->used == rlp->size) {
369         if (rlp->size == 0)
370           rlp->ranges = (ac_uint4 *)
371               malloc(sizeof(ac_uint4) << 3);
372         else
373           rlp->ranges = (ac_uint4 *)
374               realloc((char *) rlp->ranges,
375                       sizeof(ac_uint4) * (rlp->size + 8));
376         rlp->size += 8;
377     }
378 
379     /*
380      * If this is the first code for this property list, just add it
381      * and return.
382      */
383     if (rlp->used == 0) {
384         rlp->ranges[0] = rlp->ranges[1] = c;
385         rlp->used += 2;
386         return;
387     }
388 
389     /*
390      * Optimize the cases of extending the last range and adding new ranges to
391      * the end.
392      */
393     j = rlp->used - 1;
394     e = rlp->ranges[j];
395     s = rlp->ranges[j - 1];
396 
397     if (c == e + 1) {
398         /*
399          * Extend the last range.
400          */
401         rlp->ranges[j] = c;
402         return;
403     }
404 
405     if (c > e + 1) {
406         /*
407          * Start another range on the end.
408          */
409         j = rlp->used;
410         rlp->ranges[j] = rlp->ranges[j + 1] = c;
411         rlp->used += 2;
412         return;
413     }
414 
415     if (c >= s)
416       /*
417        * The code is a duplicate of a code in the last range, so just return.
418        */
419       return;
420 
421     /*
422      * The code should be inserted somewhere before the last range in the
423      * list.  Locate the insertion point.
424      */
425     for (i = 0;
426          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
427 
428     s = rlp->ranges[i];
429     e = rlp->ranges[i + 1];
430 
431     if (c == e + 1)
432       /*
433        * Simply extend the current range.
434        */
435       rlp->ranges[i + 1] = c;
436     else if (c < s) {
437         /*
438          * Add a new entry before the current location.  Shift all entries
439          * before the current one up by one to make room.
440          */
441         for (j = rlp->used; j > i; j -= 2) {
442             rlp->ranges[j] = rlp->ranges[j - 2];
443             rlp->ranges[j + 1] = rlp->ranges[j - 1];
444         }
445         rlp->ranges[i] = rlp->ranges[i + 1] = c;
446 
447         rlp->used += 2;
448     }
449 }
450 
451 static void
add_decomp(ac_uint4 code,short compat)452 add_decomp(ac_uint4 code, short compat)
453 {
454     ac_uint4 i, j, size;
455     _decomp_t **pdecomps;
456     ac_uint4 *pdecomps_used;
457     ac_uint4 *pdecomps_size;
458 
459     if (compat) {
460 	pdecomps = &kdecomps;
461 	pdecomps_used = &kdecomps_used;
462 	pdecomps_size = &kdecomps_size;
463     } else {
464 	pdecomps = &decomps;
465 	pdecomps_used = &decomps_used;
466 	pdecomps_size = &decomps_size;
467     }
468 
469     /*
470      * Add the code to the composite property.
471      */
472     if (!compat) {
473 	ordered_range_insert(code, "Cm", 2);
474     }
475 
476     /*
477      * Locate the insertion point for the code.
478      */
479     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
480 
481     /*
482      * Allocate space for a new decomposition.
483      */
484     if (*pdecomps_used == *pdecomps_size) {
485         if (*pdecomps_size == 0)
486           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
487         else
488           *pdecomps = (_decomp_t *)
489               realloc((char *) *pdecomps,
490                       sizeof(_decomp_t) * (*pdecomps_size + 8));
491         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
492                       sizeof(_decomp_t) << 3);
493         *pdecomps_size += 8;
494     }
495 
496     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
497         /*
498          * Shift the decomps up by one if the codes don't match.
499          */
500         for (j = *pdecomps_used; j > i; j--)
501           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
502                         sizeof(_decomp_t));
503     }
504 
505     /*
506      * Insert or replace a decomposition.
507      */
508     size = dectmp_size + (4 - (dectmp_size & 3));
509     if ((*pdecomps)[i].size < size) {
510         if ((*pdecomps)[i].size == 0)
511           (*pdecomps)[i].decomp = (ac_uint4 *)
512               malloc(sizeof(ac_uint4) * size);
513         else
514           (*pdecomps)[i].decomp = (ac_uint4 *)
515               realloc((char *) (*pdecomps)[i].decomp,
516                       sizeof(ac_uint4) * size);
517         (*pdecomps)[i].size = size;
518     }
519 
520     if ((*pdecomps)[i].code != code)
521       (*pdecomps_used)++;
522 
523     (*pdecomps)[i].code = code;
524     (*pdecomps)[i].used = dectmp_size;
525     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
526                   sizeof(ac_uint4) * dectmp_size);
527 
528     /*
529      * NOTICE: This needs changing later so it is more general than simply
530      * pairs.  This calculation is done here to simplify allocation elsewhere.
531      */
532     if (!compat && dectmp_size == 2)
533       comps_used++;
534 }
535 
536 static void
add_title(ac_uint4 code)537 add_title(ac_uint4 code)
538 {
539     ac_uint4 i, j;
540 
541     /*
542      * Always map the code to itself.
543      */
544     cases[2] = code;
545 
546     /* If lower/upper case does not exist, stay the same */
547     if (!cases[0]) cases[0] = code;
548     if (!cases[1]) cases[1] = code;
549 
550     if (title_used == title_size) {
551         if (title_size == 0)
552           title = (_case_t *) malloc(sizeof(_case_t) << 3);
553         else
554           title = (_case_t *) realloc((char *) title,
555                                       sizeof(_case_t) * (title_size + 8));
556         title_size += 8;
557     }
558 
559     /*
560      * Locate the insertion point.
561      */
562     for (i = 0; i < title_used && code > title[i].key; i++) ;
563 
564     if (i < title_used) {
565         /*
566          * Shift the array up by one.
567          */
568         for (j = title_used; j > i; j--)
569           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
570                         sizeof(_case_t));
571     }
572 
573     title[i].key = cases[2];    /* Title */
574     title[i].other1 = cases[0]; /* Upper */
575     title[i].other2 = cases[1]; /* Lower */
576 
577     title_used++;
578 }
579 
580 static void
add_upper(ac_uint4 code)581 add_upper(ac_uint4 code)
582 {
583     ac_uint4 i, j;
584 
585     /*
586      * Always map the code to itself.
587      */
588     cases[0] = code;
589 
590     /*
591      * If the title case character is not present, then make it the same as
592      * the upper case.
593      */
594     if (cases[2] == 0)
595       cases[2] = code;
596 
597     if (upper_used == upper_size) {
598         if (upper_size == 0)
599           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
600         else
601           upper = (_case_t *) realloc((char *) upper,
602                                       sizeof(_case_t) * (upper_size + 8));
603         upper_size += 8;
604     }
605 
606     /*
607      * Locate the insertion point.
608      */
609     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
610 
611     if (i < upper_used) {
612         /*
613          * Shift the array up by one.
614          */
615         for (j = upper_used; j > i; j--)
616           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
617                         sizeof(_case_t));
618     }
619 
620     upper[i].key = cases[0];    /* Upper */
621     upper[i].other1 = cases[1]; /* Lower */
622     upper[i].other2 = cases[2]; /* Title */
623 
624     upper_used++;
625 }
626 
627 static void
add_lower(ac_uint4 code)628 add_lower(ac_uint4 code)
629 {
630     ac_uint4 i, j;
631 
632     /*
633      * Always map the code to itself.
634      */
635     cases[1] = code;
636 
637     /*
638      * If the title case character is empty, then make it the same as the
639      * upper case.
640      */
641     if (cases[2] == 0)
642       cases[2] = cases[0];
643 
644     if (lower_used == lower_size) {
645         if (lower_size == 0)
646           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
647         else
648           lower = (_case_t *) realloc((char *) lower,
649                                       sizeof(_case_t) * (lower_size + 8));
650         lower_size += 8;
651     }
652 
653     /*
654      * Locate the insertion point.
655      */
656     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
657 
658     if (i < lower_used) {
659         /*
660          * Shift the array up by one.
661          */
662         for (j = lower_used; j > i; j--)
663           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
664                         sizeof(_case_t));
665     }
666 
667     lower[i].key = cases[1];    /* Lower */
668     lower[i].other1 = cases[0]; /* Upper */
669     lower[i].other2 = cases[2]; /* Title */
670 
671     lower_used++;
672 }
673 
674 static void
ordered_ccl_insert(ac_uint4 c,ac_uint4 ccl_code)675 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
676 {
677     ac_uint4 i, j;
678 
679     if (ccl_used == ccl_size) {
680         if (ccl_size == 0)
681           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
682         else
683           ccl = (ac_uint4 *)
684               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
685         ccl_size += 24;
686     }
687 
688     /*
689      * Optimize adding the first item.
690      */
691     if (ccl_used == 0) {
692         ccl[0] = ccl[1] = c;
693         ccl[2] = ccl_code;
694         ccl_used += 3;
695         return;
696     }
697 
698     /*
699      * Handle the special case of extending the range on the end.  This
700      * requires that the combining class codes are the same.
701      */
702     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
703         ccl[ccl_used - 2] = c;
704         return;
705     }
706 
707     /*
708      * Handle the special case of adding another range on the end.
709      */
710     if (c > ccl[ccl_used - 2] + 1 ||
711         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
712         ccl[ccl_used++] = c;
713         ccl[ccl_used++] = c;
714         ccl[ccl_used++] = ccl_code;
715         return;
716     }
717 
718     /*
719      * Locate either the insertion point or range for the code.
720      */
721     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
722 
723     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
724         /*
725          * Extend an existing range.
726          */
727         ccl[i + 1] = c;
728         return;
729     } else if (c < ccl[i]) {
730         /*
731          * Start a new range before the current location.
732          */
733         for (j = ccl_used; j > i; j -= 3) {
734             ccl[j] = ccl[j - 3];
735             ccl[j - 1] = ccl[j - 4];
736             ccl[j - 2] = ccl[j - 5];
737         }
738         ccl[i] = ccl[i + 1] = c;
739         ccl[i + 2] = ccl_code;
740     }
741 }
742 
743 /*
744  * Adds a number if it does not already exist and returns an index value
745  * multiplied by 2.
746  */
747 static ac_uint4
make_number(short num,short denom)748 make_number(short num, short denom)
749 {
750     ac_uint4 n;
751 
752     /*
753      * Determine if the number already exists.
754      */
755     for (n = 0; n < nums_used; n++) {
756         if (nums[n].numerator == num && nums[n].denominator == denom)
757           return n << 1;
758     }
759 
760     if (nums_used == nums_size) {
761         if (nums_size == 0)
762           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
763         else
764           nums = (_num_t *) realloc((char *) nums,
765                                     sizeof(_num_t) * (nums_size + 8));
766         nums_size += 8;
767     }
768 
769     n = nums_used++;
770     nums[n].numerator = num;
771     nums[n].denominator = denom;
772 
773     return n << 1;
774 }
775 
776 static void
add_number(ac_uint4 code,short num,short denom)777 add_number(ac_uint4 code, short num, short denom)
778 {
779     ac_uint4 i, j;
780 
781     /*
782      * Insert the code in order.
783      */
784     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
785 
786     /*
787      * Handle the case of the codes matching and simply replace the number
788      * that was there before.
789      */
790     if (i < ncodes_used && code == ncodes[i].code) {
791         ncodes[i].idx = make_number(num, denom);
792         return;
793     }
794 
795     /*
796      * Resize the array if necessary.
797      */
798     if (ncodes_used == ncodes_size) {
799         if (ncodes_size == 0)
800           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
801         else
802           ncodes = (_codeidx_t *)
803               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
804 
805         ncodes_size += 8;
806     }
807 
808     /*
809      * Shift things around to insert the code if necessary.
810      */
811     if (i < ncodes_used) {
812         for (j = ncodes_used; j > i; j--) {
813             ncodes[j].code = ncodes[j - 1].code;
814             ncodes[j].idx = ncodes[j - 1].idx;
815         }
816     }
817     ncodes[i].code = code;
818     ncodes[i].idx = make_number(num, denom);
819 
820     ncodes_used++;
821 }
822 
823 /*
824  * This routine assumes that the line is a valid Unicode Character Database
825  * entry.
826  */
827 static void
read_cdata(FILE * in)828 read_cdata(FILE *in)
829 {
830     ac_uint4 i, lineno, skip, code, ccl_code;
831     short wnum, neg, number[2], compat;
832     char line[512], *s, *e;
833 
834     lineno = skip = 0;
835     while (fgets(line, sizeof(line), in)) {
836         int is_title = 0;
837 
838         if( (s=strchr(line, '\n')) ) *s = '\0';
839         lineno++;
840 
841         /*
842          * Skip blank lines and lines that start with a '#'.
843          */
844         if (line[0] == 0 || line[0] == '#')
845           continue;
846 
847         /*
848          * If lines need to be skipped, do it here.
849          */
850         if (skip) {
851             skip--;
852             continue;
853         }
854 
855         /*
856          * Collect the code.  The code can be up to 6 hex digits in length to
857          * allow surrogates to be specified.
858          */
859         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
860             code <<= 4;
861             if (*s >= '0' && *s <= '9')
862               code += *s - '0';
863             else if (*s >= 'A' && *s <= 'F')
864               code += (*s - 'A') + 10;
865             else if (*s >= 'a' && *s <= 'f')
866               code += (*s - 'a') + 10;
867         }
868 
869         /*
870          * Handle the following special cases:
871          * 1. 4E00-9FA5 CJK Ideographs.
872          * 2. AC00-D7A3 Hangul Syllables.
873          * 3. D800-DFFF Surrogates.
874          * 4. E000-F8FF Private Use Area.
875          * 5. F900-FA2D Han compatibility.
876 	 * ...Plus additional ranges in newer Unicode versions...
877          */
878         switch (code) {
879 	  case 0x3400:
880 	    /* CJK Ideograph Extension A */
881             add_range(0x3400, 0x4db5, "Lo", "L");
882 
883             add_range(0x3400, 0x4db5, "Cp", 0);
884 
885 	    skip = 1;
886 	    break;
887           case 0x4e00:
888             /*
889              * The Han ideographs.
890              */
891             add_range(0x4e00, 0x9fff, "Lo", "L");
892 
893             /*
894              * Add the characters to the defined category.
895              */
896             add_range(0x4e00, 0x9fa5, "Cp", 0);
897 
898             skip = 1;
899             break;
900           case 0xac00:
901             /*
902              * The Hangul syllables.
903              */
904             add_range(0xac00, 0xd7a3, "Lo", "L");
905 
906             /*
907              * Add the characters to the defined category.
908              */
909             add_range(0xac00, 0xd7a3, "Cp", 0);
910 
911             skip = 1;
912             break;
913           case 0xd800:
914             /*
915              * Make a range of all surrogates and assume some default
916              * properties.
917              */
918             add_range(0x010000, 0x10ffff, "Cs", "L");
919             skip = 5;
920             break;
921           case 0xe000:
922             /*
923              * The Private Use area.  Add with a default set of properties.
924              */
925             add_range(0xe000, 0xf8ff, "Co", "L");
926             skip = 1;
927             break;
928           case 0xf900:
929             /*
930              * The CJK compatibility area.
931              */
932             add_range(0xf900, 0xfaff, "Lo", "L");
933 
934             /*
935              * Add the characters to the defined category.
936              */
937             add_range(0xf900, 0xfaff, "Cp", 0);
938 
939             skip = 1;
940 	    break;
941 	  case 0x20000:
942 	    /* CJK Ideograph Extension B */
943             add_range(0x20000, 0x2a6d6, "Lo", "L");
944 
945             add_range(0x20000, 0x2a6d6, "Cp", 0);
946 
947 	    skip = 1;
948 	    break;
949 	  case 0xf0000:
950 	    /* Plane 15 private use */
951 	    add_range(0xf0000, 0xffffd, "Co", "L");
952 	    skip = 1;
953 	    break;
954 
955 	  case 0x100000:
956 	    /* Plane 16 private use */
957 	    add_range(0x100000, 0x10fffd, "Co", "L");
958 	    skip = 1;
959 	    break;
960         }
961 
962         if (skip)
963           continue;
964 
965         /*
966          * Add the code to the defined category.
967          */
968         ordered_range_insert(code, "Cp", 2);
969 
970         /*
971          * Locate the first character property field.
972          */
973         for (i = 0; *s != 0 && i < 2; s++) {
974             if (*s == ';')
975               i++;
976         }
977         for (e = s; *e && *e != ';'; e++) ;
978 
979         ordered_range_insert(code, s, e - s);
980 
981         if (e - s == 2 && s[0] == 'L' && s[1] == 't') {
982             is_title = 1;
983         }
984 
985         /*
986          * Locate the combining class code.
987          */
988         for (s = e; *s != 0 && i < 3; s++) {
989             if (*s == ';')
990               i++;
991         }
992 
993         /*
994          * Convert the combining class code from decimal.
995          */
996         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
997           ccl_code = (ccl_code * 10) + (*e - '0');
998 
999         /*
1000          * Add the code if it not 0.
1001          */
1002         if (ccl_code != 0)
1003           ordered_ccl_insert(code, ccl_code);
1004 
1005         /*
1006          * Locate the second character property field.
1007          */
1008         for (s = e; *s != 0 && i < 4; s++) {
1009             if (*s == ';')
1010               i++;
1011         }
1012         for (e = s; *e && *e != ';'; e++) ;
1013 
1014         ordered_range_insert(code, s, e - s);
1015 
1016         /*
1017          * Check for a decomposition.
1018          */
1019         s = ++e;
1020         if (*s != ';') {
1021 	    compat = *s == '<';
1022 	    if (compat) {
1023 		/*
1024 		 * Skip compatibility formatting tag.
1025 		 */
1026 		while (*s++ != '>');
1027 	    }
1028             /*
1029              * Collect the codes of the decomposition.
1030              */
1031             for (dectmp_size = 0; *s != ';'; ) {
1032                 /*
1033                  * Skip all leading non-hex digits.
1034                  */
1035                 while (!ishdigit(*s))
1036  		  s++;
1037 
1038                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
1039                     dectmp[dectmp_size] <<= 4;
1040                     if (*s >= '0' && *s <= '9')
1041                       dectmp[dectmp_size] += *s - '0';
1042                     else if (*s >= 'A' && *s <= 'F')
1043                       dectmp[dectmp_size] += (*s - 'A') + 10;
1044                     else if (*s >= 'a' && *s <= 'f')
1045                       dectmp[dectmp_size] += (*s - 'a') + 10;
1046                 }
1047                 dectmp_size++;
1048             }
1049 
1050             /*
1051              * If there are any codes in the temporary decomposition array,
1052              * then add the character with its decomposition.
1053              */
1054             if (dectmp_size > 0) {
1055 		if (!compat) {
1056 		    add_decomp(code, 0);
1057 		}
1058 		add_decomp(code, 1);
1059 	    }
1060         }
1061 
1062         /*
1063          * Skip to the number field.
1064          */
1065         for (i = 0; i < 3 && *s; s++) {
1066             if (*s == ';')
1067               i++;
1068         }
1069 
1070         /*
1071          * Scan the number in.
1072          */
1073         number[0] = number[1] = 0;
1074         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
1075             if (*e == '-') {
1076                 neg = 1;
1077                 continue;
1078             }
1079 
1080             if (*e == '/') {
1081                 /*
1082                  * Move the denominator of the fraction.
1083                  */
1084                 if (neg)
1085                   number[wnum] *= -1;
1086                 neg = 0;
1087                 e++;
1088                 wnum++;
1089             }
1090             number[wnum] = (number[wnum] * 10) + (*e - '0');
1091         }
1092 
1093         if (e > s) {
1094             /*
1095              * Adjust the denominator in case of integers and add the number.
1096              */
1097             if (wnum == 0)
1098               number[1] = 1;
1099 
1100             add_number(code, number[0], number[1]);
1101         }
1102 
1103         /*
1104          * Skip to the start of the possible case mappings.
1105          */
1106         for (s = e, i = 0; i < 4 && *s; s++) {
1107             if (*s == ';')
1108               i++;
1109         }
1110 
1111         /*
1112          * Collect the case mappings.
1113          */
1114         cases[0] = cases[1] = cases[2] = 0;
1115         for (i = 0; i < 3; i++) {
1116             while (ishdigit(*s)) {
1117                 cases[i] <<= 4;
1118                 if (*s >= '0' && *s <= '9')
1119                   cases[i] += *s - '0';
1120                 else if (*s >= 'A' && *s <= 'F')
1121                   cases[i] += (*s - 'A') + 10;
1122                 else if (*s >= 'a' && *s <= 'f')
1123                   cases[i] += (*s - 'a') + 10;
1124                 s++;
1125             }
1126             if (*s == ';')
1127               s++;
1128         }
1129         if (is_title)
1130           /*
1131            * Add the upper and lower mappings for a title case character.
1132            */
1133           add_title(code);
1134         else if (cases[1])
1135           /*
1136            * Add the lower and title case mappings for the upper case
1137            * character.
1138            */
1139           add_upper(code);
1140         else if (cases[0])
1141           /*
1142            * Add the upper and title case mappings for the lower case
1143            * character.
1144            */
1145           add_lower(code);
1146     }
1147 }
1148 
1149 #if 0
1150 
1151 static _decomp_t *
1152 find_decomp(ac_uint4 code, short compat)
1153 {
1154     long l, r, m;
1155     _decomp_t *decs;
1156 
1157     l = 0;
1158     r = (compat ? kdecomps_used : decomps_used) - 1;
1159     decs = compat ? kdecomps : decomps;
1160     while (l <= r) {
1161         m = (l + r) >> 1;
1162         if (code > decs[m].code)
1163           l = m + 1;
1164         else if (code < decs[m].code)
1165           r = m - 1;
1166         else
1167           return &decs[m];
1168     }
1169     return 0;
1170 }
1171 
1172 static void
1173 decomp_it(_decomp_t *d, short compat)
1174 {
1175     ac_uint4 i;
1176     _decomp_t *dp;
1177 
1178     for (i = 0; i < d->used; i++) {
1179         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
1180           decomp_it(dp, compat);
1181         else
1182           dectmp[dectmp_size++] = d->decomp[i];
1183     }
1184 }
1185 
1186 
1187 /*
1188  * Expand all decompositions by recursively decomposing each character
1189  * in the decomposition.
1190  */
1191 static void
1192 expand_decomp(void)
1193 {
1194     ac_uint4 i;
1195 
1196     for (i = 0; i < decomps_used; i++) {
1197         dectmp_size = 0;
1198         decomp_it(&decomps[i], 0);
1199         if (dectmp_size > 0)
1200           add_decomp(decomps[i].code, 0);
1201     }
1202 
1203     for (i = 0; i < kdecomps_used; i++) {
1204         dectmp_size = 0;
1205         decomp_it(&kdecomps[i], 1);
1206         if (dectmp_size > 0)
1207           add_decomp(kdecomps[i].code, 1);
1208     }
1209 }
1210 
1211 static int
1212 cmpcomps(const void *v_comp1, const void *v_comp2)
1213 {
1214 	const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
1215     long diff = comp1->code1 - comp2->code1;
1216 
1217     if (!diff)
1218 	diff = comp1->code2 - comp2->code2;
1219     return (int) diff;
1220 }
1221 
1222 #endif
1223 
1224 /*
1225  * Load composition exclusion data
1226  */
1227 static void
read_compexdata(FILE * in)1228 read_compexdata(FILE *in)
1229 {
1230     ac_uint2 i;
1231     ac_uint4 code;
1232     char line[512], *s;
1233 
1234     (void) memset((char *) compexs, 0, sizeof(compexs));
1235 
1236     while (fgets(line, sizeof(line), in)) {
1237 	if( (s=strchr(line, '\n')) ) *s = '\0';
1238         /*
1239          * Skip blank lines and lines that start with a '#'.
1240          */
1241         if (line[0] == 0 || line[0] == '#')
1242 	    continue;
1243 
1244 	/*
1245          * Collect the code.  Assume max 6 digits
1246          */
1247 
1248 	for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
1249 	    if (isspace((unsigned char)*s)) break;
1250             code <<= 4;
1251             if (*s >= '0' && *s <= '9')
1252 		code += *s - '0';
1253             else if (*s >= 'A' && *s <= 'F')
1254 		code += (*s - 'A') + 10;
1255             else if (*s >= 'a' && *s <= 'f')
1256 		code += (*s - 'a') + 10;
1257         }
1258         COMPEX_SET(code);
1259     }
1260 }
1261 
1262 #if 0
1263 
1264 /*
1265  * Creates array of compositions from decomposition array
1266  */
1267 static void
1268 create_comps(void)
1269 {
1270     ac_uint4 i, cu;
1271 
1272     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1273 
1274     for (i = cu = 0; i < decomps_used; i++) {
1275 	if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1276 	    continue;
1277 	comps[cu].comp = decomps[i].code;
1278 	comps[cu].count = 2;
1279 	comps[cu].code1 = decomps[i].decomp[0];
1280 	comps[cu].code2 = decomps[i].decomp[1];
1281 	cu++;
1282     }
1283     comps_used = cu;
1284     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
1285 }
1286 
1287 #endif
1288 
1289 #if HARDCODE_DATA
1290 static void
write_case(FILE * out,_case_t * tab,int num,int first)1291 write_case(FILE *out, _case_t *tab, int num, int first)
1292 {
1293     int i;
1294 
1295     for (i=0; i<num; i++) {
1296 	if (first) first = 0;
1297 	else fprintf(out, ",");
1298 	fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
1299 		(unsigned long) tab[i].key, (unsigned long) tab[i].other1,
1300 		(unsigned long) tab[i].other2);
1301     }
1302 }
1303 
1304 #define PREF "static const "
1305 
1306 #endif
1307 
1308 static void
write_cdata(char * opath)1309 write_cdata(char *opath)
1310 {
1311     FILE *out;
1312 	ac_uint4 bytes;
1313     ac_uint4 i, idx, nprops;
1314 #if !(HARDCODE_DATA)
1315     ac_uint2 casecnt[2];
1316 #endif
1317     char path[BUFSIZ];
1318 #if HARDCODE_DATA
1319     int j, k;
1320 
1321     /*****************************************************************
1322      *
1323      * Generate the ctype data.
1324      *
1325      *****************************************************************/
1326 
1327     /*
1328      * Open the output file.
1329      */
1330     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
1331     if ((out = fopen(path, "w")) == 0)
1332       return;
1333 #else
1334     /*
1335      * Open the ctype.dat file.
1336      */
1337     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
1338     if ((out = fopen(path, "wb")) == 0)
1339       return;
1340 #endif
1341 
1342     /*
1343      * Collect the offsets for the properties.  The offsets array is
1344      * on a 4-byte boundary to keep things efficient for architectures
1345      * that need such a thing.
1346      */
1347     for (i = idx = 0; i < NUMPROPS; i++) {
1348         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1349         idx += proptbl[i].used;
1350     }
1351 
1352     /*
1353      * Add the sentinel index which is used by the binary search as the upper
1354      * bound for a search.
1355      */
1356     propcnt[i] = idx;
1357 
1358     /*
1359      * Record the actual number of property lists.  This may be different than
1360      * the number of offsets actually written because of aligning on a 4-byte
1361      * boundary.
1362      */
1363     hdr[1] = NUMPROPS;
1364 
1365     /*
1366      * Calculate the byte count needed and pad the property counts array to a
1367      * 4-byte boundary.
1368      */
1369     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
1370       bytes += 4 - (bytes & 3);
1371     nprops = bytes / sizeof(ac_uint2);
1372     bytes += sizeof(ac_uint4) * idx;
1373 
1374 #if HARDCODE_DATA
1375     fprintf(out,
1376         "/* This file was generated from a modified version UCData's ucgendat.\n"
1377         " *\n"
1378         " *                     DO NOT EDIT THIS FILE!\n"
1379         " * \n"
1380         " * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download\n"
1381         " * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt\n"
1382         " * files from  http://www.unicode.org/Public/ and run this program.\n"
1383         " *\n"
1384         " * More information can be found in the UCData package. Unfortunately,\n"
1385         " * the project's page doesn't seem to be live anymore, so you can use\n"
1386         " * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */\n\n");
1387 
1388     fprintf(out, PREF "unsigned short _ucprop_size = %d;\n\n", NUMPROPS);
1389 
1390     fprintf(out, PREF "unsigned short  _ucprop_offsets[] = {");
1391 
1392     for (i = 0; i<nprops; i++) {
1393        if (i) fprintf(out, ",");
1394        if (!(i&7)) fprintf(out, "\n\t");
1395        else fprintf(out, " ");
1396        fprintf(out, "0x%04x", propcnt[i]);
1397     }
1398     fprintf(out, "\n};\n\n");
1399 
1400     fprintf(out, PREF "unsigned int _ucprop_ranges[] = {");
1401 
1402     k = 0;
1403     for (i = 0; i < NUMPROPS; i++) {
1404 	if (proptbl[i].used > 0) {
1405 	  for (j=0; j<proptbl[i].used; j++) {
1406 	    if (k) fprintf(out, ",");
1407 	    if (!(k&3)) fprintf(out,"\n\t");
1408 	    else fprintf(out, " ");
1409 	    k++;
1410 	    fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
1411 	  }
1412 	}
1413     }
1414     fprintf(out, "\n};\n\n");
1415 #else
1416     /*
1417      * Write the header.
1418      */
1419     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1420 
1421     /*
1422      * Write the byte count.
1423      */
1424     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1425 
1426     /*
1427      * Write the property list counts.
1428      */
1429     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
1430 
1431     /*
1432      * Write the property lists.
1433      */
1434     for (i = 0; i < NUMPROPS; i++) {
1435         if (proptbl[i].used > 0)
1436           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
1437                  proptbl[i].used, out);
1438     }
1439 
1440     fclose(out);
1441 #endif
1442 
1443     /*****************************************************************
1444      *
1445      * Generate the case mapping data.
1446      *
1447      *****************************************************************/
1448 
1449 #if HARDCODE_DATA
1450     fprintf(out, PREF "unsigned int _uccase_size = %ld;\n\n",
1451         (long) (upper_used + lower_used + title_used));
1452 
1453     fprintf(out,
1454         "/* Starting indexes of the case tables\n"
1455         " * UpperIndex = 0\n"
1456         " * LowerIndex = _uccase_len[0]\n"
1457         " * TitleIndex = LowerIndex + _uccase_len[1] */\n\n");
1458     fprintf(out, PREF "unsigned short _uccase_len[2] = {%ld, %ld};\n\n",
1459         (long) upper_used, (long) lower_used);
1460     fprintf(out, PREF "unsigned int _uccase_map[] = {");
1461 
1462     if (upper_used > 0)
1463       /*
1464        * Write the upper case table.
1465        */
1466       write_case(out, upper, upper_used, 1);
1467 
1468     if (lower_used > 0)
1469       /*
1470        * Write the lower case table.
1471        */
1472       write_case(out, lower, lower_used, !upper_used);
1473 
1474     if (title_used > 0)
1475       /*
1476        * Write the title case table.
1477        */
1478       write_case(out, title, title_used, !(upper_used||lower_used));
1479 
1480     if (!(upper_used || lower_used || title_used))
1481 	fprintf(out, "\t0");
1482 
1483     fprintf(out, "\n};\n\n");
1484 #else
1485     /*
1486      * Open the case.dat file.
1487      */
1488     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
1489     if ((out = fopen(path, "wb")) == 0)
1490       return;
1491 
1492     /*
1493      * Write the case mapping tables.
1494      */
1495     hdr[1] = upper_used + lower_used + title_used;
1496     casecnt[0] = upper_used;
1497     casecnt[1] = lower_used;
1498 
1499     /*
1500      * Write the header.
1501      */
1502     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1503 
1504     /*
1505      * Write the upper and lower case table sizes.
1506      */
1507     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
1508 
1509     if (upper_used > 0)
1510       /*
1511        * Write the upper case table.
1512        */
1513       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1514 
1515     if (lower_used > 0)
1516       /*
1517        * Write the lower case table.
1518        */
1519       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1520 
1521     if (title_used > 0)
1522       /*
1523        * Write the title case table.
1524        */
1525       fwrite((char *) title, sizeof(_case_t), title_used, out);
1526 
1527     fclose(out);
1528 #endif
1529 
1530 #if 0
1531 
1532     /*****************************************************************
1533      *
1534      * Generate the composition data.
1535      *
1536      *****************************************************************/
1537 
1538     /*
1539      * Create compositions from decomposition data
1540      */
1541     create_comps();
1542 
1543 #if HARDCODE_DATA
1544     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
1545         comps_used * 4L);
1546 
1547     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
1548 
1549      /*
1550       * Now, if comps exist, write them out.
1551       */
1552     if (comps_used > 0) {
1553 	for (i=0; i<comps_used; i++) {
1554 	    if (i) fprintf(out, ",");
1555 	    fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
1556 	        (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
1557 	        (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
1558 	}
1559     } else {
1560 	fprintf(out, "\t0");
1561     }
1562     fprintf(out, "\n};\n\n");
1563 #else
1564     /*
1565      * Open the comp.dat file.
1566      */
1567     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
1568     if ((out = fopen(path, "wb")) == 0)
1569 	return;
1570 
1571     /*
1572      * Write the header.
1573      */
1574     hdr[1] = (ac_uint2) comps_used * 4;
1575     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1576 
1577     /*
1578      * Write out the byte count to maintain header size.
1579      */
1580     bytes = comps_used * sizeof(_comp_t);
1581     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1582 
1583     /*
1584      * Now, if comps exist, write them out.
1585      */
1586     if (comps_used > 0)
1587         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1588 
1589     fclose(out);
1590 #endif
1591 
1592     /*****************************************************************
1593      *
1594      * Generate the decomposition data.
1595      *
1596      *****************************************************************/
1597 
1598     /*
1599      * Fully expand all decompositions before generating the output file.
1600      */
1601     expand_decomp();
1602 
1603 #if HARDCODE_DATA
1604     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
1605         decomps_used * 2L);
1606 
1607     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
1608 
1609     if (decomps_used) {
1610 	/*
1611 	 * Write the list of decomp nodes.
1612 	 */
1613 	for (i = idx = 0; i < decomps_used; i++) {
1614 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1615 	        (unsigned long) decomps[i].code, (unsigned long) idx);
1616 	    idx += decomps[i].used;
1617 	}
1618 
1619 	/*
1620 	 * Write the sentinel index as the last decomp node.
1621 	 */
1622 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1623 
1624 	fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
1625 	/*
1626 	 * Write the decompositions themselves.
1627 	 */
1628 	k = 0;
1629 	for (i = 0; i < decomps_used; i++)
1630 	  for (j=0; j<decomps[i].used; j++) {
1631 	    if (k) fprintf(out, ",");
1632 	    if (!(k&3)) fprintf(out,"\n\t");
1633 	    else fprintf(out, " ");
1634 	    k++;
1635 	    fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
1636 	  }
1637 	fprintf(out, "\n};\n\n");
1638     }
1639 #else
1640     /*
1641      * Open the decomp.dat file.
1642      */
1643     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
1644     if ((out = fopen(path, "wb")) == 0)
1645       return;
1646 
1647     hdr[1] = decomps_used;
1648 
1649     /*
1650      * Write the header.
1651      */
1652     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1653 
1654     /*
1655      * Write a temporary byte count which will be calculated as the
1656      * decompositions are written out.
1657      */
1658     bytes = 0;
1659     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1660 
1661     if (decomps_used) {
1662         /*
1663          * Write the list of decomp nodes.
1664          */
1665         for (i = idx = 0; i < decomps_used; i++) {
1666             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
1667             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1668             idx += decomps[i].used;
1669         }
1670 
1671         /*
1672          * Write the sentinel index as the last decomp node.
1673          */
1674         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1675 
1676         /*
1677          * Write the decompositions themselves.
1678          */
1679         for (i = 0; i < decomps_used; i++)
1680           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
1681                  decomps[i].used, out);
1682 
1683         /*
1684          * Seek back to the beginning and write the byte count.
1685          */
1686         bytes = (sizeof(ac_uint4) * idx) +
1687             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1688         fseek(out, sizeof(ac_uint2) << 1, 0L);
1689         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1690 
1691         fclose(out);
1692     }
1693 #endif
1694 
1695 #ifdef HARDCODE_DATA
1696     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
1697         kdecomps_used * 2L);
1698 
1699     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
1700 
1701     if (kdecomps_used) {
1702 	/*
1703 	 * Write the list of kdecomp nodes.
1704 	 */
1705 	for (i = idx = 0; i < kdecomps_used; i++) {
1706 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1707 	        (unsigned long) kdecomps[i].code, (unsigned long) idx);
1708 	    idx += kdecomps[i].used;
1709 	}
1710 
1711 	/*
1712 	 * Write the sentinel index as the last decomp node.
1713 	 */
1714 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1715 
1716 	fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
1717 
1718 	/*
1719 	 * Write the decompositions themselves.
1720 	 */
1721 	k = 0;
1722 	for (i = 0; i < kdecomps_used; i++)
1723 	  for (j=0; j<kdecomps[i].used; j++) {
1724 	    if (k) fprintf(out, ",");
1725 	    if (!(k&3)) fprintf(out,"\n\t");
1726 	    else fprintf(out, " ");
1727 	    k++;
1728 	    fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
1729 	  }
1730 	fprintf(out, "\n};\n\n");
1731     }
1732 #else
1733     /*
1734      * Open the kdecomp.dat file.
1735      */
1736     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
1737     if ((out = fopen(path, "wb")) == 0)
1738       return;
1739 
1740     hdr[1] = kdecomps_used;
1741 
1742     /*
1743      * Write the header.
1744      */
1745     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1746 
1747     /*
1748      * Write a temporary byte count which will be calculated as the
1749      * decompositions are written out.
1750      */
1751     bytes = 0;
1752     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1753 
1754     if (kdecomps_used) {
1755         /*
1756          * Write the list of kdecomp nodes.
1757          */
1758         for (i = idx = 0; i < kdecomps_used; i++) {
1759             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
1760             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1761             idx += kdecomps[i].used;
1762         }
1763 
1764         /*
1765          * Write the sentinel index as the last decomp node.
1766          */
1767         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1768 
1769         /*
1770          * Write the decompositions themselves.
1771          */
1772         for (i = 0; i < kdecomps_used; i++)
1773           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
1774                  kdecomps[i].used, out);
1775 
1776         /*
1777          * Seek back to the beginning and write the byte count.
1778          */
1779         bytes = (sizeof(ac_uint4) * idx) +
1780             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1781         fseek(out, sizeof(ac_uint2) << 1, 0L);
1782         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1783 
1784         fclose(out);
1785     }
1786 #endif
1787 
1788     /*****************************************************************
1789      *
1790      * Generate the combining class data.
1791      *
1792      *****************************************************************/
1793 #ifdef HARDCODE_DATA
1794     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
1795 
1796     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
1797 
1798     if (ccl_used > 0) {
1799 	/*
1800 	 * Write the combining class ranges out.
1801 	 */
1802 	for (i = 0; i<ccl_used; i++) {
1803 	    if (i) fprintf(out, ",");
1804 	    if (!(i&3)) fprintf(out, "\n\t");
1805 	    else fprintf(out, " ");
1806 	    fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
1807 	}
1808     } else {
1809 	fprintf(out, "\t0");
1810     }
1811     fprintf(out, "\n};\n\n");
1812 #else
1813     /*
1814      * Open the cmbcl.dat file.
1815      */
1816     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
1817     if ((out = fopen(path, "wb")) == 0)
1818       return;
1819 
1820     /*
1821      * Set the number of ranges used.  Each range has a combining class which
1822      * means each entry is a 3-tuple.
1823      */
1824     hdr[1] = ccl_used / 3;
1825 
1826     /*
1827      * Write the header.
1828      */
1829     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1830 
1831     /*
1832      * Write out the byte count to maintain header size.
1833      */
1834     bytes = ccl_used * sizeof(ac_uint4);
1835     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1836 
1837     if (ccl_used > 0)
1838       /*
1839        * Write the combining class ranges out.
1840        */
1841       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
1842 
1843     fclose(out);
1844 #endif
1845 
1846     /*****************************************************************
1847      *
1848      * Generate the number data.
1849      *
1850      *****************************************************************/
1851 
1852 #if HARDCODE_DATA
1853     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
1854         (unsigned long)ncodes_used<<1);
1855 
1856     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
1857 
1858     /*
1859      * Now, if number mappings exist, write them out.
1860      */
1861     if (ncodes_used > 0) {
1862 	for (i = 0; i<ncodes_used; i++) {
1863 	    if (i) fprintf(out, ",");
1864 	    if (!(i&1)) fprintf(out, "\n\t");
1865 	    else fprintf(out, " ");
1866 	    fprintf(out, "0x%08lx, 0x%08lx",
1867 	        (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
1868 	}
1869 	fprintf(out, "\n};\n\n");
1870 
1871 	fprintf(out, PREF "short _ucnum_vals[] = {");
1872 	for (i = 0; i<nums_used; i++) {
1873 	    if (i) fprintf(out, ",");
1874 	    if (!(i&3)) fprintf(out, "\n\t");
1875 	    else fprintf(out, " ");
1876 	    if (nums[i].numerator < 0) {
1877 		fprintf(out, "%6d, 0x%04x",
1878 		  nums[i].numerator, nums[i].denominator);
1879 	    } else {
1880 		fprintf(out, "0x%04x, 0x%04x",
1881 		  nums[i].numerator, nums[i].denominator);
1882 	    }
1883 	}
1884 	fprintf(out, "\n};\n\n");
1885     }
1886 #else
1887     /*
1888      * Open the num.dat file.
1889      */
1890     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
1891     if ((out = fopen(path, "wb")) == 0)
1892       return;
1893 
1894     /*
1895      * The count part of the header will be the total number of codes that
1896      * have numbers.
1897      */
1898     hdr[1] = (ac_uint2) (ncodes_used << 1);
1899     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1900 
1901     /*
1902      * Write the header.
1903      */
1904     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1905 
1906     /*
1907      * Write out the byte count to maintain header size.
1908      */
1909     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1910 
1911     /*
1912      * Now, if number mappings exist, write them out.
1913      */
1914     if (ncodes_used > 0) {
1915         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1916         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1917     }
1918 #endif
1919 
1920 #endif
1921 
1922     fclose(out);
1923 }
1924 
/*
 * Print the command-line synopsis for `prog` on stderr and terminate the
 * program with exit status 1.  Does not return.
 */
static void
usage(char *prog)
{
    /* Synopsis line; the program name is the only substituted part. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]"
            " datafile1 datafile2 ...\n\n",
            prog);

    /* Option descriptions contain no conversions, so emit them verbatim. */
    fputs("-o output-directory\n\t\tWrite the output files to a different"
          " directory (default: .).\n",
          stderr);
    fputs("-x composition-exclusion\n\t\tFile of composition codes"
          " that should be excluded.\n",
          stderr);

    exit(1);
}
1939 
/*
 * Program entry point.
 *
 * Arguments are processed in order:
 *   -o DIR   directory the output is written to (default ".")
 *   -x FILE  composition exclusion file, loaded with read_compexdata()
 *   other    character data file, loaded with read_cdata()
 *
 * A file that cannot be opened is reported on stderr and skipped.  An
 * unknown option, or -o / -x without a following argument, prints the
 * usage text and exits.  After all arguments have been handled the
 * collected data is written with write_cdata().
 *
 * Always returns 0 when it returns.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    /*
     * The program name used in diagnostics is argv[0]; argv[1] is the
     * first real argument and is a null pointer when there are none.
     */
    prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "ucgendat";

    opath = 0;
    in = stdin;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /* The option needs a value; usage() does not return. */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /* The option needs a value; usage() does not return. */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                    in = 0;
                }
                break;
              default:
                usage(prog);
            }
        } else {
            if (in != stdin && in != NULL)
              fclose(in);
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
                in = 0;
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}
2000