2 * Copyright (C) 2002 Manuel Novoa III
3 * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
5 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
9 * Initial test implementation of strcoll, strxfrm, wcscoll, and wcsxfrm.
10 * The code needs to be cleaned up a good bit, but I'd like to see people
22 #ifdef __UCLIBC_HAS_LOCALE__
23 #if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l)
27 #error WANT_WIDE should be defined for L_strxfrm
30 #error L_wcsxfrm already defined for L_strxfrm
32 #endif /* L_strxfrm */
34 #if defined(L_strxfrm) || defined(L_strxfrm_l)
36 #define wcscoll strcoll
37 #define wcscoll_l strcoll_l
38 #define wcsxfrm strxfrm
39 #define wcsxfrm_l strxfrm_l
49 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) */
51 #if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE)
54 int wcscoll (const Wchar *s0, const Wchar *s1)
56 return wcscoll_l(s0, s1, __UCLIBC_CURLOCALE );
58 libc_hidden_def(wcscoll)
61 size_t wcsxfrm(Wchar *__restrict ws1, const Wchar *__restrict ws2, size_t n)
63 return wcsxfrm_l(ws1, ws2, n, __UCLIBC_CURLOCALE );
65 libc_hidden_def(wcsxfrm)
67 #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
71 #define CUR_COLLATE (&__UCLIBC_CURLOCALE->collate)
73 #define CUR_COLLATE (& __LOCALE_PTR->collate)
80 const Wchar *eob; /* end of backward */
83 __uwchar_t ui_weight; /* undefined or invalid */
88 /* should be wchar_t. if wchar < 0 do EILSEQ? */
90 __uwchar_t ci_pending[MAX_PENDING]; /* nul-terminated */
93 char *bbe; /* end of back_buf (actual last... not 1 past end) */
94 char *bp; /* ptr into backbuf, NULL if not in backward mode */
102 #define WEIGHT_MASK 0x3fffU
103 #define RULE_MASK 0xc000U
105 #define RULE_FORWARD (1 << 14)
106 #define RULE_POSITION (1 << 15)
108 #define UI_IDX (WEIGHT_MASK-6)
109 #define POSIT_IDX (WEIGHT_MASK-5)
110 #define RANGE_IDX (WEIGHT_MASK-4)
111 #define UNDEF_IDX (WEIGHT_MASK-3)
112 #define INVAL_IDX (WEIGHT_MASK-2)
113 #define DITTO_IDX (WEIGHT_MASK-1)
118 #define TRACE(X) printf X
120 #define TRACE(X) ((void)0)
123 static int lookup(wchar_t wc __LOCALE_PARAM )
125 unsigned int sc, n, i0, i1;
127 if (((__uwchar_t) wc) > 0xffffU) {
131 sc = wc & CUR_COLLATE->ti_mask;
132 wc >>= CUR_COLLATE->ti_shift;
133 n = wc & CUR_COLLATE->ii_mask;
134 wc >>= CUR_COLLATE->ii_shift;
136 i0 = CUR_COLLATE->wcs2colidt_tbl[wc];
137 i0 <<= CUR_COLLATE->ii_shift;
138 i1 = CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + i0 + n];
139 i1 <<= CUR_COLLATE->ti_shift;
140 return CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + CUR_COLLATE->ti_len + i1 + sc];
144 static void init_col_state(col_state_t *cs, const Wchar *wcs)
146 memset(cs, 0, sizeof(col_state_t));
148 cs->bp = cs->back_buf = cs->ibb;
150 cs->bbe = cs->back_buf + (cs->bb_size -1);
153 static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM )
155 int r, w, ru, ri, popping_backup_stack;
161 #else /* WANT_WIDE */
166 #endif /* WANT_WIDE */
172 TRACE(("ru_pushed = %d\n", ru));
177 #ifdef __UCLIBC_MJN3_ONLY__
178 #warning should we walk pendings backwards?
180 if (cs->cip) { /* possible pending weight */
181 if ((r = *(cs->cip++)) == 0) {
185 cs->weightidx = r & WEIGHT_MASK;
186 assert(cs->weightidx);
187 /* assert(cs->weightidx != WEIGHT_MASK); */
188 } else { /* get the next collation item from the string */
189 TRACE(("clearing popping flag\n"));
190 popping_backup_stack = 0;
193 /* keep first pos as 0 for a sentinal */
194 if (*cs->bp) { /* pending backward chars */
196 popping_backup_stack = 1;
197 TRACE(("setting popping flag\n"));
199 if (*cs->bp > 0) { /* singles pending */
201 if ((*cs->bp -= 1) == 0) {
204 } else { /* last was a multi */
208 } else if (!*cs->s) { /* not in backward mode and end of string */
218 cs->colitem = r = lookup(*cs->s __LOCALE_ARG );
219 #else /* WANT_WIDE */
220 n = n0 = __locale_mbrtowc_l(&WC, cs->s, __LOCALE_PTR);
226 cs->colitem = r = lookup(WC __LOCALE_ARG );
227 #endif /* WANT_WIDE */
229 TRACE((" r=%d WC=%#lx\n", r, (unsigned long)(WC)));
231 if (r > CUR_COLLATE->max_col_index) { /* starting char for one or more sequences */
232 p = CUR_COLLATE->multistart_tbl;
233 p += p[r-CUR_COLLATE->max_col_index -1];
238 if (!*p) { /* found it */
240 TRACE((" found multi %d\n", n));
244 /* the lookup check here is safe since we're assured that *p is a valid colidx */
245 if (!cs->s[n] || (lookup(cs->s[n] __LOCALE_ARG ) != *p)) {
251 #else /* WANT_WIDE */
253 nx = __locale_mbrtowc_l(&WC, cs->s + n, __LOCALE_PTR);
260 if (!cs->s[n] || (lookup(WC __LOCALE_ARG ) != *p)) {
265 n += nx; /* Only gets here if cs->s[n] != 0, so nx is set. */
266 #endif /* WANT_WIDE */
269 } else if (r == 0) { /* illegal, undefined, or part of a range */
270 if ((CUR_COLLATE->range_count)
271 #ifdef __UCLIBC_MJN3_ONLY__
272 #warning .. need to introduce range as a collating item?
274 && (((__uwchar_t)(WC - CUR_COLLATE->range_low)) <= CUR_COLLATE->range_count)
275 ) { /* part of a range */
276 /* Note: cs->colitem = 0 already. */
277 TRACE((" found range\n"));
278 ru = CUR_COLLATE->ruletable[CUR_COLLATE->range_rule_offset*CUR_COLLATE->MAX_WEIGHTS + pass];
279 assert((ru & WEIGHT_MASK) != DITTO_IDX);
280 if ((ru & WEIGHT_MASK) == WEIGHT_MASK) {
281 ru = (ru & RULE_MASK) | RANGE_IDX;
282 cs->weight = CUR_COLLATE->range_base_weight + (WC - CUR_COLLATE->range_low);
285 } else if (((__uwchar_t)(WC)) <= 0x7fffffffUL) { /* legal but undefined */
287 /* Note: cs->colitem = 0 already. */
288 ri = CUR_COLLATE->undefined_idx;
289 assert(ri != 0); /* implicit undefined isn't supported */
291 TRACE((" found explicit UNDEFINED\n"));
292 #ifdef __UCLIBC_MJN3_ONLY__
293 #warning right now single weight locales do not support ..
295 if (CUR_COLLATE->num_weights == 1) {
296 TRACE((" single weight UNDEFINED\n"));
297 cs->weightidx = RANGE_IDX;
303 ri = CUR_COLLATE->index2ruleidx[ri - 1];
304 ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];
305 assert((ru & WEIGHT_MASK) != WEIGHT_MASK); /* TODO: handle ".." */
306 if ((ru & WEIGHT_MASK) == DITTO_IDX) {
307 cs->colitem = CUR_COLLATE->undefined_idx;
310 } else { /* illegal */
311 TRACE((" found illegal\n"));
313 /* We put all illegals in the same equiv class with maximal weight,
314 * and ignore them after the first pass. */
319 ru = (RULE_FORWARD | RANGE_IDX);
320 cs->weight = 0xffffU;
323 } else if (CUR_COLLATE->num_weights == 1) {
324 TRACE((" single weight\n"));
325 cs->weightidx = RANGE_IDX;
326 cs->weight = cs->colitem;
330 TRACE((" normal\n"));
333 /* if we get here, it is a normal char either singlely weighted, undefined, or in a range */
335 ri = CUR_COLLATE->index2ruleidx[cs->colitem - 1];
336 TRACE((" ri=%d ", ri));
337 #ifdef __UCLIBC_MJN3_ONLY__
338 #warning make sure this is correct
341 TRACE(("NOT IN THIS LOCALE\n"));
344 ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];
348 #ifdef __UCLIBC_MJN3_ONLY__
349 #warning ignoreables probably should not interrupt backwards processing, but this is wrong
351 /* if (!(ru & WEIGHT_MASK)) { */
352 /* TRACE(("IGNORE\n")); */
358 TRACE((" rule = %#x weight = %#x popping = %d s = %p eob = %p\n",
359 ru & RULE_MASK, ru & WEIGHT_MASK, popping_backup_stack,
361 /* now we need to check if we're going backwards... */
363 if (!popping_backup_stack) {
364 if (!(ru & RULE_MASK)) { /* backward */
365 TRACE(("backwards\n"));
366 assert(cs->bp <= cs->bbe);
367 if (cs->bp == cs->bbe) {
368 if (cs->back_buf == cs->ibb) { /* was using internal buffer */
369 cs->bp = malloc(cs->bb_size + 128);
372 #ifdef __UCLIBC_MJN3_ONLY__
373 #warning what to do here?
378 memcpy(cs->bp, cs->back_buf, cs->bb_size);
381 cs->bp = realloc(cs->back_buf, cs->bb_size + 128);
384 #ifdef __UCLIBC_MJN3_ONLY__
385 #warning what to do here?
392 cs->bbe = cs->bp + (cs->bbe - cs->back_buf);
393 cs->back_buf = cs->bp;
397 if (n==1) { /* single char */
398 if (*cs->bp && (((unsigned char)(*cs->bp)) < CHAR_MAX)) {
399 *cs->bp += 1; /* increment last single's count */
400 } else { /* last was a multi, or just starting */
402 cs->bp = cs->back_buf;
404 assert(cs->bp < cs->bbe);
409 } else { /* multichar */
411 assert(cs->bp < cs->bbe);
418 /* end-of-string so start popping */
420 TRACE(("popping\n"));
422 } else if (*cs->bp) { /* was going backward but this element isn't */
423 /* discard current and use previous backward element */
426 TRACE(("popping\n"));
428 } else { /* was and still going forward */
429 TRACE(("forwards\n"));
430 if ((ru & (RULE_POSITION|WEIGHT_MASK)) > RULE_POSITION) {
431 assert(ru & WEIGHT_MASK);
433 cs->weight = cs->position;
434 #ifdef __UCLIBC_MJN3_ONLY__
437 cs->position = 0; /* reset to reduce size for strcoll? */
439 cs->weightidx = RANGE_IDX;
443 } else { /* popping backwards stack */
444 TRACE(("popping (continued)\n"));
453 cs->weightidx = ru & WEIGHT_MASK;
454 cs->rule = ru & RULE_MASK;
457 #ifdef __UCLIBC_MJN3_ONLY__
458 #warning for pending we only want the weight... _not_ the rule
460 if (!cs->weightidx) { /* ignore */
465 assert(cs->weightidx);
468 if (((unsigned int)(cs->weightidx - UI_IDX)) <= (INVAL_IDX-UI_IDX)) {
469 if (cs->weightidx == UI_IDX) {
470 cs->weight = cs->ui_weight;
475 assert(cs->weightidx != WEIGHT_MASK);
476 if (cs->weightidx == DITTO_IDX) { /* want the weight of the current collating item */
477 TRACE(("doing ditto\n"));
478 w = CUR_COLLATE->index2weight[cs->colitem -1];
479 } else if (cs->weightidx <= CUR_COLLATE->max_col_index) { /* normal */
480 TRACE(("doing normal\n"));
481 w = CUR_COLLATE->index2weight[cs->weightidx -1];
482 } else { /* a string */
483 TRACE(("doing string\n"));
484 assert(!(cs->weightidx & RULE_MASK));
485 /* note: iso14561 allows null string here */
486 p = CUR_COLLATE->weightstr + (cs->weightidx - (CUR_COLLATE->max_col_index + 2));
487 if (*p & WEIGHT_MASK) {
490 assert(r < MAX_PENDING);
491 cs->ci_pending[r++] = *p++;
492 } while (*p & WEIGHT_MASK);
493 cs->cip = cs->ci_pending;
503 libc_hidden_proto(__XL_NPP(wcscoll))
504 int __XL_NPP(wcscoll) (const Wchar *s0, const Wchar *s1 __LOCALE_PARAM )
509 if (!CUR_COLLATE->num_weights) { /* C locale */
511 return wcscmp(s0, s1);
513 return strcmp(s0, s1);
518 do { /* loop through the weights levels */
519 init_col_state(ws, s0);
520 init_col_state(ws+1, s1);
521 do { /* loop through the strings */
522 /* for each string, get the next weight */
523 next_weight(ws, pass __LOCALE_ARG );
524 next_weight(ws+1, pass __LOCALE_ARG );
525 TRACE(("w0=%lu w1=%lu\n",
526 (unsigned long) ws[0].weight,
527 (unsigned long) ws[1].weight));
529 if (ws[0].weight != ws[1].weight) {
530 return ws[0].weight - ws[1].weight;
532 } while (ws[0].weight);
533 } while (++pass < CUR_COLLATE->num_weights);
537 libc_hidden_def(__XL_NPP(wcscoll))
541 extern size_t __wcslcpy(wchar_t *__restrict dst,
542 const wchar_t *__restrict src, size_t n);
544 libc_hidden_proto(__XL_NPP(wcsxfrm))
545 size_t __XL_NPP(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2,
546 size_t n __LOCALE_PARAM )
552 if (!CUR_COLLATE->num_weights) { /* C locale */
553 return __wcslcpy(ws1, ws2, n);
556 #ifdef __UCLIBC_MJN3_ONLY__
557 #warning handle empty string as a special case
561 do { /* loop through the weights levels */
562 init_col_state(&cs, ws2);
563 do { /* loop through the string */
564 next_weight(&cs, pass __LOCALE_ARG );
565 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));
567 ws1[count] = cs.weight +1;
570 TRACE(("--------------------------------------------\n"));
572 if (count <= n) { /* overwrite the trailing 0 end-of-pass marker */
575 TRACE(("-------------------- pass %d --------------------\n", pass));
576 } while (++pass < CUR_COLLATE->num_weights);
577 if (count <= n) { /* oops... change it back */
582 libc_hidden_def(__XL_NPP(wcsxfrm))
584 #else /* WANT_WIDE */
586 static const unsigned long bound[] = {
594 static unsigned char first[] = {
595 0x0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
598 /* Use an extension of UTF-8 to store a 32 bit val in max 6 bytes. */
600 static size_t store(unsigned char *s, size_t count, size_t n, __uwchar_t weight)
606 if (weight < bound[i]) {
609 } while (++i < sizeof(bound)/sizeof(bound[0]));
616 s[i] = 0x80 | (weight & 0x3f);
626 libc_hidden_proto(__XL_NPP(strxfrm))
627 size_t __XL_NPP(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_t n
634 if (!CUR_COLLATE->num_weights) { /* C locale */
635 return strlcpy(ws1, ws2, n);
638 #ifdef __UCLIBC_MJN3_ONLY__
639 #warning handle empty string as a special case
642 inc = count = pass = 0;
643 do { /* loop through the weights levels */
644 init_col_state(&cs, ws2);
645 do { /* loop through the string */
646 next_weight(&cs, pass __LOCALE_ARG );
647 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));
648 inc = store((unsigned char *)ws1, count, n, cs.weight + 1);
650 TRACE(("--------------------------------------------\n"));
652 /* overwrite the trailing 0 end-of-pass marker */
657 TRACE(("-------------------- pass %d --------------------\n", pass));
658 } while (++pass < CUR_COLLATE->num_weights);
659 if (count <= n) { /* oops... change it back */
664 libc_hidden_def(__XL_NPP(strxfrm))
666 #endif /* WANT_WIDE */
668 #endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
670 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) */
672 #endif /* __UCLIBC_HAS_LOCALE__ */
673 /**********************************************************************/