-: 0:Source:ure.c -: 0:Graph:/var/tsitkova/Sources/v10/trunk/src/lib/krb5/unicode/ure.so.gcno -: 0:Data:/var/tsitkova/Sources/v10/trunk/src/lib/krb5/unicode/ure.so.gcda -: 0:Runs:1602 -: 0:Programs:1 -: 1:/* -: 2: * Copyright 1998-2008 The OpenLDAP Foundation. -: 3: * All rights reserved. -: 4: * -: 5: * Redistribution and use in source and binary forms, with or without -: 6: * modification, are permitted only as authorized by the OpenLDAP -: 7: * Public License. -: 8: * -: 9: * A copy of this license is available in file LICENSE in the -: 10: * top-level directory of the distribution or, alternatively, at -: 11: * . -: 12: */ -: 13:/* Copyright 1997, 1998, 1999 Computing Research Labs, -: 14: * New Mexico State University -: 15: * -: 16: * Permission is hereby granted, free of charge, to any person obtaining a -: 17: * copy of this software and associated documentation files (the "Software"), -: 18: * to deal in the Software without restriction, including without limitation -: 19: * the rights to use, copy, modify, merge, publish, distribute, sublicense, -: 20: * and/or sell copies of the Software, and to permit persons to whom the -: 21: * Software is furnished to do so, subject to the following conditions: -: 22: * -: 23: * The above copyright notice and this permission notice shall be included in -: 24: * all copies or substantial portions of the Software. -: 25: * -: 26: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -: 27: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -: 28: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -: 29: * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY -: 30: * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT -: 31: * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -: 32: * THE USE OR OTHER DEALINGS IN THE SOFTWARE. -: 33: */ -: 34: -: 35:/* -: 36: * This work is part of OpenLDAP Software . -: 37: * $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.19 2008/01/07 23:20:05 kurt Exp $ -: 38: * $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" -: 39: */ -: 40: -: 41:#include -: 42: -: 43:#include -: 44:#include -: 45:#ifndef _WIN32 -: 46:#include -: 47:#endif -: 48: -: 49:#include "ure.h" -: 50: -: 51:/* -: 52: * Flags used internally in the DFA. -: 53: */ -: 54:#define _URE_DFA_CASEFOLD 0x01 -: 55:#define _URE_DFA_BLANKLINE 0x02 -: 56: -: 57:static unsigned long cclass_flags[] = { -: 58: 0, -: 59: _URE_NONSPACING, -: 60: _URE_COMBINING, -: 61: _URE_NUMDIGIT, -: 62: _URE_NUMOTHER, -: 63: _URE_SPACESEP, -: 64: _URE_LINESEP, -: 65: _URE_PARASEP, -: 66: _URE_CNTRL, -: 67: _URE_PUA, -: 68: _URE_UPPER, -: 69: _URE_LOWER, -: 70: _URE_TITLE, -: 71: _URE_MODIFIER, -: 72: _URE_OTHERLETTER, -: 73: _URE_DASHPUNCT, -: 74: _URE_OPENPUNCT, -: 75: _URE_CLOSEPUNCT, -: 76: _URE_OTHERPUNCT, -: 77: _URE_MATHSYM, -: 78: _URE_CURRENCYSYM, -: 79: _URE_OTHERSYM, -: 80: _URE_LTR, -: 81: _URE_RTL, -: 82: _URE_EURONUM, -: 83: _URE_EURONUMSEP, -: 84: _URE_EURONUMTERM, -: 85: _URE_ARABNUM, -: 86: _URE_COMMONSEP, -: 87: _URE_BLOCKSEP, -: 88: _URE_SEGMENTSEP, -: 89: _URE_WHITESPACE, -: 90: _URE_OTHERNEUT, -: 91:}; -: 92: -: 93:/* -: 94: * Symbol types for the DFA. -: 95: */ -: 96:#define _URE_ANY_CHAR 1 -: 97:#define _URE_CHAR 2 -: 98:#define _URE_CCLASS 3 -: 99:#define _URE_NCCLASS 4 -: 100:#define _URE_BOL_ANCHOR 5 -: 101:#define _URE_EOL_ANCHOR 6 -: 102: -: 103:/* -: 104: * Op codes for converting the NFA to a DFA. -: 105: */ -: 106:#define _URE_SYMBOL 10 -: 107:#define _URE_PAREN 11 -: 108:#define _URE_QUEST 12 -: 109:#define _URE_STAR 13 -: 110:#define _URE_PLUS 14 -: 111:#define _URE_ONE 15 -: 112:#define _URE_AND 16 -: 113:#define _URE_OR 17 -: 114: -: 115:#define _URE_NOOP 0xffff -: 116: -: 117:#define _URE_REGSTART 0x8000 -: 118:#define _URE_REGEND 0x4000 -: 119: -: 120:/* -: 121: * Structure used to handle a compacted range of characters. -: 122: */ -: 123:typedef struct { -: 124: ucs4_t min_code; -: 125: ucs4_t max_code; -: 126:} _ure_range_t; -: 127: -: 128:typedef struct { -: 129: _ure_range_t *ranges; -: 130: ucs2_t ranges_used; -: 131: ucs2_t ranges_size; -: 132:} _ure_ccl_t; -: 133: -: 134:typedef union { -: 135: ucs4_t chr; -: 136: _ure_ccl_t ccl; -: 137:} _ure_sym_t; -: 138: -: 139:/* -: 140: * This is a general element structure used for expressions and stack -: 141: * elements. -: 142: */ -: 143:typedef struct { -: 144: ucs2_t reg; -: 145: ucs2_t onstack; -: 146: ucs2_t type; -: 147: ucs2_t lhs; -: 148: ucs2_t rhs; -: 149:} _ure_elt_t; -: 150: -: 151:/* -: 152: * This is a structure used to track a list or a stack of states. -: 153: */ -: 154:typedef struct { -: 155: ucs2_t *slist; -: 156: ucs2_t slist_size; -: 157: ucs2_t slist_used; -: 158:} _ure_stlist_t; -: 159: -: 160:/* -: 161: * Structure to track the list of unique states for a symbol -: 162: * during reduction. -: 163: */ -: 164:typedef struct { -: 165: ucs2_t id; -: 166: ucs2_t type; -: 167: unsigned long mods; -: 168: unsigned long props; -: 169: _ure_sym_t sym; -: 170: _ure_stlist_t states; -: 171:} _ure_symtab_t; -: 172: -: 173:/* -: 174: * Structure to hold a single state. -: 175: */ -: 176:typedef struct { -: 177: ucs2_t id; -: 178: ucs2_t accepting; -: 179: ucs2_t pad; -: 180: _ure_stlist_t st; -: 181: _ure_elt_t *trans; -: 182: ucs2_t trans_size; -: 183: ucs2_t trans_used; -: 184:} _ure_state_t; -: 185: -: 186:/* -: 187: * Structure used for keeping lists of states. -: 188: */ -: 189:typedef struct { -: 190: _ure_state_t *states; -: 191: ucs2_t states_size; -: 192: ucs2_t states_used; -: 193:} _ure_statetable_t; -: 194: -: 195:/* -: 196: * Structure to track pairs of DFA states when equivalent states are -: 197: * merged. -: 198: */ -: 199:typedef struct { -: 200: ucs2_t l; -: 201: ucs2_t r; -: 202:} _ure_equiv_t; -: 203: -: 204:/* -: 205: * Structure used for constructing the NFA and reducing to a minimal DFA. -: 206: */ -: 207:typedef struct _ure_buffer_t { -: 208: int reducing; -: 209: int error; -: 210: unsigned long flags; -: 211: -: 212: _ure_stlist_t stack; -: 213: -: 214: /* -: 215: * Table of unique symbols encountered. -: 216: */ -: 217: _ure_symtab_t *symtab; -: 218: ucs2_t symtab_size; -: 219: ucs2_t symtab_used; -: 220: -: 221: /* -: 222: * Tracks the unique expressions generated for the NFA and when the NFA is -: 223: * reduced. -: 224: */ -: 225: _ure_elt_t *expr; -: 226: ucs2_t expr_used; -: 227: ucs2_t expr_size; -: 228: -: 229: /* -: 230: * The reduced table of unique groups of NFA states. -: 231: */ -: 232: _ure_statetable_t states; -: 233: -: 234: /* -: 235: * Tracks states when equivalent states are merged. -: 236: */ -: 237: _ure_equiv_t *equiv; -: 238: ucs2_t equiv_used; -: 239: ucs2_t equiv_size; -: 240:} _ure_buffer_t; -: 241: -: 242:typedef struct { -: 243: ucs2_t symbol; -: 244: ucs2_t next_state; -: 245:} _ure_trans_t; -: 246: -: 247:typedef struct { -: 248: ucs2_t accepting; -: 249: ucs2_t ntrans; -: 250: _ure_trans_t *trans; -: 251:} _ure_dstate_t; -: 252: -: 253:typedef struct _ure_dfa_t { -: 254: unsigned long flags; -: 255: -: 256: _ure_symtab_t *syms; -: 257: ucs2_t nsyms; -: 258: -: 259: _ure_dstate_t *states; -: 260: ucs2_t nstates; -: 261: -: 262: _ure_trans_t *trans; -: 263: ucs2_t ntrans; -: 264:} _ure_dfa_t; -: 265: -: 266:/************************************************************************* -: 267: * -: 268: * Functions. -: 269: * -: 270: *************************************************************************/ -: 271: -: 272:static void #####: 273:_ure_memmove(char *dest, char *src, unsigned long bytes) -: 274:{ -: 275: long i, j; -: 276: #####: 277: i = (long) bytes; #####: 278: j = i & 7; #####: 279: i = (i + 7) >> 3; -: 280: -: 281: /* -: 282: * Do a memmove using Ye Olde Duff's Device for efficiency. -: 283: */ #####: 284: if (src < dest) { #####: 285: src += bytes; #####: 286: dest += bytes; -: 287: #####: 288: switch (j) { -: 289: case 0: do { #####: 290: *--dest = *--src; #####: 291: case 7: *--dest = *--src; #####: 292: case 6: *--dest = *--src; #####: 293: case 5: *--dest = *--src; #####: 294: case 4: *--dest = *--src; #####: 295: case 3: *--dest = *--src; #####: 296: case 2: *--dest = *--src; #####: 297: case 1: *--dest = *--src; #####: 298: } while (--i > 0); -: 299: } #####: 300: } else if (src > dest) { #####: 301: switch (j) { -: 302: case 0: do { #####: 303: *dest++ = *src++; #####: 304: case 7: *dest++ = *src++; #####: 305: case 6: *dest++ = *src++; #####: 306: case 5: *dest++ = *src++; #####: 307: case 4: *dest++ = *src++; #####: 308: case 3: *dest++ = *src++; #####: 309: case 2: *dest++ = *src++; #####: 310: case 1: *dest++ = *src++; #####: 311: } while (--i > 0); -: 312: } -: 313: } #####: 314:} -: 315: -: 316:static void #####: 317:_ure_push(ucs2_t v, _ure_buffer_t *b) -: 318:{ -: 319: _ure_stlist_t *s; -: 320: #####: 321: if (b == 0) #####: 322: return; -: 323: -: 324: /* -: 325: * If the `reducing' parameter is non-zero, check to see if the value -: 326: * passed is already on the stack. -: 327: */ #####: 328: if (b->reducing != 0 && b->expr[v].onstack != 0) #####: 329: return; -: 330: #####: 331: s = &b->stack; #####: 332: if (s->slist_used == s->slist_size) { #####: 333: if (s->slist_size == 0) #####: 334: s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); -: 335: else #####: 336: s->slist = (ucs2_t *) realloc((char *) s->slist, -: 337: sizeof(ucs2_t) * (s->slist_size + 8)); #####: 338: s->slist_size += 8; -: 339: } #####: 340: s->slist[s->slist_used++] = v; -: 341: -: 342: /* -: 343: * If the `reducing' parameter is non-zero, flag the element as being on -: 344: * the stack. -: 345: */ #####: 346: if (b->reducing != 0) #####: 347: b->expr[v].onstack = 1; -: 348:} -: 349: -: 350:static ucs2_t #####: 351:_ure_peek(_ure_buffer_t *b) -: 352:{ #####: 353: if (b == 0 || b->stack.slist_used == 0) #####: 354: return _URE_NOOP; -: 355: #####: 356: return b->stack.slist[b->stack.slist_used - 1]; -: 357:} -: 358: -: 359:static ucs2_t #####: 360:_ure_pop(_ure_buffer_t *b) -: 361:{ -: 362: ucs2_t v; -: 363: #####: 364: if (b == 0 || b->stack.slist_used == 0) #####: 365: return _URE_NOOP; -: 366: #####: 367: v = b->stack.slist[--b->stack.slist_used]; #####: 368: if (b->reducing) #####: 369: b->expr[v].onstack = 0; -: 370: #####: 371: return v; -: 372:} -: 373: -: 374:/************************************************************************* -: 375: * -: 376: * Start symbol parse functions. -: 377: * -: 378: *************************************************************************/ -: 379: -: 380:/* -: 381: * Parse a comma-separated list of integers that represent character -: 382: * properties. Combine them into a mask that is returned in the `mask' -: 383: * variable, and return the number of characters consumed. -: 384: */ -: 385:static unsigned long #####: 386:_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask, -: 387: _ure_buffer_t *b) -: 388:{ -: 389: unsigned long n, m; -: 390: ucs2_t *sp, *ep; -: 391: #####: 392: sp = pp; #####: 393: ep = sp + limit; -: 394: #####: 395: for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) { #####: 396: if (*sp == ',') { -: 397: /* -: 398: * Encountered a comma, so select the next character property flag -: 399: * and reset the number. -: 400: */ #####: 401: m |= cclass_flags[n]; #####: 402: n = 0; #####: 403: } else if (*sp >= '0' && *sp <= '9') -: 404: /* -: 405: * Encountered a digit, so start or continue building the cardinal -: 406: * that represents the character property flag. -: 407: */ #####: 408: n = (n * 10) + (*sp - '0'); -: 409: else -: 410: /* -: 411: * Encountered something that is not part of the property list. -: 412: * Indicate that we are done. -: 413: */ -: 414: break; -: 415: -: 416: /* -: 417: * If a property number greater than 32 occurs, then there is a -: 418: * problem. Most likely a missing comma separator. -: 419: */ #####: 420: if (n > 32) #####: 421: b->error = _URE_INVALID_PROPERTY; -: 422: } -: 423: #####: 424: if (n != 0) #####: 425: m |= cclass_flags[n]; -: 426: -: 427: /* -: 428: * Set the mask that represents the group of character properties. -: 429: */ #####: 430: *mask = m; -: 431: -: 432: /* -: 433: * Return the number of characters consumed. -: 434: */ #####: 435: return sp - pp; -: 436:} -: 437: -: 438:/* -: 439: * Collect a hex number with 1 to 4 digits and return the number -: 440: * of characters used. -: 441: */ -: 442:static unsigned long #####: 443:_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n) -: 444:{ -: 445: ucs2_t i; -: 446: ucs2_t *sp, *ep; -: 447: ucs4_t nn; -: 448: #####: 449: sp = np; #####: 450: ep = sp + limit; -: 451: #####: 452: for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { #####: 453: if (*sp >= '0' && *sp <= '9') #####: 454: nn = (nn << 4) + (*sp - '0'); #####: 455: else if (*sp >= 'A' && *sp <= 'F') #####: 456: nn = (nn << 4) + ((*sp - 'A') + 10); #####: 457: else if (*sp >= 'a' && *sp <= 'f') #####: 458: nn = (nn << 4) + ((*sp - 'a') + 10); -: 459: else -: 460: /* -: 461: * Encountered something that is not a hex digit. -: 462: */ -: 463: break; -: 464: } -: 465: -: 466: /* -: 467: * Assign the character code collected and return the number of -: 468: * characters used. -: 469: */ #####: 470: *n = nn; -: 471: #####: 472: return sp - np; -: 473:} -: 474: -: 475:/* -: 476: * Insert a range into a character class, removing duplicates and ordering -: 477: * them in increasing range-start order. -: 478: */ -: 479:static void #####: 480:_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b) -: 481:{ -: 482: ucs2_t i; -: 483: ucs4_t tmp; -: 484: _ure_range_t *rp; -: 485: -: 486: /* -: 487: * If the `casefold' flag is set, then make sure both endpoints of the -: 488: * range are converted to lower case. -: 489: */ #####: 490: if (b->flags & _URE_DFA_CASEFOLD) { #####: 491: r->min_code = _ure_tolower(r->min_code); #####: 492: r->max_code = _ure_tolower(r->max_code); -: 493: } -: 494: -: 495: /* -: 496: * Swap the range endpoints if they are not in increasing order. -: 497: */ #####: 498: if (r->min_code > r->max_code) { #####: 499: tmp = r->min_code; #####: 500: r->min_code = r->max_code; #####: 501: r->max_code = tmp; -: 502: } -: 503: #####: 504: for (i = 0, rp = ccl->ranges; #####: 505: i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; -: 506: -: 507: /* -: 508: * Check for a duplicate. -: 509: */ #####: 510: if (i < ccl->ranges_used && #####: 511: r->min_code == rp->min_code && r->max_code == rp->max_code) #####: 512: return; -: 513: #####: 514: if (ccl->ranges_used == ccl->ranges_size) { #####: 515: if (ccl->ranges_size == 0) #####: 516: ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); -: 517: else #####: 518: ccl->ranges = (_ure_range_t *) -: 519: realloc((char *) ccl->ranges, -: 520: sizeof(_ure_range_t) * (ccl->ranges_size + 8)); #####: 521: ccl->ranges_size += 8; -: 522: } -: 523: #####: 524: rp = ccl->ranges + ccl->ranges_used; -: 525: #####: 526: if (i < ccl->ranges_used) #####: 527: _ure_memmove((char *) (rp + 1), (char *) rp, #####: 528: sizeof(_ure_range_t) * (ccl->ranges_used - i)); -: 529: #####: 530: ccl->ranges_used++; #####: 531: rp->min_code = r->min_code; #####: 532: rp->max_code = r->max_code; -: 533:} -: 534: -: 535:#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\ -: 536:_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING) -: 537:#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT) -: 538:#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\ -: 539:_URE_OTHERPUNCT) -: 540:#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\ -: 541:_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM) -: 542:#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP) -: 543:#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP) -: 544: -: 545:typedef void (*_ure_cclsetup_t)( -: 546: _ure_symtab_t *sym, -: 547: unsigned long mask, -: 548: _ure_buffer_t *b -: 549:); -: 550: -: 551:typedef struct { -: 552: ucs2_t key; -: 553: unsigned int len : 8; -: 554: unsigned int next : 8; -: 555: _ure_cclsetup_t func; -: 556: unsigned long mask; -: 557:} _ure_trie_t; -: 558: -: 559:static void #####: 560:_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) -: 561:{ #####: 562: sym->props |= mask; #####: 563:} -: 564: -: 565:static void #####: 566:_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) -: 567:{ -: 568: _ure_range_t range; -: 569: #####: 570: sym->props |= mask; -: 571: -: 572: /* -: 573: * Add the additional characters needed for handling isspace(). -: 574: */ #####: 575: range.min_code = range.max_code = '\t'; #####: 576: _ure_add_range(&sym->sym.ccl, &range, b); #####: 577: range.min_code = range.max_code = '\r'; #####: 578: _ure_add_range(&sym->sym.ccl, &range, b); #####: 579: range.min_code = range.max_code = '\n'; #####: 580: _ure_add_range(&sym->sym.ccl, &range, b); #####: 581: range.min_code = range.max_code = '\f'; #####: 582: _ure_add_range(&sym->sym.ccl, &range, b); #####: 583: range.min_code = range.max_code = 0xfeff; #####: 584: _ure_add_range(&sym->sym.ccl, &range, b); #####: 585:} -: 586: -: 587:static void #####: 588:_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) -: 589:{ -: 590: _ure_range_t range; -: 591: -: 592: /* -: 593: * Add the additional characters needed for handling isxdigit(). -: 594: */ #####: 595: range.min_code = '0'; #####: 596: range.max_code = '9'; #####: 597: _ure_add_range(&sym->sym.ccl, &range, b); #####: 598: range.min_code = 'A'; #####: 599: range.max_code = 'F'; #####: 600: _ure_add_range(&sym->sym.ccl, &range, b); #####: 601: range.min_code = 'a'; #####: 602: range.max_code = 'f'; #####: 603: _ure_add_range(&sym->sym.ccl, &range, b); #####: 604:} -: 605: -: 606:static const _ure_trie_t cclass_trie[] = { -: 607: {0x003a, 1, 1, 0, 0}, -: 608: {0x0061, 9, 10, 0, 0}, -: 609: {0x0063, 8, 19, 0, 0}, -: 610: {0x0064, 7, 24, 0, 0}, -: 611: {0x0067, 6, 29, 0, 0}, -: 612: {0x006c, 5, 34, 0, 0}, -: 613: {0x0070, 4, 39, 0, 0}, -: 614: {0x0073, 3, 49, 0, 0}, -: 615: {0x0075, 2, 54, 0, 0}, -: 616: {0x0078, 1, 59, 0, 0}, -: 617: {0x006c, 1, 11, 0, 0}, -: 618: {0x006e, 2, 13, 0, 0}, -: 619: {0x0070, 1, 16, 0, 0}, -: 620: {0x0075, 1, 14, 0, 0}, -: 621: {0x006d, 1, 15, 0, 0}, -: 622: {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, -: 623: {0x0068, 1, 17, 0, 0}, -: 624: {0x0061, 1, 18, 0, 0}, -: 625: {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, -: 626: {0x006e, 1, 20, 0, 0}, -: 627: {0x0074, 1, 21, 0, 0}, -: 628: {0x0072, 1, 22, 0, 0}, -: 629: {0x006c, 1, 23, 0, 0}, -: 630: {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, -: 631: {0x0069, 1, 25, 0, 0}, -: 632: {0x0067, 1, 26, 0, 0}, -: 633: {0x0069, 1, 27, 0, 0}, -: 634: {0x0074, 1, 28, 0, 0}, -: 635: {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, -: 636: {0x0072, 1, 30, 0, 0}, -: 637: {0x0061, 1, 31, 0, 0}, -: 638: {0x0070, 1, 32, 0, 0}, -: 639: {0x0068, 1, 33, 0, 0}, -: 640: {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, -: 641: {0x006f, 1, 35, 0, 0}, -: 642: {0x0077, 1, 36, 0, 0}, -: 643: {0x0065, 1, 37, 0, 0}, -: 644: {0x0072, 1, 38, 0, 0}, -: 645: {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, -: 646: {0x0072, 2, 41, 0, 0}, -: 647: {0x0075, 1, 45, 0, 0}, -: 648: {0x0069, 1, 42, 0, 0}, -: 649: {0x006e, 1, 43, 0, 0}, -: 650: {0x0074, 1, 44, 0, 0}, -: 651: {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, -: 652: {0x006e, 1, 46, 0, 0}, -: 653: {0x0063, 1, 47, 0, 0}, -: 654: {0x0074, 1, 48, 0, 0}, -: 655: {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, -: 656: {0x0070, 1, 50, 0, 0}, -: 657: {0x0061, 1, 51, 0, 0}, -: 658: {0x0063, 1, 52, 0, 0}, -: 659: {0x0065, 1, 53, 0, 0}, -: 660: {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, -: 661: {0x0070, 1, 55, 0, 0}, -: 662: {0x0070, 1, 56, 0, 0}, -: 663: {0x0065, 1, 57, 0, 0}, -: 664: {0x0072, 1, 58, 0, 0}, -: 665: {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, -: 666: {0x0064, 1, 60, 0, 0}, -: 667: {0x0069, 1, 61, 0, 0}, -: 668: {0x0067, 1, 62, 0, 0}, -: 669: {0x0069, 1, 63, 0, 0}, -: 670: {0x0074, 1, 64, 0, 0}, -: 671: {0x003a, 1, 65, _ure_xdigit_setup, 0}, -: 672:}; -: 673: -: 674:/* -: 675: * Probe for one of the POSIX colon delimited character classes in the static -: 676: * trie. -: 677: */ -: 678:static unsigned long #####: 679:_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, -: 680: _ure_buffer_t *b) -: 681:{ -: 682: int i; -: 683: unsigned long n; -: 684: const _ure_trie_t *tp; -: 685: ucs2_t *sp, *ep; -: 686: -: 687: /* -: 688: * If the number of characters left is less than 7, then this cannot be -: 689: * interpreted as one of the colon delimited classes. -: 690: */ #####: 691: if (limit < 7) #####: 692: return 0; -: 693: #####: 694: sp = cp; #####: 695: ep = sp + limit; #####: 696: tp = cclass_trie; #####: 697: for (i = 0; sp < ep && i < 8; i++, sp++) { #####: 698: n = tp->len; -: 699: #####: 700: for (; n > 0 && tp->key != *sp; tp++, n--) ; -: 701: #####: 702: if (n == 0) #####: 703: return 0; -: 704: #####: 705: if (*sp == ':' && (i == 6 || i == 7)) { #####: 706: sp++; #####: 707: break; -: 708: } #####: 709: if (sp + 1 < ep) #####: 710: tp = cclass_trie + tp->next; -: 711: } #####: 712: if (tp->func == 0) #####: 713: return 0; -: 714: #####: 715: (*tp->func)(sym, tp->mask, b); -: 716: #####: 717: return sp - cp; -: 718:} -: 719: -: 720:/* -: 721: * Construct a list of ranges and return the number of characters consumed. -: 722: */ -: 723:static unsigned long #####: 724:_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, -: 725: _ure_buffer_t *b) -: 726:{ -: 727: int range_end; -: 728: unsigned long n; -: 729: ucs2_t *sp, *ep; -: 730: ucs4_t c, last; -: 731: _ure_ccl_t *cclp; -: 732: _ure_range_t range; -: 733: #####: 734: sp = cp; #####: 735: ep = sp + limit; -: 736: #####: 737: if (*sp == '^') { #####: 738: symp->type = _URE_NCCLASS; #####: 739: sp++; -: 740: } else #####: 741: symp->type = _URE_CCLASS; -: 742: #####: 743: for (last = 0, range_end = 0; #####: 744: b->error == _URE_OK && sp < ep && *sp != ']'; ) { #####: 745: c = *sp++; #####: 746: if (c == '\\') { #####: 747: if (sp == ep) { -: 748: /* -: 749: * The EOS was encountered when expecting the reverse solidus -: 750: * to be followed by the character it is escaping. Set an -: 751: * error code and return the number of characters consumed up -: 752: * to this point. -: 753: */ #####: 754: b->error = _URE_UNEXPECTED_EOS; #####: 755: return sp - cp; -: 756: } -: 757: #####: 758: c = *sp++; #####: 759: switch (c) { -: 760: case 'a': #####: 761: c = 0x07; #####: 762: break; -: 763: case 'b': #####: 764: c = 0x08; #####: 765: break; -: 766: case 'f': #####: 767: c = 0x0c; #####: 768: break; -: 769: case 'n': #####: 770: c = 0x0a; #####: 771: break; -: 772: case 'r': #####: 773: c = 0x0d; #####: 774: break; -: 775: case 't': #####: 776: c = 0x09; #####: 777: break; -: 778: case 'v': #####: 779: c = 0x0b; #####: 780: break; -: 781: case 'p': -: 782: case 'P': #####: 783: sp += _ure_prop_list(sp, ep - sp, &symp->props, b); -: 784: /* -: 785: * Invert the bit mask of the properties if this is a negated -: 786: * character class or if 'P' is used to specify a list of -: 787: * character properties that should *not* match in a -: 788: * character class. -: 789: */ #####: 790: if (c == 'P') #####: 791: symp->props = ~symp->props; #####: 792: continue; -: 793: break; -: 794: case 'x': -: 795: case 'X': -: 796: case 'u': -: 797: case 'U': #####: 798: if (sp < ep && #####: 799: ((*sp >= '0' && *sp <= '9') || #####: 800: (*sp >= 'A' && *sp <= 'F') || #####: 801: (*sp >= 'a' && *sp <= 'f'))) #####: 802: sp += _ure_hex(sp, ep - sp, &c); -: 803: } #####: 804: } else if (c == ':') { -: 805: /* -: 806: * Probe for a POSIX colon delimited character class. -: 807: */ #####: 808: sp--; #####: 809: if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) #####: 810: sp++; -: 811: else { #####: 812: sp += n; #####: 813: continue; -: 814: } -: 815: } -: 816: #####: 817: cclp = &symp->sym.ccl; -: 818: -: 819: /* -: 820: * Check to see if the current character is a low surrogate that needs -: 821: * to be combined with a preceding high surrogate. -: 822: */ #####: 823: if (last != 0) { #####: 824: if (c >= 0xdc00 && c <= 0xdfff) -: 825: /* -: 826: * Construct the UTF16 character code. -: 827: */ #####: 828: c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); -: 829: else { -: 830: /* -: 831: * Add the isolated high surrogate to the range. -: 832: */ #####: 833: if (range_end == 1) #####: 834: range.max_code = last & 0xffff; -: 835: else #####: 836: range.min_code = range.max_code = last & 0xffff; -: 837: #####: 838: _ure_add_range(cclp, &range, b); #####: 839: range_end = 0; -: 840: } -: 841: } -: 842: -: 843: /* -: 844: * Clear the last character code. -: 845: */ #####: 846: last = 0; -: 847: -: 848: /* -: 849: * This slightly awkward code handles the different cases needed to -: 850: * construct a range. -: 851: */ #####: 852: if (c >= 0xd800 && c <= 0xdbff) { -: 853: /* -: 854: * If the high surrogate is followed by a range indicator, simply -: 855: * add it as the range start. Otherwise, save it in case the next -: 856: * character is a low surrogate. -: 857: */ #####: 858: if (*sp == '-') { #####: 859: sp++; #####: 860: range.min_code = c; #####: 861: range_end = 1; -: 862: } else #####: 863: last = c; #####: 864: } else if (range_end == 1) { #####: 865: range.max_code = c; #####: 866: _ure_add_range(cclp, &range, b); #####: 867: range_end = 0; -: 868: } else { #####: 869: range.min_code = range.max_code = c; #####: 870: if (*sp == '-') { #####: 871: sp++; #####: 872: range_end = 1; -: 873: } else #####: 874: _ure_add_range(cclp, &range, b); -: 875: } -: 876: } -: 877: #####: 878: if (sp < ep && *sp == ']') #####: 879: sp++; -: 880: else -: 881: /* -: 882: * The parse was not terminated by the character class close symbol -: 883: * (']'), so set an error code. -: 884: */ #####: 885: b->error = _URE_CCLASS_OPEN; -: 886: #####: 887: return sp - cp; -: 888:} -: 889: -: 890:/* -: 891: * Probe for a low surrogate hex code. -: 892: */ -: 893:static unsigned long #####: 894:_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) -: 895:{ -: 896: ucs4_t i, code; -: 897: ucs2_t *sp, *ep; -: 898: #####: 899: for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { #####: 900: if (*sp >= '0' && *sp <= '9') #####: 901: code = (code << 4) + (*sp - '0'); #####: 902: else if (*sp >= 'A' && *sp <= 'F') #####: 903: code = (code << 4) + ((*sp - 'A') + 10); #####: 904: else if (*sp >= 'a' && *sp <= 'f') #####: 905: code = (code << 4) + ((*sp - 'a') + 10); -: 906: else -: 907: break; -: 908: } -: 909: #####: 910: *c = code; #####: 911: return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; -: 912:} -: 913: -: 914:static unsigned long #####: 915:_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, -: 916: _ure_buffer_t *b) -: 917:{ -: 918: ucs4_t c; -: 919: ucs2_t *sp, *ep; -: 920: #####: 921: sp = sym; #####: 922: ep = sym + limit; -: 923: #####: 924: if ((c = *sp++) == '\\') { -: 925: #####: 926: if (sp == ep) { -: 927: /* -: 928: * The EOS was encountered when expecting the reverse solidus to -: 929: * be followed by the character it is escaping. Set an error code -: 930: * and return the number of characters consumed up to this point. -: 931: */ #####: 932: b->error = _URE_UNEXPECTED_EOS; #####: 933: return sp - sym; -: 934: } -: 935: #####: 936: c = *sp++; #####: 937: switch (c) { -: 938: case 'p': -: 939: case 'P': #####: 940: symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; #####: 941: sp += _ure_prop_list(sp, ep - sp, &symp->props, b); #####: 942: break; -: 943: case 'a': #####: 944: symp->type = _URE_CHAR; #####: 945: symp->sym.chr = 0x07; #####: 946: break; -: 947: case 'b': #####: 948: symp->type = _URE_CHAR; #####: 949: symp->sym.chr = 0x08; #####: 950: break; -: 951: case 'f': #####: 952: symp->type = _URE_CHAR; #####: 953: symp->sym.chr = 0x0c; #####: 954: break; -: 955: case 'n': #####: 956: symp->type = _URE_CHAR; #####: 957: symp->sym.chr = 0x0a; #####: 958: break; -: 959: case 'r': #####: 960: symp->type = _URE_CHAR; #####: 961: symp->sym.chr = 0x0d; #####: 962: break; -: 963: case 't': #####: 964: symp->type = _URE_CHAR; #####: 965: symp->sym.chr = 0x09; #####: 966: break; -: 967: case 'v': #####: 968: symp->type = _URE_CHAR; #####: 969: symp->sym.chr = 0x0b; #####: 970: break; -: 971: case 'x': -: 972: case 'X': -: 973: case 'u': -: 974: case 'U': -: 975: /* -: 976: * Collect between 1 and 4 digits representing a UCS2 code. Fall -: 977: * through to the next case. -: 978: */ #####: 979: if (sp < ep && #####: 980: ((*sp >= '0' && *sp <= '9') || #####: 981: (*sp >= 'A' && *sp <= 'F') || #####: 982: (*sp >= 'a' && *sp <= 'f'))) #####: 983: sp += _ure_hex(sp, ep - sp, &c); -: 984: /* FALLTHROUGH */ -: 985: default: -: 986: /* -: 987: * Simply add an escaped character here. -: 988: */ #####: 989: symp->type = _URE_CHAR; #####: 990: symp->sym.chr = c; -: 991: } #####: 992: } else if (c == '^' || c == '$') -: 993: /* -: 994: * Handle the BOL and EOL anchors. This actually consists simply of -: 995: * setting a flag that indicates that the user supplied anchor match -: 996: * function should be called. This needs to be done instead of simply -: 997: * matching line/paragraph separators because beginning-of-text and -: 998: * end-of-text tests are needed as well. -: 999: */ #####: 1000: symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; #####: 1001: else if (c == '[') -: 1002: /* -: 1003: * Construct a character class. -: 1004: */ #####: 1005: sp += _ure_cclass(sp, ep - sp, symp, b); #####: 1006: else if (c == '.') #####: 1007: symp->type = _URE_ANY_CHAR; -: 1008: else { #####: 1009: symp->type = _URE_CHAR; #####: 1010: symp->sym.chr = c; -: 1011: } -: 1012: -: 1013: /* -: 1014: * If the symbol type happens to be a character and is a high surrogate, -: 1015: * then probe forward to see if it is followed by a low surrogate that -: 1016: * needs to be added. -: 1017: */ #####: 1018: if (sp < ep && symp->type == _URE_CHAR && #####: 1019: 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { -: 1020: #####: 1021: if (0xdc00 <= *sp && *sp <= 0xdfff) { #####: 1022: symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | #####: 1023: (*sp & 0x03ff)); #####: 1024: sp++; #####: 1025: } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || #####: 1026: *(sp + 1) == 'u' || *(sp + 1) == 'U')) { #####: 1027: sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); #####: 1028: if (0xdc00 <= c && c <= 0xdfff) { -: 1029: /* -: 1030: * Take into account the \[xu] in front of the hex code. -: 1031: */ #####: 1032: sp += 2; #####: 1033: symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | #####: 1034: (c & 0x03ff)); -: 1035: } -: 1036: } -: 1037: } -: 1038: -: 1039: /* -: 1040: * Last, make sure any _URE_CHAR type symbols are changed to lower case if -: 1041: * the `casefold' flag is set. -: 1042: */ #####: 1043: if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) #####: 1044: symp->sym.chr = _ure_tolower(symp->sym.chr); -: 1045: -: 1046: /* -: 1047: * If the symbol constructed is anything other than one of the anchors, -: 1048: * make sure the _URE_DFA_BLANKLINE flag is removed. -: 1049: */ #####: 1050: if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) #####: 1051: b->flags &= ~_URE_DFA_BLANKLINE; -: 1052: -: 1053: /* -: 1054: * Return the number of characters consumed. -: 1055: */ #####: 1056: return sp - sym; -: 1057:} -: 1058: -: 1059:static int #####: 1060:_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) -: 1061:{ #####: 1062: if (a->type != b->type || a->mods != b->mods || a->props != b->props) #####: 1063: return 1; -: 1064: #####: 1065: if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { #####: 1066: if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) #####: 1067: return 1; #####: 1068: if (a->sym.ccl.ranges_used > 0 && -: 1069: memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, #####: 1070: sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) #####: 1071: return 1; #####: 1072: } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) #####: 1073: return 1; #####: 1074: return 0; -: 1075:} -: 1076: -: 1077:/* -: 1078: * Construct a symbol, but only keep unique symbols. -: 1079: */ -: 1080:static ucs2_t #####: 1081:_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, -: 1082: _ure_buffer_t *b) -: 1083:{ -: 1084: ucs2_t i; -: 1085: _ure_symtab_t *sp, symbol; -: 1086: -: 1087: /* -: 1088: * Build the next symbol so we can test to see if it is already in the -: 1089: * symbol table. -: 1090: */ #####: 1091: (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); #####: 1092: *consumed = _ure_compile_symbol(sym, limit, &symbol, b); -: 1093: -: 1094: /* -: 1095: * Check to see if the symbol exists. -: 1096: */ #####: 1097: for (i = 0, sp = b->symtab; #####: 1098: i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; -: 1099: #####: 1100: if (i < b->symtab_used) { -: 1101: /* -: 1102: * Free up any ranges used for the symbol. -: 1103: */ #####: 1104: if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && #####: 1105: symbol.sym.ccl.ranges_size > 0) #####: 1106: free((char *) symbol.sym.ccl.ranges); -: 1107: #####: 1108: return b->symtab[i].id; -: 1109: } -: 1110: -: 1111: /* -: 1112: * Need to add the new symbol. -: 1113: */ #####: 1114: if (b->symtab_used == b->symtab_size) { #####: 1115: if (b->symtab_size == 0) #####: 1116: b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); -: 1117: else #####: 1118: b->symtab = (_ure_symtab_t *) -: 1119: realloc((char *) b->symtab, -: 1120: sizeof(_ure_symtab_t) * (b->symtab_size + 8)); #####: 1121: sp = b->symtab + b->symtab_size; #####: 1122: (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); #####: 1123: b->symtab_size += 8; -: 1124: } -: 1125: #####: 1126: symbol.id = b->symtab_used++; #####: 1127: (void) memcpy((char *) &b->symtab[symbol.id], (char *) &symbol, -: 1128: sizeof(_ure_symtab_t)); -: 1129: #####: 1130: return symbol.id; -: 1131:} -: 1132: -: 1133:/************************************************************************* -: 1134: * -: 1135: * End symbol parse functions. -: 1136: * -: 1137: *************************************************************************/ -: 1138: -: 1139:static ucs2_t #####: 1140:_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) -: 1141:{ -: 1142: ucs2_t i; -: 1143: #####: 1144: if (b == 0) #####: 1145: return _URE_NOOP; -: 1146: -: 1147: /* -: 1148: * Determine if the expression already exists or not. -: 1149: */ #####: 1150: for (i = 0; i < b->expr_used; i++) { #####: 1151: if (b->expr[i].type == type && b->expr[i].lhs == lhs && #####: 1152: b->expr[i].rhs == rhs) #####: 1153: break; -: 1154: } #####: 1155: if (i < b->expr_used) #####: 1156: return i; -: 1157: -: 1158: /* -: 1159: * Need to add a new expression. -: 1160: */ #####: 1161: if (b->expr_used == b->expr_size) { #####: 1162: if (b->expr_size == 0) #####: 1163: b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); -: 1164: else #####: 1165: b->expr = (_ure_elt_t *) -: 1166: realloc((char *) b->expr, -: 1167: sizeof(_ure_elt_t) * (b->expr_size + 8)); #####: 1168: b->expr_size += 8; -: 1169: } -: 1170: #####: 1171: b->expr[b->expr_used].onstack = 0; #####: 1172: b->expr[b->expr_used].type = type; #####: 1173: b->expr[b->expr_used].lhs = lhs; #####: 1174: b->expr[b->expr_used].rhs = rhs; -: 1175: #####: 1176: return b->expr_used++; -: 1177:} -: 1178: -: 1179:static unsigned char spmap[] = { -: 1180: 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, -: 1181: 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -: 1182: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -: 1183:}; -: 1184: -: 1185:#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ -: 1186: (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) -: 1187: -: 1188:/* -: 1189: * Convert the regular expression into an NFA in a form that will be easy to -: 1190: * reduce to a DFA. The starting state for the reduction will be returned. -: 1191: */ -: 1192:static ucs2_t #####: 1193:_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) -: 1194:{ -: 1195: ucs2_t c, state, top, sym, *sp, *ep; -: 1196: unsigned long used; -: 1197: #####: 1198: state = _URE_NOOP; -: 1199: #####: 1200: sp = re; #####: 1201: ep = sp + relen; #####: 1202: while (b->error == _URE_OK && sp < ep) { #####: 1203: c = *sp++; #####: 1204: switch (c) { -: 1205: case '(': #####: 1206: _ure_push(_URE_PAREN, b); #####: 1207: break; -: 1208: case ')': -: 1209: /* -: 1210: * Check for the case of too many close parentheses. -: 1211: */ #####: 1212: if (_ure_peek(b) == _URE_NOOP) { #####: 1213: b->error = _URE_UNBALANCED_GROUP; #####: 1214: break; -: 1215: } -: 1216: #####: 1217: while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) -: 1218: /* -: 1219: * Make an expression with the AND or OR operator and its right -: 1220: * hand side. -: 1221: */ #####: 1222: state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); -: 1223: -: 1224: /* -: 1225: * Remove the _URE_PAREN off the stack. -: 1226: */ #####: 1227: (void) _ure_pop(b); #####: 1228: break; -: 1229: case '*': #####: 1230: state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); #####: 1231: break; -: 1232: case '+': #####: 1233: state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); #####: 1234: break; -: 1235: case '?': #####: 1236: state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); #####: 1237: break; -: 1238: case '|': #####: 1239: while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) -: 1240: /* -: 1241: * Make an expression with the AND or OR operator and its right -: 1242: * hand side. -: 1243: */ #####: 1244: state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); -: 1245: #####: 1246: _ure_push(state, b); #####: 1247: _ure_push(_URE_OR, b); #####: 1248: break; -: 1249: default: #####: 1250: sp--; #####: 1251: sym = _ure_make_symbol(sp, ep - sp, &used, b); #####: 1252: sp += used; #####: 1253: state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); -: 1254: break; -: 1255: } -: 1256: #####: 1257: if (c != '(' && c != '|' && sp < ep && #####: 1258: (!_ure_isspecial(*sp) || *sp == '(')) { #####: 1259: _ure_push(state, b); #####: 1260: _ure_push(_URE_AND, b); -: 1261: } -: 1262: } #####: 1263: while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) -: 1264: /* -: 1265: * Make an expression with the AND or OR operator and its right -: 1266: * hand side. -: 1267: */ #####: 1268: state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); -: 1269: #####: 1270: if (b->stack.slist_used > 0) #####: 1271: b->error = _URE_UNBALANCED_GROUP; -: 1272: #####: 1273: return (b->error == _URE_OK) ? state : _URE_NOOP; -: 1274:} -: 1275: -: 1276:static void #####: 1277:_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) -: 1278:{ -: 1279: ucs2_t i, *stp; -: 1280: _ure_symtab_t *sp; -: 1281: -: 1282: /* -: 1283: * Locate the symbol in the symbol table so the state can be added. -: 1284: * If the symbol doesn't exist, then a real problem exists. -: 1285: */ #####: 1286: for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; #####: 1287: i++, sp++) ; -: 1288: -: 1289: /* -: 1290: * Now find out if the state exists in the symbol's state list. -: 1291: */ #####: 1292: for (i = 0, stp = sp->states.slist; #####: 1293: i < sp->states.slist_used && state > *stp; i++, stp++) ; -: 1294: #####: 1295: if (i == sp->states.slist_used || state < *stp) { -: 1296: /* -: 1297: * Need to add the state in order. -: 1298: */ #####: 1299: if (sp->states.slist_used == sp->states.slist_size) { #####: 1300: if (sp->states.slist_size == 0) #####: 1301: sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); -: 1302: else #####: 1303: sp->states.slist = (ucs2_t *) -: 1304: realloc((char *) sp->states.slist, -: 1305: sizeof(ucs2_t) * (sp->states.slist_size + 8)); #####: 1306: sp->states.slist_size += 8; -: 1307: } #####: 1308: if (i < sp->states.slist_used) #####: 1309: (void) _ure_memmove((char *) (sp->states.slist + i + 1), #####: 1310: (char *) (sp->states.slist + i), #####: 1311: sizeof(ucs2_t) * (sp->states.slist_used - i)); #####: 1312: sp->states.slist[i] = state; #####: 1313: sp->states.slist_used++; -: 1314: } #####: 1315:} -: 1316: -: 1317:static ucs2_t #####: 1318:_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) -: 1319:{ -: 1320: ucs2_t i; -: 1321: _ure_state_t *sp; -: 1322: #####: 1323: for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { #####: 1324: if (sp->st.slist_used == nstates && -: 1325: memcmp((char *) states, (char *) sp->st.slist, #####: 1326: sizeof(ucs2_t) * nstates) == 0) #####: 1327: break; -: 1328: } -: 1329: #####: 1330: if (i == b->states.states_used) { -: 1331: /* -: 1332: * Need to add a new DFA state (set of NFA states). -: 1333: */ #####: 1334: if (b->states.states_used == b->states.states_size) { #####: 1335: if (b->states.states_size == 0) #####: 1336: b->states.states = (_ure_state_t *) -: 1337: malloc(sizeof(_ure_state_t) << 3); -: 1338: else #####: 1339: b->states.states = (_ure_state_t *) -: 1340: realloc((char *) b->states.states, -: 1341: sizeof(_ure_state_t) * (b->states.states_size + 8)); #####: 1342: sp = b->states.states + b->states.states_size; #####: 1343: (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3); #####: 1344: b->states.states_size += 8; -: 1345: } -: 1346: #####: 1347: sp = b->states.states + b->states.states_used++; #####: 1348: sp->id = i; -: 1349: #####: 1350: if (sp->st.slist_used + nstates > sp->st.slist_size) { #####: 1351: if (sp->st.slist_size == 0) #####: 1352: sp->st.slist = (ucs2_t *) #####: 1353: malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); -: 1354: else #####: 1355: sp->st.slist = (ucs2_t *) -: 1356: realloc((char *) sp->st.slist, #####: 1357: sizeof(ucs2_t) * (sp->st.slist_used + nstates)); #####: 1358: sp->st.slist_size = sp->st.slist_used + nstates; -: 1359: } #####: 1360: sp->st.slist_used = nstates; #####: 1361: (void) memcpy((char *) sp->st.slist, (char *) states, -: 1362: sizeof(ucs2_t) * nstates); -: 1363: } -: 1364: -: 1365: /* -: 1366: * Return the ID of the DFA state representing a group of NFA states. -: 1367: */ #####: 1368: return i; -: 1369:} -: 1370: -: 1371:static void #####: 1372:_ure_reduce(ucs2_t start, _ure_buffer_t *b) -: 1373:{ -: 1374: ucs2_t i, j, state, eval, syms, rhs; -: 1375: ucs2_t s1, s2, ns1, ns2; -: 1376: _ure_state_t *sp; -: 1377: _ure_symtab_t *smp; -: 1378: #####: 1379: b->reducing = 1; -: 1380: -: 1381: /* -: 1382: * Add the starting state for the reduction. -: 1383: */ #####: 1384: _ure_add_state(1, &start, b); -: 1385: -: 1386: /* -: 1387: * Process each set of NFA states that get created. -: 1388: */ #####: 1389: for (i = 0; i < b->states.states_used; i++) { #####: 1390: sp = b->states.states + i; -: 1391: -: 1392: /* -: 1393: * Push the current states on the stack. -: 1394: */ #####: 1395: for (j = 0; j < sp->st.slist_used; j++) #####: 1396: _ure_push(sp->st.slist[j], b); -: 1397: -: 1398: /* -: 1399: * Reduce the NFA states. -: 1400: */ #####: 1401: for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { #####: 1402: state = b->stack.slist[j]; #####: 1403: eval = 1; -: 1404: -: 1405: /* -: 1406: * This inner loop is the iterative equivalent of recursively -: 1407: * reducing subexpressions generated as a result of a reduction. -: 1408: */ #####: 1409: while (eval) { #####: 1410: switch (b->expr[state].type) { -: 1411: case _URE_SYMBOL: #####: 1412: ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); #####: 1413: _ure_add_symstate(b->expr[state].lhs, ns1, b); #####: 1414: syms++; #####: 1415: eval = 0; #####: 1416: break; -: 1417: case _URE_ONE: #####: 1418: sp->accepting = 1; #####: 1419: eval = 0; #####: 1420: break; -: 1421: case _URE_QUEST: #####: 1422: s1 = b->expr[state].lhs; #####: 1423: ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); #####: 1424: state = _ure_make_expr(_URE_OR, ns1, s1, b); #####: 1425: break; -: 1426: case _URE_PLUS: #####: 1427: s1 = b->expr[state].lhs; #####: 1428: ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); #####: 1429: state = _ure_make_expr(_URE_AND, s1, ns1, b); #####: 1430: break; -: 1431: case _URE_STAR: #####: 1432: s1 = b->expr[state].lhs; #####: 1433: ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); #####: 1434: ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); #####: 1435: state = _ure_make_expr(_URE_OR, ns1, ns2, b); #####: 1436: break; -: 1437: case _URE_OR: #####: 1438: s1 = b->expr[state].lhs; #####: 1439: s2 = b->expr[state].rhs; #####: 1440: _ure_push(s1, b); #####: 1441: _ure_push(s2, b); #####: 1442: eval = 0; #####: 1443: break; -: 1444: case _URE_AND: #####: 1445: s1 = b->expr[state].lhs; #####: 1446: s2 = b->expr[state].rhs; #####: 1447: switch (b->expr[s1].type) { -: 1448: case _URE_SYMBOL: #####: 1449: _ure_add_symstate(b->expr[s1].lhs, s2, b); #####: 1450: syms++; #####: 1451: eval = 0; #####: 1452: break; -: 1453: case _URE_ONE: #####: 1454: state = s2; #####: 1455: break; -: 1456: case _URE_QUEST: #####: 1457: ns1 = b->expr[s1].lhs; #####: 1458: ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); #####: 1459: state = _ure_make_expr(_URE_OR, s2, ns2, b); #####: 1460: break; -: 1461: case _URE_PLUS: #####: 1462: ns1 = b->expr[s1].lhs; #####: 1463: ns2 = _ure_make_expr(_URE_OR, s2, state, b); #####: 1464: state = _ure_make_expr(_URE_AND, ns1, ns2, b); #####: 1465: break; -: 1466: case _URE_STAR: #####: 1467: ns1 = b->expr[s1].lhs; #####: 1468: ns2 = _ure_make_expr(_URE_AND, ns1, state, b); #####: 1469: state = _ure_make_expr(_URE_OR, s2, ns2, b); #####: 1470: break; -: 1471: case _URE_OR: #####: 1472: ns1 = b->expr[s1].lhs; #####: 1473: ns2 = b->expr[s1].rhs; #####: 1474: ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); #####: 1475: ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); #####: 1476: state = _ure_make_expr(_URE_OR, ns1, ns2, b); #####: 1477: break; -: 1478: case _URE_AND: #####: 1479: ns1 = b->expr[s1].lhs; #####: 1480: ns2 = b->expr[s1].rhs; #####: 1481: ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); #####: 1482: state = _ure_make_expr(_URE_AND, ns1, ns2, b); -: 1483: break; -: 1484: } -: 1485: } -: 1486: } -: 1487: } -: 1488: -: 1489: /* -: 1490: * Clear the state stack. -: 1491: */ #####: 1492: while (_ure_pop(b) != _URE_NOOP) ; -: 1493: -: 1494: /* -: 1495: * Reset the state pointer because the reduction may have moved it -: 1496: * during a reallocation. -: 1497: */ #####: 1498: sp = b->states.states + i; -: 1499: -: 1500: /* -: 1501: * Generate the DFA states for the symbols collected during the -: 1502: * current reduction. -: 1503: */ #####: 1504: if (sp->trans_used + syms > sp->trans_size) { #####: 1505: if (sp->trans_size == 0) #####: 1506: sp->trans = (_ure_elt_t *) #####: 1507: malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); -: 1508: else #####: 1509: sp->trans = (_ure_elt_t *) -: 1510: realloc((char *) sp->trans, #####: 1511: sizeof(_ure_elt_t) * (sp->trans_used + syms)); #####: 1512: sp->trans_size = sp->trans_used + syms; -: 1513: } -: 1514: -: 1515: /* -: 1516: * Go through the symbol table and generate the DFA state transitions -: 1517: * for each symbol that has collected NFA states. -: 1518: */ #####: 1519: for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { #####: 1520: sp = b->states.states + i; -: 1521: #####: 1522: if (smp->states.slist_used > 0) { #####: 1523: sp->trans[syms].lhs = smp->id; #####: 1524: rhs = _ure_add_state(smp->states.slist_used, -: 1525: smp->states.slist, b); -: 1526: /* -: 1527: * Reset the state pointer in case the reallocation moves it -: 1528: * in memory. -: 1529: */ #####: 1530: sp = b->states.states + i; #####: 1531: sp->trans[syms].rhs = rhs; -: 1532: #####: 1533: smp->states.slist_used = 0; #####: 1534: syms++; -: 1535: } -: 1536: } -: 1537: -: 1538: /* -: 1539: * Set the number of transitions actually used. -: 1540: */ #####: 1541: sp->trans_used = syms; -: 1542: } #####: 1543: b->reducing = 0; #####: 1544:} -: 1545: -: 1546:static void #####: 1547:_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) -: 1548:{ -: 1549: ucs2_t tmp; -: 1550: #####: 1551: l = b->states.states[l].id; #####: 1552: r = b->states.states[r].id; -: 1553: #####: 1554: if (l == r) #####: 1555: return; -: 1556: #####: 1557: if (l > r) { #####: 1558: tmp = l; #####: 1559: l = r; #####: 1560: r = tmp; -: 1561: } -: 1562: -: 1563: /* -: 1564: * Check to see if the equivalence pair already exists. -: 1565: */ #####: 1566: for (tmp = 0; tmp < b->equiv_used && #####: 1567: (b->equiv[tmp].l != l || b->equiv[tmp].r != r); #####: 1568: tmp++) ; -: 1569: #####: 1570: if (tmp < b->equiv_used) #####: 1571: return; -: 1572: #####: 1573: if (b->equiv_used == b->equiv_size) { #####: 1574: if (b->equiv_size == 0) #####: 1575: b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); -: 1576: else #####: 1577: b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, -: 1578: sizeof(_ure_equiv_t) * -: 1579: (b->equiv_size + 8)); #####: 1580: b->equiv_size += 8; -: 1581: } #####: 1582: b->equiv[b->equiv_used].l = l; #####: 1583: b->equiv[b->equiv_used].r = r; #####: 1584: b->equiv_used++; -: 1585:} -: 1586: -: 1587:/* -: 1588: * Merge the DFA states that are equivalent. -: 1589: */ -: 1590:static void #####: 1591:_ure_merge_equiv(_ure_buffer_t *b) -: 1592:{ -: 1593: ucs2_t i, j, k, eq, done; -: 1594: _ure_state_t *sp1, *sp2, *ls, *rs; -: 1595: #####: 1596: for (i = 0; i < b->states.states_used; i++) { #####: 1597: sp1 = b->states.states + i; #####: 1598: if (sp1->id != i) #####: 1599: continue; #####: 1600: for (j = 0; j < i; j++) { #####: 1601: sp2 = b->states.states + j; #####: 1602: if (sp2->id != j) #####: 1603: continue; #####: 1604: b->equiv_used = 0; #####: 1605: _ure_add_equiv(i, j, b); #####: 1606: for (eq = 0, done = 0; eq < b->equiv_used; eq++) { #####: 1607: ls = b->states.states + b->equiv[eq].l; #####: 1608: rs = b->states.states + b->equiv[eq].r; #####: 1609: if (ls->accepting != rs->accepting || #####: 1610: ls->trans_used != rs->trans_used) { #####: 1611: done = 1; #####: 1612: break; -: 1613: } #####: 1614: for (k = 0; k < ls->trans_used && #####: 1615: ls->trans[k].lhs == rs->trans[k].lhs; k++) ; #####: 1616: if (k < ls->trans_used) { #####: 1617: done = 1; #####: 1618: break; -: 1619: } -: 1620: #####: 1621: for (k = 0; k < ls->trans_used; k++) #####: 1622: _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); -: 1623: } #####: 1624: if (done == 0) #####: 1625: break; -: 1626: } #####: 1627: for (eq = 0; j < i && eq < b->equiv_used; eq++) #####: 1628: b->states.states[b->equiv[eq].r].id = #####: 1629: b->states.states[b->equiv[eq].l].id; -: 1630: } -: 1631: -: 1632: /* -: 1633: * Renumber the states appropriately. -: 1634: */ #####: 1635: for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; #####: 1636: sp1++, i++) #####: 1637: sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; #####: 1638:} -: 1639: -: 1640:/************************************************************************* -: 1641: * -: 1642: * API. -: 1643: * -: 1644: *************************************************************************/ -: 1645: -: 1646:ure_buffer_t #####: 1647:ure_buffer_create(void) -: 1648:{ -: 1649: ure_buffer_t b; -: 1650: #####: 1651: b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); -: 1652: #####: 1653: return b; -: 1654:} -: 1655: -: 1656:void #####: 1657:ure_buffer_free(ure_buffer_t buf) -: 1658:{ -: 1659: unsigned long i; -: 1660: #####: 1661: if (buf == 0) #####: 1662: return; -: 1663: #####: 1664: if (buf->stack.slist_size > 0) #####: 1665: free((char *) buf->stack.slist); -: 1666: #####: 1667: if (buf->expr_size > 0) #####: 1668: free((char *) buf->expr); -: 1669: #####: 1670: for (i = 0; i < buf->symtab_size; i++) { #####: 1671: if (buf->symtab[i].states.slist_size > 0) #####: 1672: free((char *) buf->symtab[i].states.slist); -: 1673: } -: 1674: #####: 1675: if (buf->symtab_size > 0) #####: 1676: free((char *) buf->symtab); -: 1677: #####: 1678: for (i = 0; i < buf->states.states_size; i++) { #####: 1679: if (buf->states.states[i].trans_size > 0) #####: 1680: free((char *) buf->states.states[i].trans); #####: 1681: if (buf->states.states[i].st.slist_size > 0) #####: 1682: free((char *) buf->states.states[i].st.slist); -: 1683: } -: 1684: #####: 1685: if (buf->states.states_size > 0) #####: 1686: free((char *) buf->states.states); -: 1687: #####: 1688: if (buf->equiv_size > 0) #####: 1689: free((char *) buf->equiv); -: 1690: #####: 1691: free((char *) buf); -: 1692:} -: 1693: -: 1694:ure_dfa_t #####: 1695:ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) -: 1696:{ -: 1697: ucs2_t i, j, state; -: 1698: _ure_state_t *sp; -: 1699: _ure_dstate_t *dsp; -: 1700: _ure_trans_t *tp; -: 1701: ure_dfa_t dfa; -: 1702: #####: 1703: if (re == 0 || *re == 0 || relen == 0 || buf == 0) #####: 1704: return 0; -: 1705: -: 1706: /* -: 1707: * Reset the various fields of the compilation buffer. Default the flags -: 1708: * to indicate the presense of the "^$" pattern. If any other pattern -: 1709: * occurs, then this flag will be removed. This is done to catch this -: 1710: * special pattern and handle it specially when matching. -: 1711: */ #####: 1712: buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0); #####: 1713: buf->reducing = 0; #####: 1714: buf->stack.slist_used = 0; #####: 1715: buf->expr_used = 0; -: 1716: #####: 1717: for (i = 0; i < buf->symtab_used; i++) #####: 1718: buf->symtab[i].states.slist_used = 0; #####: 1719: buf->symtab_used = 0; -: 1720: #####: 1721: for (i = 0; i < buf->states.states_used; i++) { #####: 1722: buf->states.states[i].st.slist_used = 0; #####: 1723: buf->states.states[i].trans_used = 0; -: 1724: } #####: 1725: buf->states.states_used = 0; -: 1726: -: 1727: /* -: 1728: * Construct the NFA. If this stage returns a 0, then an error occured or -: 1729: * an empty expression was passed. -: 1730: */ #####: 1731: if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) #####: 1732: return 0; -: 1733: -: 1734: /* -: 1735: * Do the expression reduction to get the initial DFA. -: 1736: */ #####: 1737: _ure_reduce(state, buf); -: 1738: -: 1739: /* -: 1740: * Merge all the equivalent DFA states. -: 1741: */ #####: 1742: _ure_merge_equiv(buf); -: 1743: -: 1744: /* -: 1745: * Construct the minimal DFA. -: 1746: */ #####: 1747: dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); #####: 1748: (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); -: 1749: #####: 1750: dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); -: 1751: -: 1752: /* -: 1753: * Free up the NFA state groups and transfer the symbols from the buffer -: 1754: * to the DFA. -: 1755: */ #####: 1756: for (i = 0; i < buf->symtab_size; i++) { #####: 1757: if (buf->symtab[i].states.slist_size > 0) #####: 1758: free((char *) buf->symtab[i].states.slist); -: 1759: } #####: 1760: dfa->syms = buf->symtab; #####: 1761: dfa->nsyms = buf->symtab_used; -: 1762: #####: 1763: buf->symtab_used = buf->symtab_size = 0; -: 1764: -: 1765: /* -: 1766: * Collect the total number of states and transitions needed for the DFA. -: 1767: */ #####: 1768: for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; #####: 1769: i++, sp++) { #####: 1770: if (sp->id == state) { #####: 1771: dfa->nstates++; #####: 1772: dfa->ntrans += sp->trans_used; #####: 1773: state++; -: 1774: } -: 1775: } -: 1776: -: 1777: /* -: 1778: * Allocate enough space for the states and transitions. -: 1779: */ #####: 1780: dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * -: 1781: dfa->nstates); #####: 1782: dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); -: 1783: -: 1784: /* -: 1785: * Actually transfer the DFA states from the buffer. -: 1786: */ #####: 1787: dsp = dfa->states; #####: 1788: tp = dfa->trans; #####: 1789: for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; #####: 1790: i++, sp++) { #####: 1791: if (sp->id == state) { #####: 1792: dsp->trans = tp; #####: 1793: dsp->ntrans = sp->trans_used; #####: 1794: dsp->accepting = sp->accepting; -: 1795: -: 1796: /* -: 1797: * Add the transitions for the state. -: 1798: */ #####: 1799: for (j = 0; j < dsp->ntrans; j++, tp++) { #####: 1800: tp->symbol = sp->trans[j].lhs; #####: 1801: tp->next_state = buf->states.states[sp->trans[j].rhs].id; -: 1802: } -: 1803: #####: 1804: dsp++; #####: 1805: state++; -: 1806: } -: 1807: } -: 1808: #####: 1809: return dfa; -: 1810:} -: 1811: -: 1812:void #####: 1813:ure_dfa_free(ure_dfa_t dfa) -: 1814:{ -: 1815: ucs2_t i; -: 1816: #####: 1817: if (dfa == 0) #####: 1818: return; -: 1819: #####: 1820: for (i = 0; i < dfa->nsyms; i++) { #####: 1821: if ((dfa->syms[i].type == _URE_CCLASS || #####: 1822: dfa->syms[i].type == _URE_NCCLASS) && #####: 1823: dfa->syms[i].sym.ccl.ranges_size > 0) #####: 1824: free((char *) dfa->syms[i].sym.ccl.ranges); -: 1825: } #####: 1826: if (dfa->nsyms > 0) #####: 1827: free((char *) dfa->syms); -: 1828: #####: 1829: if (dfa->nstates > 0) #####: 1830: free((char *) dfa->states); #####: 1831: if (dfa->ntrans > 0) #####: 1832: free((char *) dfa->trans); #####: 1833: free((char *) dfa); -: 1834:} -: 1835: -: 1836:void #####: 1837:ure_write_dfa(ure_dfa_t dfa, FILE *out) -: 1838:{ -: 1839: ucs2_t i, j, k, h, l; -: 1840: _ure_dstate_t *sp; -: 1841: _ure_symtab_t *sym; -: 1842: _ure_range_t *rp; -: 1843: #####: 1844: if (dfa == 0 || out == 0) #####: 1845: return; -: 1846: -: 1847: /* -: 1848: * Write all the different character classes. -: 1849: */ #####: 1850: for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { #####: 1851: if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { #####: 1852: fprintf(out, "C%hd = ", sym->id); #####: 1853: if (sym->sym.ccl.ranges_used > 0) { #####: 1854: putc('[', out); #####: 1855: if (sym->type == _URE_NCCLASS) #####: 1856: putc('^', out); -: 1857: } #####: 1858: if (sym->props != 0) { #####: 1859: if (sym->type == _URE_NCCLASS) #####: 1860: fprintf(out, "\\P"); -: 1861: else #####: 1862: fprintf(out, "\\p"); #####: 1863: for (k = h = 0; k < 32; k++) { #####: 1864: if (sym->props & (1 << k)) { #####: 1865: if (h != 0) #####: 1866: putc(',', out); #####: 1867: fprintf(out, "%hd", k + 1); #####: 1868: h = 1; -: 1869: } -: 1870: } -: 1871: } -: 1872: /* -: 1873: * Dump the ranges. -: 1874: */ #####: 1875: for (k = 0, rp = sym->sym.ccl.ranges; #####: 1876: k < sym->sym.ccl.ranges_used; k++, rp++) { -: 1877: /* -: 1878: * Check for UTF16 characters. -: 1879: */ #####: 1880: if (0x10000 <= rp->min_code && #####: 1881: rp->min_code <= 0x10ffff) { #####: 1882: h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); #####: 1883: l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); #####: 1884: fprintf(out, "\\x%04hX\\x%04hX", h, l); -: 1885: } else #####: 1886: fprintf(out, "\\x%04lX", -: 1887: (unsigned long)(rp->min_code & 0xffff)); #####: 1888: if (rp->max_code != rp->min_code) { #####: 1889: putc('-', out); #####: 1890: if (rp->max_code >= 0x10000 && #####: 1891: rp->max_code <= 0x10ffff) { #####: 1892: h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); #####: 1893: l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); #####: 1894: fprintf(out, "\\x%04hX\\x%04hX", h, l); -: 1895: } else #####: 1896: fprintf(out, "\\x%04lX", -: 1897: (unsigned long)(rp->max_code & 0xffff)); -: 1898: } -: 1899: } #####: 1900: if (sym->sym.ccl.ranges_used > 0) #####: 1901: putc(']', out); #####: 1902: putc('\n', out); -: 1903: } -: 1904: } -: 1905: #####: 1906: for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { #####: 1907: fprintf(out, "S%hd = ", i); #####: 1908: if (sp->accepting) { #####: 1909: fprintf(out, "1 "); #####: 1910: if (sp->ntrans) #####: 1911: fprintf(out, "| "); -: 1912: } #####: 1913: for (j = 0; j < sp->ntrans; j++) { #####: 1914: if (j > 0) #####: 1915: fprintf(out, "| "); -: 1916: #####: 1917: sym = dfa->syms + sp->trans[j].symbol; #####: 1918: switch (sym->type) { -: 1919: case _URE_CHAR: #####: 1920: if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { -: 1921: /* -: 1922: * Take care of UTF16 characters. -: 1923: */ #####: 1924: h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); #####: 1925: l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); #####: 1926: fprintf(out, "\\x%04hX\\x%04hX ", h, l); -: 1927: } else #####: 1928: fprintf(out, "\\x%04lX ", -: 1929: (unsigned long)(sym->sym.chr & 0xffff)); #####: 1930: break; -: 1931: case _URE_ANY_CHAR: #####: 1932: fprintf(out, " "); #####: 1933: break; -: 1934: case _URE_BOL_ANCHOR: #####: 1935: fprintf(out, " "); #####: 1936: break; -: 1937: case _URE_EOL_ANCHOR: #####: 1938: fprintf(out, " "); #####: 1939: break; -: 1940: case _URE_CCLASS: -: 1941: case _URE_NCCLASS: #####: 1942: fprintf(out, "[C%hd] ", sym->id); -: 1943: break; -: 1944: } #####: 1945: fprintf(out, "S%hd", sp->trans[j].next_state); #####: 1946: if (j + 1 < sp->ntrans) #####: 1947: putc(' ', out); -: 1948: } #####: 1949: putc('\n', out); -: 1950: } -: 1951:} -: 1952: -: 1953:#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ -: 1954: (cc) == 0x2029) -: 1955: -: 1956:int #####: 1957:ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, -: 1958: unsigned long *match_start, unsigned long *match_end) -: 1959:{ -: 1960: int i, j, matched, found, skip; -: 1961: unsigned long ms, me; -: 1962: ucs4_t c; -: 1963: ucs2_t *sp, *ep, *lp; -: 1964: _ure_dstate_t *stp; -: 1965: _ure_symtab_t *sym; -: 1966: _ure_range_t *rp; -: 1967: #####: 1968: if (dfa == 0 || text == 0) #####: 1969: return 0; -: 1970: -: 1971: /* -: 1972: * Handle the special case of an empty string matching the "^$" pattern. -: 1973: */ #####: 1974: if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { #####: 1975: *match_start = *match_end = 0; #####: 1976: return 1; -: 1977: } -: 1978: #####: 1979: sp = text; #####: 1980: ep = sp + textlen; -: 1981: #####: 1982: ms = me = ~0; -: 1983: #####: 1984: stp = dfa->states; -: 1985: #####: 1986: for (found = skip = 0; found == 0 && sp < ep; ) { #####: 1987: lp = sp; #####: 1988: c = *sp++; -: 1989: -: 1990: /* -: 1991: * Check to see if this is a high surrogate that should be -: 1992: * combined with a following low surrogate. -: 1993: */ #####: 1994: if (sp < ep && 0xd800 <= c && c <= 0xdbff && #####: 1995: 0xdc00 <= *sp && *sp <= 0xdfff) #####: 1996: c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); -: 1997: -: 1998: /* -: 1999: * Determine if the character is non-spacing and should be skipped. -: 2000: */ #####: 2001: if (_ure_matches_properties(_URE_NONSPACING, c) && #####: 2002: (flags & URE_IGNORE_NONSPACING)) { #####: 2003: sp++; #####: 2004: continue; -: 2005: } -: 2006: #####: 2007: if (dfa->flags & _URE_DFA_CASEFOLD) #####: 2008: c = _ure_tolower(c); -: 2009: -: 2010: /* -: 2011: * See if one of the transitions matches. -: 2012: */ #####: 2013: for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { #####: 2014: sym = dfa->syms + stp->trans[i].symbol; #####: 2015: switch (sym->type) { -: 2016: case _URE_ANY_CHAR: #####: 2017: if ((flags & URE_DOT_MATCHES_SEPARATORS) || -: 2018: !_ure_issep(c)) #####: 2019: matched = 1; #####: 2020: break; -: 2021: case _URE_CHAR: #####: 2022: if (c == sym->sym.chr) #####: 2023: matched = 1; #####: 2024: break; -: 2025: case _URE_BOL_ANCHOR: #####: 2026: if (lp == text) { #####: 2027: sp = lp; #####: 2028: matched = 1; #####: 2029: } else if (_ure_issep(c)) { #####: 2030: if (c == '\r' && sp < ep && *sp == '\n') #####: 2031: sp++; #####: 2032: lp = sp; #####: 2033: matched = 1; -: 2034: } #####: 2035: break; -: 2036: case _URE_EOL_ANCHOR: #####: 2037: if (_ure_issep(c)) { -: 2038: /* -: 2039: * Put the pointer back before the separator so the match -: 2040: * end position will be correct. This case will also -: 2041: * cause the `sp' pointer to be advanced over the current -: 2042: * separator once the match end point has been recorded. -: 2043: */ #####: 2044: sp = lp; #####: 2045: matched = 1; -: 2046: } #####: 2047: break; -: 2048: case _URE_CCLASS: -: 2049: case _URE_NCCLASS: #####: 2050: if (sym->props != 0) #####: 2051: matched = _ure_matches_properties(sym->props, c); #####: 2052: for (j = 0, rp = sym->sym.ccl.ranges; #####: 2053: j < sym->sym.ccl.ranges_used; j++, rp++) { #####: 2054: if (rp->min_code <= c && c <= rp->max_code) #####: 2055: matched = 1; -: 2056: } #####: 2057: if (sym->type == _URE_NCCLASS) #####: 2058: matched = !matched; -: 2059: break; -: 2060: } -: 2061: #####: 2062: if (matched) { #####: 2063: if (ms == ~0UL) #####: 2064: ms = lp - text; -: 2065: else #####: 2066: me = sp - text; #####: 2067: stp = dfa->states + stp->trans[i].next_state; -: 2068: -: 2069: /* -: 2070: * If the match was an EOL anchor, adjust the pointer past the -: 2071: * separator that caused the match. The correct match -: 2072: * position has been recorded already. -: 2073: */ #####: 2074: if (sym->type == _URE_EOL_ANCHOR) { -: 2075: /* -: 2076: * Skip the character that caused the match. -: 2077: */ #####: 2078: sp++; -: 2079: -: 2080: /* -: 2081: * Handle the infamous CRLF situation. -: 2082: */ #####: 2083: if (sp < ep && c == '\r' && *sp == '\n') #####: 2084: sp++; -: 2085: } -: 2086: } -: 2087: } -: 2088: #####: 2089: if (matched == 0) { #####: 2090: if (stp->accepting == 0) { -: 2091: /* -: 2092: * If the last state was not accepting, then reset -: 2093: * and start over. -: 2094: */ #####: 2095: stp = dfa->states; #####: 2096: ms = me = ~0; -: 2097: } else -: 2098: /* -: 2099: * The last state was accepting, so terminate the matching -: 2100: * loop to avoid more work. -: 2101: */ #####: 2102: found = 1; #####: 2103: } else if (sp == ep) { #####: 2104: if (!stp->accepting) { -: 2105: /* -: 2106: * This ugly hack is to make sure the end-of-line anchors -: 2107: * match when the source text hits the end. This is only done -: 2108: * if the last subexpression matches. -: 2109: */ #####: 2110: for (i = 0; found == 0 && i < stp->ntrans; i++) { #####: 2111: sym = dfa->syms + stp->trans[i].symbol; #####: 2112: if (sym->type ==_URE_EOL_ANCHOR) { #####: 2113: stp = dfa->states + stp->trans[i].next_state; #####: 2114: if (stp->accepting) { #####: 2115: me = sp - text; #####: 2116: found = 1; -: 2117: } else #####: 2118: break; -: 2119: } -: 2120: } -: 2121: } else { -: 2122: /* -: 2123: * Make sure any conditions that match all the way to the end -: 2124: * of the string match. -: 2125: */ #####: 2126: found = 1; #####: 2127: me = sp - text; -: 2128: } -: 2129: } -: 2130: } -: 2131: #####: 2132: if (found == 0) #####: 2133: ms = me = ~0; -: 2134: #####: 2135: *match_start = ms; #####: 2136: *match_end = me; -: 2137: #####: 2138: return (ms != ~0UL) ? 1 : 0; -: 2139:}