1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 int node, int root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static int fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 int accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 int *char_class_alloc,
97 const char *class_name,
98 reg_syntax_t syntax);
99#else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const char *class_name,
105 reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const char *class_name,
110 const char *extra,
111 int non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122\f
123/* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128const char __re_error_msgid[] attribute_hidden =
129 {
130#define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182const size_t __re_error_msgid_idx[] attribute_hidden =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202\f
203/* Entry points for GNU code. */
204
205
206#ifdef ZOS_USS
207
208/* For ZOS USS we must define btowc */
209
210wchar_t
211btowc (int c)
212{
213 wchar_t wtmp[2];
214 char tmp[2];
215
216 tmp[0] = c;
217 tmp[1] = 0;
218
219 mbtowc (wtmp, tmp, 1);
220 return wtmp[0];
221}
222#endif
223
224/* re_compile_pattern is the GNU regular expression compiler: it
225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
226 Returns 0 if the pattern was valid, otherwise an error string.
227
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
230
231const char *
232re_compile_pattern (const char *pattern,
233 size_t length,
234 struct re_pattern_buffer *bufp)
235{
236 reg_errcode_t ret;
237
238 /* And GNU code determines whether or not to get register information
239 by passing null for the REGS argument to re_match, etc., not by
240 setting no_sub, unless RE_NO_SUB is set. */
241 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
242
243 /* Match anchors at newline. */
244 bufp->newline_anchor = 1;
245
246 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
247
248 if (!ret)
249 return NULL;
250 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
251}
252#ifdef _LIBC
253weak_alias (__re_compile_pattern, re_compile_pattern)
254#endif
255
256/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
257 also be assigned to arbitrarily: each pattern buffer stores its own
258 syntax, so it can be changed between regex compilations. */
259/* This has no initializer because initialized variables in Emacs
260 become read-only after dumping. */
261reg_syntax_t re_syntax_options;
262
263
264/* Specify the precise syntax of regexps for compilation. This provides
265 for compatibility for various utilities which historically have
266 different, incompatible syntaxes.
267
268 The argument SYNTAX is a bit mask comprised of the various bits
269 defined in regex.h. We return the old syntax. */
270
271reg_syntax_t
272re_set_syntax (reg_syntax_t syntax)
273{
274 reg_syntax_t ret = re_syntax_options;
275
276 re_syntax_options = syntax;
277 return ret;
278}
279#ifdef _LIBC
280weak_alias (__re_set_syntax, re_set_syntax)
281#endif
282
283int
284re_compile_fastmap (struct re_pattern_buffer *bufp)
285{
286 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
287 char *fastmap = bufp->fastmap;
288
289 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
290 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
291 if (dfa->init_state != dfa->init_state_word)
292 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
293 if (dfa->init_state != dfa->init_state_nl)
294 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
295 if (dfa->init_state != dfa->init_state_begbuf)
296 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
297 bufp->fastmap_accurate = 1;
298 return 0;
299}
300#ifdef _LIBC
301weak_alias (__re_compile_fastmap, re_compile_fastmap)
302#endif
303
304static inline void
305__attribute ((always_inline))
306re_set_fastmap (char *fastmap, int icase, int ch)
307{
308 fastmap[ch] = 1;
309 if (icase)
310 fastmap[tolower (ch)] = 1;
311}
312
313/* Helper function for re_compile_fastmap.
314 Compile fastmap for the initial_state INIT_STATE. */
315
316static void
317re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
318 char *fastmap)
319{
320 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
321 int node_cnt;
322 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
323 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
324 {
325 int node = init_state->nodes.elems[node_cnt];
326 re_token_type_t type = dfa->nodes[node].type;
327
328 if (type == CHARACTER)
329 {
330 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
331#ifdef RE_ENABLE_I18N
332 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
333 {
334 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
335 wchar_t wc;
336 mbstate_t state;
337
338 p = buf;
339 *p++ = dfa->nodes[node].opr.c;
340 while (++node < dfa->nodes_len
341 && dfa->nodes[node].type == CHARACTER
342 && dfa->nodes[node].mb_partial)
343 *p++ = dfa->nodes[node].opr.c;
344 memset (&state, '\0', sizeof (state));
345 if (__mbrtowc (&wc, (const char *) buf, p - buf,
346 &state) == p - buf
347 && (__wcrtomb ((char *) buf, towlower (wc), &state)
348 != (size_t) -1))
349 re_set_fastmap (fastmap, 0, buf[0]);
350 re_free (buf);
351 }
352#endif
353 }
354 else if (type == SIMPLE_BRACKET)
355 {
356 int i, ch;
357 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
358 {
359 int j;
360 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
361 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
362 if (w & ((bitset_word_t) 1 << j))
363 re_set_fastmap (fastmap, icase, ch);
364 }
365 }
366#ifdef RE_ENABLE_I18N
367 else if (type == COMPLEX_BRACKET)
368 {
369 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
370 int i;
371
372# ifdef _LIBC
373 /* See if we have to try all bytes which start multiple collation
374 elements.
375 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
376 collation element, and don't catch 'b' since 'b' is
377 the only collation element which starts from 'b' (and
378 it is caught by SIMPLE_BRACKET). */
379 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
380 && (cset->ncoll_syms || cset->nranges))
381 {
382 const int32_t *table = (const int32_t *)
383 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
384 for (i = 0; i < SBC_MAX; ++i)
385 if (table[i] < 0)
386 re_set_fastmap (fastmap, icase, i);
387 }
388# endif /* _LIBC */
389
390 /* See if we have to start the match at all multibyte characters,
391 i.e. where we would not find an invalid sequence. This only
392 applies to multibyte character sets; for single byte character
393 sets, the SIMPLE_BRACKET again suffices. */
394 if (dfa->mb_cur_max > 1
395 && (cset->nchar_classes || cset->non_match || cset->nranges
396# ifdef _LIBC
397 || cset->nequiv_classes
398# endif /* _LIBC */
399 ))
400 {
401 unsigned char c = 0;
402 do
403 {
404 mbstate_t mbs;
405 memset (&mbs, 0, sizeof (mbs));
406 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
407 re_set_fastmap (fastmap, false, (int) c);
408 }
409 while (++c != 0);
410 }
411
412 else
413 {
414 /* ... Else catch all bytes which can start the mbchars. */
415 for (i = 0; i < cset->nmbchars; ++i)
416 {
417 char buf[256];
418 mbstate_t state;
419 memset (&state, '\0', sizeof (state));
420 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
421 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
422 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
423 {
424 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
425 != (size_t) -1)
426 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
427 }
428 }
429 }
430 }
431#endif /* RE_ENABLE_I18N */
432 else if (type == OP_PERIOD
433#ifdef RE_ENABLE_I18N
434 || type == OP_UTF8_PERIOD
435#endif /* RE_ENABLE_I18N */
436 || type == END_OF_RE)
437 {
438 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
439 if (type == END_OF_RE)
440 bufp->can_be_null = 1;
441 return;
442 }
443 }
444}
445\f
446/* Entry point for POSIX code. */
447/* regcomp takes a regular expression as a string and compiles it.
448
449 PREG is a regex_t *. We do not expect any fields to be initialized,
450 since POSIX says we shouldn't. Thus, we set
451
452 `buffer' to the compiled pattern;
453 `used' to the length of the compiled pattern;
454 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
455 REG_EXTENDED bit in CFLAGS is set; otherwise, to
456 RE_SYNTAX_POSIX_BASIC;
457 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
458 `fastmap' to an allocated space for the fastmap;
459 `fastmap_accurate' to zero;
460 `re_nsub' to the number of subexpressions in PATTERN.
461
462 PATTERN is the address of the pattern string.
463
464 CFLAGS is a series of bits which affect compilation.
465
466 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
467 use POSIX basic syntax.
468
469 If REG_NEWLINE is set, then . and [^...] don't match newline.
470 Also, regexec will try a match beginning after every newline.
471
472 If REG_ICASE is set, then we considers upper- and lowercase
473 versions of letters to be equivalent when matching.
474
475 If REG_NOSUB is set, then when PREG is passed to regexec, that
476 routine will report only success or failure, and nothing about the
477 registers.
478
479 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
480 the return codes and their meanings.) */
481
482int
483regcomp (regex_t *__restrict preg,
484 const char *__restrict pattern,
485 int cflags)
486{
487 reg_errcode_t ret;
488 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
489 : RE_SYNTAX_POSIX_BASIC);
490
491 preg->buffer = NULL;
492 preg->allocated = 0;
493 preg->used = 0;
494
495 /* Try to allocate space for the fastmap. */
496 preg->fastmap = re_malloc (char, SBC_MAX);
497 if (BE (preg->fastmap == NULL, 0))
498 return REG_ESPACE;
499
500 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
501
502 /* If REG_NEWLINE is set, newlines are treated differently. */
503 if (cflags & REG_NEWLINE)
504 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
505 syntax &= ~RE_DOT_NEWLINE;
506 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
507 /* It also changes the matching behavior. */
508 preg->newline_anchor = 1;
509 }
510 else
511 preg->newline_anchor = 0;
512 preg->no_sub = !!(cflags & REG_NOSUB);
513 preg->translate = NULL;
514
515 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
516
517 /* POSIX doesn't distinguish between an unmatched open-group and an
518 unmatched close-group: both are REG_EPAREN. */
519 if (ret == REG_ERPAREN)
520 ret = REG_EPAREN;
521
522 /* We have already checked preg->fastmap != NULL. */
523 if (BE (ret == REG_NOERROR, 1))
524 /* Compute the fastmap now, since regexec cannot modify the pattern
525 buffer. This function never fails in this implementation. */
526 (void) re_compile_fastmap (preg);
527 else
528 {
529 /* Some error occurred while compiling the expression. */
530 re_free (preg->fastmap);
531 preg->fastmap = NULL;
532 }
533
534 return (int) ret;
535}
536#ifdef _LIBC
537weak_alias (__regcomp, regcomp)
538#endif
539
540/* Returns a message corresponding to an error code, ERRCODE, returned
541 from either regcomp or regexec. We don't use PREG here. */
542
543size_t
544regerror(int errcode, const regex_t *__restrict preg,
545 char *__restrict errbuf, size_t errbuf_size)
546{
547 const char *msg;
548 size_t msg_size;
549
550 if (BE (errcode < 0
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
557 abort ();
558
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560
561 msg_size = strlen (msg) + 1; /* Includes the null. */
562
563 if (BE (errbuf_size != 0, 1))
564 {
565 if (BE (msg_size > errbuf_size, 0))
566 {
567 memcpy (errbuf, msg, errbuf_size - 1);
568 errbuf[errbuf_size - 1] = 0;
569 }
570 else
571 memcpy (errbuf, msg, msg_size);
572 }
573
574 return msg_size;
575}
576#ifdef _LIBC
577weak_alias (__regerror, regerror)
578#endif
579
580
581#ifdef RE_ENABLE_I18N
582/* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586#if __GNUC__ >= 3
587static const bitset_t utf8_sb_map = {
588 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
590};
591#else /* ! (__GNUC__ >= 3) */
592static bitset_t utf8_sb_map;
593#endif /* __GNUC__ >= 3 */
594#endif /* RE_ENABLE_I18N */
595
596
597static void
598free_dfa_content (re_dfa_t *dfa)
599{
600 int i, j;
601
602 if (dfa->nodes)
603 for (i = 0; i < dfa->nodes_len; ++i)
604 free_token (dfa->nodes + i);
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
607 {
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
614 }
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
619
620 if (dfa->state_table)
621 for (i = 0; i <= dfa->state_hash_mask; ++i)
622 {
623 struct re_state_table_entry *entry = dfa->state_table + i;
624 for (j = 0; j < entry->num; ++j)
625 {
626 re_dfastate_t *state = entry->array[j];
627 free_state (state);
628 }
629 re_free (entry->array);
630 }
631 re_free (dfa->state_table);
632#ifdef RE_ENABLE_I18N
633 if (dfa->sb_char != utf8_sb_map)
634 re_free (dfa->sb_char);
635#endif
636 re_free (dfa->subexp_map);
637#ifdef DEBUG
638 re_free (dfa->re_str);
639#endif
640
641 re_free (dfa);
642}
643
644
645/* Free dynamically allocated space used by PREG. */
646
647void
648regfree (regex_t *preg)
649{
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
653 preg->buffer = NULL;
654 preg->allocated = 0;
655
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
658
659 re_free (preg->translate);
660 preg->translate = NULL;
661}
662#ifdef _LIBC
663weak_alias (__regfree, regfree)
664#endif
665\f
666/* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669#if defined _REGEX_RE_COMP || defined _LIBC
670
671/* BSD has one and only one pattern buffer. */
672static struct re_pattern_buffer re_comp_buf;
673
674char *
675# ifdef _LIBC
676/* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679weak_function
680# endif
681re_comp (s)
682 const char *s;
683{
684 reg_errcode_t ret;
685 char *fastmap;
686
687 if (!s)
688 {
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
692 }
693
694 if (re_comp_buf.buffer)
695 {
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
701 }
702
703 if (re_comp_buf.fastmap == NULL)
704 {
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
709 }
710
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
713
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
716
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
718
719 if (!ret)
720 return NULL;
721
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
724}
725
726#ifdef _LIBC
727libc_freeres_fn (free_mem)
728{
729 __regfree (&re_comp_buf);
730}
731#endif
732
733#endif /* _REGEX_RE_COMP */
734\f
735/* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
738
739static reg_errcode_t
740re_compile_internal (regex_t *preg, const char * pattern, size_t length,
741 reg_syntax_t syntax)
742{
743 reg_errcode_t err = REG_NOERROR;
744 re_dfa_t *dfa;
745 re_string_t regexp;
746
747 /* Initialize the pattern buffer. */
748 preg->fastmap_accurate = 0;
749 preg->syntax = syntax;
750 preg->not_bol = preg->not_eol = 0;
751 preg->used = 0;
752 preg->re_nsub = 0;
753 preg->can_be_null = 0;
754 preg->regs_allocated = REGS_UNALLOCATED;
755
756 /* Initialize the dfa. */
757 dfa = (re_dfa_t *) preg->buffer;
758 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
759 {
760 /* If zero allocated, but buffer is non-null, try to realloc
761 enough space. This loses if buffer's address is bogus, but
762 that is the user's responsibility. If ->buffer is NULL this
763 is a simple allocation. */
764 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
765 if (dfa == NULL)
766 return REG_ESPACE;
767 preg->allocated = sizeof (re_dfa_t);
768 preg->buffer = (unsigned char *) dfa;
769 }
770 preg->used = sizeof (re_dfa_t);
771
772 err = init_dfa (dfa, length);
773 if (BE (err != REG_NOERROR, 0))
774 {
775 free_dfa_content (dfa);
776 preg->buffer = NULL;
777 preg->allocated = 0;
778 return err;
779 }
780#ifdef DEBUG
781 /* Note: length+1 will not overflow since it is checked in init_dfa. */
782 dfa->re_str = re_malloc (char, length + 1);
783 strncpy (dfa->re_str, pattern, length + 1);
784#endif
785
786 __libc_lock_init (dfa->lock);
787
788 err = re_string_construct (®exp, pattern, length, preg->translate,
789 syntax & RE_ICASE, dfa);
790 if (BE (err != REG_NOERROR, 0))
791 {
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (®exp);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (®exp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
806
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
811
812#ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816#endif
817
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
820
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (®exp);
824
825 if (BE (err != REG_NOERROR, 0))
826 {
827 free_dfa_content (dfa);
828 preg->buffer = NULL;
829 preg->allocated = 0;
830 }
831
832 return err;
833}
834
835/* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
837
838static reg_errcode_t
839init_dfa (re_dfa_t *dfa, size_t pat_len)
840{
841 unsigned int table_size;
842#ifndef _LIBC
843 char *codeset_name;
844#endif
845
846 memset (dfa, '\0', sizeof (re_dfa_t));
847
848 /* Force allocation of str_tree_storage the first time. */
849 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850
851 /* Avoid overflows. */
852 if (pat_len == SIZE_MAX)
853 return REG_ESPACE;
854
855 dfa->nodes_alloc = pat_len + 1;
856 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size = 1; ; table_size <<= 1)
860 if (table_size > pat_len)
861 break;
862
863 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
864 dfa->state_hash_mask = table_size - 1;
865
866 dfa->mb_cur_max = MB_CUR_MAX;
867#ifdef _LIBC
868 if (dfa->mb_cur_max == 6
869 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
870 dfa->is_utf8 = 1;
871 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
872 != 0);
873#else
874# ifdef HAVE_LANGINFO_CODESET
875 codeset_name = nl_langinfo (CODESET);
876# else
877 codeset_name = getenv ("LC_ALL");
878 if (codeset_name == NULL || codeset_name[0] == '\0')
879 codeset_name = getenv ("LC_CTYPE");
880 if (codeset_name == NULL || codeset_name[0] == '\0')
881 codeset_name = getenv ("LANG");
882 if (codeset_name == NULL)
883 codeset_name = "";
884 else if (strchr (codeset_name, '.') != NULL)
885 codeset_name = strchr (codeset_name, '.') + 1;
886# endif
887
888 /* strcasecmp isn't a standard interface. brute force check */
889#if 0
890 if (strcasecmp (codeset_name, "UTF-8") == 0
891 || strcasecmp (codeset_name, "UTF8") == 0)
892 dfa->is_utf8 = 1;
893#else
894 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
895 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
896 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
897 && (codeset_name[3] == '-'
898 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
899 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
900 dfa->is_utf8 = 1;
901#endif
902
903 /* We check exhaustively in the loop below if this charset is a
904 superset of ASCII. */
905 dfa->map_notascii = 0;
906#endif
907
908#ifdef RE_ENABLE_I18N
909 if (dfa->mb_cur_max > 1)
910 {
911 if (dfa->is_utf8)
912 {
913#if !defined(__GNUC__) || __GNUC__ < 3
914 static short utf8_sb_map_inited = 0;
915
916 if (! utf8_sb_map_inited)
917 {
918 int i;
919
920 utf8_sb_map_inited = 0;
921 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
922 utf8_sb_map[i] = BITSET_WORD_MAX;
923 }
924#endif
925 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
926 }
927 else
928 {
929 int i, j, ch;
930
931 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
932 if (BE (dfa->sb_char == NULL, 0))
933 return REG_ESPACE;
934
935 /* Set the bits corresponding to single byte chars. */
936 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
937 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
938 {
939 wint_t wch = __btowc (ch);
940 if (wch != WEOF)
941 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
942# ifndef _LIBC
943 if (isascii (ch) && wch != ch)
944 dfa->map_notascii = 1;
945# endif
946 }
947 }
948 }
949#endif
950
951 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
952 return REG_ESPACE;
953 return REG_NOERROR;
954}
955
956/* Initialize WORD_CHAR table, which indicate which character is
957 "word". In this case "word" means that it is the word construction
958 character used by some operators like "\<", "\>", etc. */
959
960static void
961internal_function
962init_word_char (re_dfa_t *dfa)
963{
964 int i, j, ch;
965 dfa->word_ops_used = 1;
966 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
967 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
968 if (isalnum (ch) || ch == '_')
969 dfa->word_char[i] |= (bitset_word_t) 1 << j;
970}
971
972/* Free the work area which are only used while compiling. */
973
974static void
975free_workarea_compile (regex_t *preg)
976{
977 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
978 bin_tree_storage_t *storage, *next;
979 for (storage = dfa->str_tree_storage; storage; storage = next)
980 {
981 next = storage->next;
982 re_free (storage);
983 }
984 dfa->str_tree_storage = NULL;
985 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
986 dfa->str_tree = NULL;
987 re_free (dfa->org_indices);
988 dfa->org_indices = NULL;
989}
990
991/* Create initial states for all contexts. */
992
993static reg_errcode_t
994create_initial_state (re_dfa_t *dfa)
995{
996 int first, i;
997 reg_errcode_t err;
998 re_node_set init_nodes;
999
1000 /* Initial states have the epsilon closure of the node which is
1001 the first node of the regular expression. */
1002 first = dfa->str_tree->first->node_idx;
1003 dfa->init_node = first;
1004 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1005 if (BE (err != REG_NOERROR, 0))
1006 return err;
1007
1008 /* The back-references which are in initial states can epsilon transit,
1009 since in this case all of the subexpressions can be null.
1010 Then we add epsilon closures of the nodes which are the next nodes of
1011 the back-references. */
1012 if (dfa->nbackref > 0)
1013 for (i = 0; i < init_nodes.nelem; ++i)
1014 {
1015 int node_idx = init_nodes.elems[i];
1016 re_token_type_t type = dfa->nodes[node_idx].type;
1017
1018 int clexp_idx;
1019 if (type != OP_BACK_REF)
1020 continue;
1021 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1022 {
1023 re_token_t *clexp_node;
1024 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1025 if (clexp_node->type == OP_CLOSE_SUBEXP
1026 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1027 break;
1028 }
1029 if (clexp_idx == init_nodes.nelem)
1030 continue;
1031
1032 if (type == OP_BACK_REF)
1033 {
1034 int dest_idx = dfa->edests[node_idx].elems[0];
1035 if (!re_node_set_contains (&init_nodes, dest_idx))
1036 {
1037 reg_errcode_t err = re_node_set_merge (&init_nodes,
1038 dfa->eclosures
1039 + dest_idx);
1040 if (err != REG_NOERROR)
1041 return err;
1042 i = 0;
1043 }
1044 }
1045 }
1046
1047 /* It must be the first time to invoke acquire_state. */
1048 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1049 /* We don't check ERR here, since the initial state must not be NULL. */
1050 if (BE (dfa->init_state == NULL, 0))
1051 return err;
1052 if (dfa->init_state->has_constraint)
1053 {
1054 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1055 CONTEXT_WORD);
1056 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1057 CONTEXT_NEWLINE);
1058 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1059 &init_nodes,
1060 CONTEXT_NEWLINE
1061 | CONTEXT_BEGBUF);
1062 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1063 || dfa->init_state_begbuf == NULL, 0))
1064 return err;
1065 }
1066 else
1067 dfa->init_state_word = dfa->init_state_nl
1068 = dfa->init_state_begbuf = dfa->init_state;
1069
1070 re_node_set_free (&init_nodes);
1071 return REG_NOERROR;
1072}
1073\f
1074#ifdef RE_ENABLE_I18N
1075/* If it is possible to do searching in single byte encoding instead of UTF-8
1076 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1077 DFA nodes where needed. */
1078
1079static void
1080optimize_utf8 (re_dfa_t *dfa)
1081{
1082 int node, i, mb_chars = 0, has_period = 0;
1083
1084 for (node = 0; node < dfa->nodes_len; ++node)
1085 switch (dfa->nodes[node].type)
1086 {
1087 case CHARACTER:
1088 if (dfa->nodes[node].opr.c >= 0x80)
1089 mb_chars = 1;
1090 break;
1091 case ANCHOR:
1092 switch (dfa->nodes[node].opr.ctx_type)
1093 {
1094 case LINE_FIRST:
1095 case LINE_LAST:
1096 case BUF_FIRST:
1097 case BUF_LAST:
1098 break;
1099 default:
1100 /* Word anchors etc. cannot be handled. It's okay to test
1101 opr.ctx_type since constraints (for all DFA nodes) are
1102 created by ORing one or more opr.ctx_type values. */
1103 return;
1104 }
1105 break;
1106 case OP_PERIOD:
1107 has_period = 1;
1108 break;
1109 case OP_BACK_REF:
1110 case OP_ALT:
1111 case END_OF_RE:
1112 case OP_DUP_ASTERISK:
1113 case OP_OPEN_SUBEXP:
1114 case OP_CLOSE_SUBEXP:
1115 break;
1116 case COMPLEX_BRACKET:
1117 return;
1118 case SIMPLE_BRACKET:
1119 /* Just double check. The non-ASCII range starts at 0x80. */
1120 assert (0x80 % BITSET_WORD_BITS == 0);
1121 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1122 if (dfa->nodes[node].opr.sbcset[i])
1123 return;
1124 break;
1125 default:
1126 abort ();
1127 }
1128
1129 if (mb_chars || has_period)
1130 for (node = 0; node < dfa->nodes_len; ++node)
1131 {
1132 if (dfa->nodes[node].type == CHARACTER
1133 && dfa->nodes[node].opr.c >= 0x80)
1134 dfa->nodes[node].mb_partial = 0;
1135 else if (dfa->nodes[node].type == OP_PERIOD)
1136 dfa->nodes[node].type = OP_UTF8_PERIOD;
1137 }
1138
1139 /* The search can be in single byte locale. */
1140 dfa->mb_cur_max = 1;
1141 dfa->is_utf8 = 0;
1142 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1143}
1144#endif
1145\f
1146/* Analyze the structure tree, and calculate "first", "next", "edest",
1147 "eclosure", and "inveclosure". */
1148
1149static reg_errcode_t
1150analyze (regex_t *preg)
1151{
1152 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1153 reg_errcode_t ret;
1154
1155 /* Allocate arrays. */
1156 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1157 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1158 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1159 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1160 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1161 || dfa->eclosures == NULL, 0))
1162 return REG_ESPACE;
1163
1164 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1165 if (dfa->subexp_map != NULL)
1166 {
1167 int i;
1168 for (i = 0; i < preg->re_nsub; i++)
1169 dfa->subexp_map[i] = i;
1170 preorder (dfa->str_tree, optimize_subexps, dfa);
1171 for (i = 0; i < preg->re_nsub; i++)
1172 if (dfa->subexp_map[i] != i)
1173 break;
1174 if (i == preg->re_nsub)
1175 {
1176 free (dfa->subexp_map);
1177 dfa->subexp_map = NULL;
1178 }
1179 }
1180
1181 ret = postorder (dfa->str_tree, lower_subexps, preg);
1182 if (BE (ret != REG_NOERROR, 0))
1183 return ret;
1184 ret = postorder (dfa->str_tree, calc_first, dfa);
1185 if (BE (ret != REG_NOERROR, 0))
1186 return ret;
1187 preorder (dfa->str_tree, calc_next, dfa);
1188 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1189 if (BE (ret != REG_NOERROR, 0))
1190 return ret;
1191 ret = calc_eclosure (dfa);
1192 if (BE (ret != REG_NOERROR, 0))
1193 return ret;
1194
1195 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1196 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1197 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1198 || dfa->nbackref)
1199 {
1200 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1201 if (BE (dfa->inveclosures == NULL, 0))
1202 return REG_ESPACE;
1203 ret = calc_inveclosure (dfa);
1204 }
1205
1206 return ret;
1207}
1208
1209/* Our parse trees are very unbalanced, so we cannot use a stack to
1210 implement parse tree visits. Instead, we use parent pointers and
1211 some hairy code in these two functions. */
1212static reg_errcode_t
1213postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1214 void *extra)
1215{
1216 bin_tree_t *node, *prev;
1217
1218 for (node = root; ; )
1219 {
1220 /* Descend down the tree, preferably to the left (or to the right
1221 if that's the only child). */
1222 while (node->left || node->right)
1223 if (node->left)
1224 node = node->left;
1225 else
1226 node = node->right;
1227
1228 do
1229 {
1230 reg_errcode_t err = fn (extra, node);
1231 if (BE (err != REG_NOERROR, 0))
1232 return err;
1233 if (node->parent == NULL)
1234 return REG_NOERROR;
1235 prev = node;
1236 node = node->parent;
1237 }
1238 /* Go up while we have a node that is reached from the right. */
1239 while (node->right == prev || node->right == NULL);
1240 node = node->right;
1241 }
1242}
1243
1244static reg_errcode_t
1245preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1246 void *extra)
1247{
1248 bin_tree_t *node;
1249
1250 for (node = root; ; )
1251 {
1252 reg_errcode_t err = fn (extra, node);
1253 if (BE (err != REG_NOERROR, 0))
1254 return err;
1255
1256 /* Go to the left node, or up and to the right. */
1257 if (node->left)
1258 node = node->left;
1259 else
1260 {
1261 bin_tree_t *prev = NULL;
1262 while (node->right == prev || node->right == NULL)
1263 {
1264 prev = node;
1265 node = node->parent;
1266 if (!node)
1267 return REG_NOERROR;
1268 }
1269 node = node->right;
1270 }
1271 }
1272}
1273
1274/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1275 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1276 backreferences as well. Requires a preorder visit. */
1277static reg_errcode_t
1278optimize_subexps (void *extra, bin_tree_t *node)
1279{
1280 re_dfa_t *dfa = (re_dfa_t *) extra;
1281
1282 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1283 {
1284 int idx = node->token.opr.idx;
1285 node->token.opr.idx = dfa->subexp_map[idx];
1286 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1287 }
1288
1289 else if (node->token.type == SUBEXP
1290 && node->left && node->left->token.type == SUBEXP)
1291 {
1292 int other_idx = node->left->token.opr.idx;
1293
1294 node->left = node->left->left;
1295 if (node->left)
1296 node->left->parent = node;
1297
1298 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1299 if (other_idx < BITSET_WORD_BITS)
1300 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1301 }
1302
1303 return REG_NOERROR;
1304}
1305
1306/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1307 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1308static reg_errcode_t
1309lower_subexps (void *extra, bin_tree_t *node)
1310{
1311 regex_t *preg = (regex_t *) extra;
1312 reg_errcode_t err = REG_NOERROR;
1313
1314 if (node->left && node->left->token.type == SUBEXP)
1315 {
1316 node->left = lower_subexp (&err, preg, node->left);
1317 if (node->left)
1318 node->left->parent = node;
1319 }
1320 if (node->right && node->right->token.type == SUBEXP)
1321 {
1322 node->right = lower_subexp (&err, preg, node->right);
1323 if (node->right)
1324 node->right->parent = node;
1325 }
1326
1327 return err;
1328}
1329
1330static bin_tree_t *
1331lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1332{
1333 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1334 bin_tree_t *body = node->left;
1335 bin_tree_t *op, *cls, *tree1, *tree;
1336
1337 if (preg->no_sub
1338 /* We do not optimize empty subexpressions, because otherwise we may
1339 have bad CONCAT nodes with NULL children. This is obviously not
1340 very common, so we do not lose much. An example that triggers
1341 this case is the sed "script" /\(\)/x. */
1342 && node->left != NULL
1343 && (node->token.opr.idx >= BITSET_WORD_BITS
1344 || !(dfa->used_bkref_map
1345 & ((bitset_word_t) 1 << node->token.opr.idx))))
1346 return node->left;
1347
1348 /* Convert the SUBEXP node to the concatenation of an
1349 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1350 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1351 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1352 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1353 tree = create_tree (dfa, op, tree1, CONCAT);
1354 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1355 {
1356 *err = REG_ESPACE;
1357 return NULL;
1358 }
1359
1360 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1361 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1362 return tree;
1363}
1364
1365/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1366 nodes. Requires a postorder visit. */
1367static reg_errcode_t
1368calc_first (void *extra, bin_tree_t *node)
1369{
1370 re_dfa_t *dfa = (re_dfa_t *) extra;
1371 if (node->token.type == CONCAT)
1372 {
1373 node->first = node->left->first;
1374 node->node_idx = node->left->node_idx;
1375 }
1376 else
1377 {
1378 node->first = node;
1379 node->node_idx = re_dfa_add_node (dfa, node->token);
1380 if (BE (node->node_idx == -1, 0))
1381 return REG_ESPACE;
1382 if (node->token.type == ANCHOR)
1383 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1384 }
1385 return REG_NOERROR;
1386}
1387
1388/* Pass 2: compute NEXT on the tree. Preorder visit. */
1389static reg_errcode_t
1390calc_next (void *extra, bin_tree_t *node)
1391{
1392 switch (node->token.type)
1393 {
1394 case OP_DUP_ASTERISK:
1395 node->left->next = node;
1396 break;
1397 case CONCAT:
1398 node->left->next = node->right->first;
1399 node->right->next = node->next;
1400 break;
1401 default:
1402 if (node->left)
1403 node->left->next = node->next;
1404 if (node->right)
1405 node->right->next = node->next;
1406 break;
1407 }
1408 return REG_NOERROR;
1409}
1410
1411/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1412static reg_errcode_t
1413link_nfa_nodes (void *extra, bin_tree_t *node)
1414{
1415 re_dfa_t *dfa = (re_dfa_t *) extra;
1416 int idx = node->node_idx;
1417 reg_errcode_t err = REG_NOERROR;
1418
1419 switch (node->token.type)
1420 {
1421 case CONCAT:
1422 break;
1423
1424 case END_OF_RE:
1425 assert (node->next == NULL);
1426 break;
1427
1428 case OP_DUP_ASTERISK:
1429 case OP_ALT:
1430 {
1431 int left, right;
1432 dfa->has_plural_match = 1;
1433 if (node->left != NULL)
1434 left = node->left->first->node_idx;
1435 else
1436 left = node->next->node_idx;
1437 if (node->right != NULL)
1438 right = node->right->first->node_idx;
1439 else
1440 right = node->next->node_idx;
1441 assert (left > -1);
1442 assert (right > -1);
1443 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1444 }
1445 break;
1446
1447 case ANCHOR:
1448 case OP_OPEN_SUBEXP:
1449 case OP_CLOSE_SUBEXP:
1450 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1451 break;
1452
1453 case OP_BACK_REF:
1454 dfa->nexts[idx] = node->next->node_idx;
1455 if (node->token.type == OP_BACK_REF)
1456 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1457 break;
1458
1459 default:
1460 assert (!IS_EPSILON_NODE (node->token.type));
1461 dfa->nexts[idx] = node->next->node_idx;
1462 break;
1463 }
1464
1465 return err;
1466}
1467
1468/* Duplicate the epsilon closure of the node ROOT_NODE.
1469 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1470 to their own constraint. */
1471
1472static reg_errcode_t
1473internal_function
1474duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1475 int root_node, unsigned int init_constraint)
1476{
1477 int org_node, clone_node, ret;
1478 unsigned int constraint = init_constraint;
1479 for (org_node = top_org_node, clone_node = top_clone_node;;)
1480 {
1481 int org_dest, clone_dest;
1482 if (dfa->nodes[org_node].type == OP_BACK_REF)
1483 {
1484 /* If the back reference epsilon-transit, its destination must
1485 also have the constraint. Then duplicate the epsilon closure
1486 of the destination of the back reference, and store it in
1487 edests of the back reference. */
1488 org_dest = dfa->nexts[org_node];
1489 re_node_set_empty (dfa->edests + clone_node);
1490 clone_dest = duplicate_node (dfa, org_dest, constraint);
1491 if (BE (clone_dest == -1, 0))
1492 return REG_ESPACE;
1493 dfa->nexts[clone_node] = dfa->nexts[org_node];
1494 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 if (BE (ret < 0, 0))
1496 return REG_ESPACE;
1497 }
1498 else if (dfa->edests[org_node].nelem == 0)
1499 {
1500 /* In case of the node can't epsilon-transit, don't duplicate the
1501 destination and store the original destination as the
1502 destination of the node. */
1503 dfa->nexts[clone_node] = dfa->nexts[org_node];
1504 break;
1505 }
1506 else if (dfa->edests[org_node].nelem == 1)
1507 {
1508 /* In case of the node can epsilon-transit, and it has only one
1509 destination. */
1510 org_dest = dfa->edests[org_node].elems[0];
1511 re_node_set_empty (dfa->edests + clone_node);
1512 /* If the node is root_node itself, it means the epsilon clsoure
1513 has a loop. Then tie it to the destination of the root_node. */
1514 if (org_node == root_node && clone_node != org_node)
1515 {
1516 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1517 if (BE (ret < 0, 0))
1518 return REG_ESPACE;
1519 break;
1520 }
1521 /* In case of the node has another constraint, add it. */
1522 constraint |= dfa->nodes[org_node].constraint;
1523 clone_dest = duplicate_node (dfa, org_dest, constraint);
1524 if (BE (clone_dest == -1, 0))
1525 return REG_ESPACE;
1526 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527 if (BE (ret < 0, 0))
1528 return REG_ESPACE;
1529 }
1530 else /* dfa->edests[org_node].nelem == 2 */
1531 {
1532 /* In case of the node can epsilon-transit, and it has two
1533 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1534 org_dest = dfa->edests[org_node].elems[0];
1535 re_node_set_empty (dfa->edests + clone_node);
1536 /* Search for a duplicated node which satisfies the constraint. */
1537 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1538 if (clone_dest == -1)
1539 {
1540 /* There is no such duplicated node, create a new one. */
1541 reg_errcode_t err;
1542 clone_dest = duplicate_node (dfa, org_dest, constraint);
1543 if (BE (clone_dest == -1, 0))
1544 return REG_ESPACE;
1545 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546 if (BE (ret < 0, 0))
1547 return REG_ESPACE;
1548 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1549 root_node, constraint);
1550 if (BE (err != REG_NOERROR, 0))
1551 return err;
1552 }
1553 else
1554 {
1555 /* There is a duplicated node which satisfies the constraint,
1556 use it to avoid infinite loop. */
1557 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558 if (BE (ret < 0, 0))
1559 return REG_ESPACE;
1560 }
1561
1562 org_dest = dfa->edests[org_node].elems[1];
1563 clone_dest = duplicate_node (dfa, org_dest, constraint);
1564 if (BE (clone_dest == -1, 0))
1565 return REG_ESPACE;
1566 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 if (BE (ret < 0, 0))
1568 return REG_ESPACE;
1569 }
1570 org_node = org_dest;
1571 clone_node = clone_dest;
1572 }
1573 return REG_NOERROR;
1574}
1575
1576/* Search for a node which is duplicated from the node ORG_NODE, and
1577 satisfies the constraint CONSTRAINT. */
1578
1579static int
1580search_duplicated_node (const re_dfa_t *dfa, int org_node,
1581 unsigned int constraint)
1582{
1583 int idx;
1584 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585 {
1586 if (org_node == dfa->org_indices[idx]
1587 && constraint == dfa->nodes[idx].constraint)
1588 return idx; /* Found. */
1589 }
1590 return -1; /* Not found. */
1591}
1592
1593/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594 Return the index of the new node, or -1 if insufficient storage is
1595 available. */
1596
1597static int
1598duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1599{
1600 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1601 if (BE (dup_idx != -1, 1))
1602 {
1603 dfa->nodes[dup_idx].constraint = constraint;
1604 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1605 dfa->nodes[dup_idx].duplicated = 1;
1606
1607 /* Store the index of the original node. */
1608 dfa->org_indices[dup_idx] = org_idx;
1609 }
1610 return dup_idx;
1611}
1612
1613static reg_errcode_t
1614calc_inveclosure (re_dfa_t *dfa)
1615{
1616 int src, idx, ret;
1617 for (idx = 0; idx < dfa->nodes_len; ++idx)
1618 re_node_set_init_empty (dfa->inveclosures + idx);
1619
1620 for (src = 0; src < dfa->nodes_len; ++src)
1621 {
1622 int *elems = dfa->eclosures[src].elems;
1623 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624 {
1625 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626 if (BE (ret == -1, 0))
1627 return REG_ESPACE;
1628 }
1629 }
1630
1631 return REG_NOERROR;
1632}
1633
1634/* Calculate "eclosure" for all the node in DFA. */
1635
1636static reg_errcode_t
1637calc_eclosure (re_dfa_t *dfa)
1638{
1639 int node_idx, incomplete;
1640#ifdef DEBUG
1641 assert (dfa->nodes_len > 0);
1642#endif
1643 incomplete = 0;
1644 /* For each nodes, calculate epsilon closure. */
1645 for (node_idx = 0; ; ++node_idx)
1646 {
1647 reg_errcode_t err;
1648 re_node_set eclosure_elem;
1649 if (node_idx == dfa->nodes_len)
1650 {
1651 if (!incomplete)
1652 break;
1653 incomplete = 0;
1654 node_idx = 0;
1655 }
1656
1657#ifdef DEBUG
1658 assert (dfa->eclosures[node_idx].nelem != -1);
1659#endif
1660
1661 /* If we have already calculated, skip it. */
1662 if (dfa->eclosures[node_idx].nelem != 0)
1663 continue;
1664 /* Calculate epsilon closure of `node_idx'. */
1665 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1666 if (BE (err != REG_NOERROR, 0))
1667 return err;
1668
1669 if (dfa->eclosures[node_idx].nelem == 0)
1670 {
1671 incomplete = 1;
1672 re_node_set_free (&eclosure_elem);
1673 }
1674 }
1675 return REG_NOERROR;
1676}
1677
1678/* Calculate epsilon closure of NODE. */
1679
1680static reg_errcode_t
1681calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1682{
1683 reg_errcode_t err;
1684 int i;
1685 re_node_set eclosure;
1686 int ret;
1687 int incomplete = 0;
1688 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689 if (BE (err != REG_NOERROR, 0))
1690 return err;
1691
1692 /* This indicates that we are calculating this node now.
1693 We reference this value to avoid infinite loop. */
1694 dfa->eclosures[node].nelem = -1;
1695
1696 /* If the current node has constraints, duplicate all nodes
1697 since they must inherit the constraints. */
1698 if (dfa->nodes[node].constraint
1699 && dfa->edests[node].nelem
1700 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701 {
1702 err = duplicate_node_closure (dfa, node, node, node,
1703 dfa->nodes[node].constraint);
1704 if (BE (err != REG_NOERROR, 0))
1705 return err;
1706 }
1707
1708 /* Expand each epsilon destination nodes. */
1709 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710 for (i = 0; i < dfa->edests[node].nelem; ++i)
1711 {
1712 re_node_set eclosure_elem;
1713 int edest = dfa->edests[node].elems[i];
1714 /* If calculating the epsilon closure of `edest' is in progress,
1715 return intermediate result. */
1716 if (dfa->eclosures[edest].nelem == -1)
1717 {
1718 incomplete = 1;
1719 continue;
1720 }
1721 /* If we haven't calculated the epsilon closure of `edest' yet,
1722 calculate now. Otherwise use calculated epsilon closure. */
1723 if (dfa->eclosures[edest].nelem == 0)
1724 {
1725 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 }
1729 else
1730 eclosure_elem = dfa->eclosures[edest];
1731 /* Merge the epsilon closure of `edest'. */
1732 err = re_node_set_merge (&eclosure, &eclosure_elem);
1733 if (BE (err != REG_NOERROR, 0))
1734 return err;
1735 /* If the epsilon closure of `edest' is incomplete,
1736 the epsilon closure of this node is also incomplete. */
1737 if (dfa->eclosures[edest].nelem == 0)
1738 {
1739 incomplete = 1;
1740 re_node_set_free (&eclosure_elem);
1741 }
1742 }
1743
1744 /* An epsilon closure includes itself. */
1745 ret = re_node_set_insert (&eclosure, node);
1746 if (BE (ret < 0, 0))
1747 return REG_ESPACE;
1748 if (incomplete && !root)
1749 dfa->eclosures[node].nelem = 0;
1750 else
1751 dfa->eclosures[node] = eclosure;
1752 *new_set = eclosure;
1753 return REG_NOERROR;
1754}
1755\f
1756/* Functions for token which are used in the parser. */
1757
1758/* Fetch a token from INPUT.
1759 We must not use this function inside bracket expressions. */
1760
1761static void
1762internal_function
1763fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764{
1765 re_string_skip_bytes (input, peek_token (result, input, syntax));
1766}
1767
1768/* Peek a token from INPUT, and return the length of the token.
1769 We must not use this function inside bracket expressions. */
1770
1771static int
1772internal_function
1773peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774{
1775 unsigned char c;
1776
1777 if (re_string_eoi (input))
1778 {
1779 token->type = END_OF_RE;
1780 return 0;
1781 }
1782
1783 c = re_string_peek_byte (input, 0);
1784 token->opr.c = c;
1785
1786 token->word_char = 0;
1787#ifdef RE_ENABLE_I18N
1788 token->mb_partial = 0;
1789 if (input->mb_cur_max > 1 &&
1790 !re_string_first_byte (input, re_string_cur_idx (input)))
1791 {
1792 token->type = CHARACTER;
1793 token->mb_partial = 1;
1794 return 1;
1795 }
1796#endif
1797 if (c == '\\')
1798 {
1799 unsigned char c2;
1800 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801 {
1802 token->type = BACK_SLASH;
1803 return 1;
1804 }
1805
1806 c2 = re_string_peek_byte_case (input, 1);
1807 token->opr.c = c2;
1808 token->type = CHARACTER;
1809#ifdef RE_ENABLE_I18N
1810 if (input->mb_cur_max > 1)
1811 {
1812 wint_t wc = re_string_wchar_at (input,
1813 re_string_cur_idx (input) + 1);
1814 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815 }
1816 else
1817#endif
1818 token->word_char = IS_WORD_CHAR (c2) != 0;
1819
1820 switch (c2)
1821 {
1822 case '|':
1823 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824 token->type = OP_ALT;
1825 break;
1826 case '1': case '2': case '3': case '4': case '5':
1827 case '6': case '7': case '8': case '9':
1828 if (!(syntax & RE_NO_BK_REFS))
1829 {
1830 token->type = OP_BACK_REF;
1831 token->opr.idx = c2 - '1';
1832 }
1833 break;
1834 case '<':
1835 if (!(syntax & RE_NO_GNU_OPS))
1836 {
1837 token->type = ANCHOR;
1838 token->opr.ctx_type = WORD_FIRST;
1839 }
1840 break;
1841 case '>':
1842 if (!(syntax & RE_NO_GNU_OPS))
1843 {
1844 token->type = ANCHOR;
1845 token->opr.ctx_type = WORD_LAST;
1846 }
1847 break;
1848 case 'b':
1849 if (!(syntax & RE_NO_GNU_OPS))
1850 {
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = WORD_DELIM;
1853 }
1854 break;
1855 case 'B':
1856 if (!(syntax & RE_NO_GNU_OPS))
1857 {
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = NOT_WORD_DELIM;
1860 }
1861 break;
1862 case 'w':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_WORD;
1865 break;
1866 case 'W':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_NOTWORD;
1869 break;
1870 case 's':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_SPACE;
1873 break;
1874 case 'S':
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_NOTSPACE;
1877 break;
1878 case '`':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 {
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = BUF_FIRST;
1883 }
1884 break;
1885 case '\'':
1886 if (!(syntax & RE_NO_GNU_OPS))
1887 {
1888 token->type = ANCHOR;
1889 token->opr.ctx_type = BUF_LAST;
1890 }
1891 break;
1892 case '(':
1893 if (!(syntax & RE_NO_BK_PARENS))
1894 token->type = OP_OPEN_SUBEXP;
1895 break;
1896 case ')':
1897 if (!(syntax & RE_NO_BK_PARENS))
1898 token->type = OP_CLOSE_SUBEXP;
1899 break;
1900 case '+':
1901 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902 token->type = OP_DUP_PLUS;
1903 break;
1904 case '?':
1905 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906 token->type = OP_DUP_QUESTION;
1907 break;
1908 case '{':
1909 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910 token->type = OP_OPEN_DUP_NUM;
1911 break;
1912 case '}':
1913 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914 token->type = OP_CLOSE_DUP_NUM;
1915 break;
1916 default:
1917 break;
1918 }
1919 return 2;
1920 }
1921
1922 token->type = CHARACTER;
1923#ifdef RE_ENABLE_I18N
1924 if (input->mb_cur_max > 1)
1925 {
1926 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928 }
1929 else
1930#endif
1931 token->word_char = IS_WORD_CHAR (token->opr.c);
1932
1933 switch (c)
1934 {
1935 case '\n':
1936 if (syntax & RE_NEWLINE_ALT)
1937 token->type = OP_ALT;
1938 break;
1939 case '|':
1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941 token->type = OP_ALT;
1942 break;
1943 case '*':
1944 token->type = OP_DUP_ASTERISK;
1945 break;
1946 case '+':
1947 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948 token->type = OP_DUP_PLUS;
1949 break;
1950 case '?':
1951 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952 token->type = OP_DUP_QUESTION;
1953 break;
1954 case '{':
1955 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956 token->type = OP_OPEN_DUP_NUM;
1957 break;
1958 case '}':
1959 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960 token->type = OP_CLOSE_DUP_NUM;
1961 break;
1962 case '(':
1963 if (syntax & RE_NO_BK_PARENS)
1964 token->type = OP_OPEN_SUBEXP;
1965 break;
1966 case ')':
1967 if (syntax & RE_NO_BK_PARENS)
1968 token->type = OP_CLOSE_SUBEXP;
1969 break;
1970 case '[':
1971 token->type = OP_OPEN_BRACKET;
1972 break;
1973 case '.':
1974 token->type = OP_PERIOD;
1975 break;
1976 case '^':
1977 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978 re_string_cur_idx (input) != 0)
1979 {
1980 char prev = re_string_peek_byte (input, -1);
1981 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982 break;
1983 }
1984 token->type = ANCHOR;
1985 token->opr.ctx_type = LINE_FIRST;
1986 break;
1987 case '$':
1988 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989 re_string_cur_idx (input) + 1 != re_string_length (input))
1990 {
1991 re_token_t next;
1992 re_string_skip_bytes (input, 1);
1993 peek_token (&next, input, syntax);
1994 re_string_skip_bytes (input, -1);
1995 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996 break;
1997 }
1998 token->type = ANCHOR;
1999 token->opr.ctx_type = LINE_LAST;
2000 break;
2001 default:
2002 break;
2003 }
2004 return 1;
2005}
2006
2007/* Peek a token from INPUT, and return the length of the token.
2008 We must not use this function out of bracket expressions. */
2009
2010static int
2011internal_function
2012peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013{
2014 unsigned char c;
2015 if (re_string_eoi (input))
2016 {
2017 token->type = END_OF_RE;
2018 return 0;
2019 }
2020 c = re_string_peek_byte (input, 0);
2021 token->opr.c = c;
2022
2023#ifdef RE_ENABLE_I18N
2024 if (input->mb_cur_max > 1 &&
2025 !re_string_first_byte (input, re_string_cur_idx (input)))
2026 {
2027 token->type = CHARACTER;
2028 return 1;
2029 }
2030#endif /* RE_ENABLE_I18N */
2031
2032 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033 && re_string_cur_idx (input) + 1 < re_string_length (input))
2034 {
2035 /* In this case, '\' escape a character. */
2036 unsigned char c2;
2037 re_string_skip_bytes (input, 1);
2038 c2 = re_string_peek_byte (input, 0);
2039 token->opr.c = c2;
2040 token->type = CHARACTER;
2041 return 1;
2042 }
2043 if (c == '[') /* '[' is a special char in a bracket exps. */
2044 {
2045 unsigned char c2;
2046 int token_len;
2047 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048 c2 = re_string_peek_byte (input, 1);
2049 else
2050 c2 = 0;
2051 token->opr.c = c2;
2052 token_len = 2;
2053 switch (c2)
2054 {
2055 case '.':
2056 token->type = OP_OPEN_COLL_ELEM;
2057 break;
2058 case '=':
2059 token->type = OP_OPEN_EQUIV_CLASS;
2060 break;
2061 case ':':
2062 if (syntax & RE_CHAR_CLASSES)
2063 {
2064 token->type = OP_OPEN_CHAR_CLASS;
2065 break;
2066 }
2067 /* else fall through. */
2068 default:
2069 token->type = CHARACTER;
2070 token->opr.c = c;
2071 token_len = 1;
2072 break;
2073 }
2074 return token_len;
2075 }
2076 switch (c)
2077 {
2078 case '-':
2079 token->type = OP_CHARSET_RANGE;
2080 break;
2081 case ']':
2082 token->type = OP_CLOSE_BRACKET;
2083 break;
2084 case '^':
2085 token->type = OP_NON_MATCH_LIST;
2086 break;
2087 default:
2088 token->type = CHARACTER;
2089 }
2090 return 1;
2091}
2092\f
2093/* Functions for parser. */
2094
2095/* Entry point of the parser.
2096 Parse the regular expression REGEXP and return the structure tree.
2097 If an error has occurred, ERR is set by error code, and return NULL.
2098 This function build the following tree, from regular expression <reg_exp>:
2099 CAT
2100 / \
2101 / \
2102 <reg_exp> EOR
2103
2104 CAT means concatenation.
2105 EOR means end of regular expression. */
2106
2107static bin_tree_t *
2108parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109 reg_errcode_t *err)
2110{
2111 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112 bin_tree_t *tree, *eor, *root;
2113 re_token_t current_token;
2114 dfa->syntax = syntax;
2115 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2117 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118 return NULL;
2119 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120 if (tree != NULL)
2121 root = create_tree (dfa, tree, eor, CONCAT);
2122 else
2123 root = eor;
2124 if (BE (eor == NULL || root == NULL, 0))
2125 {
2126 *err = REG_ESPACE;
2127 return NULL;
2128 }
2129 return root;
2130}
2131
2132/* This function build the following tree, from regular expression
2133 <branch1>|<branch2>:
2134 ALT
2135 / \
2136 / \
2137 <branch1> <branch2>
2138
2139 ALT means alternative, which represents the operator `|'. */
2140
2141static bin_tree_t *
2142parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2144{
2145 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146 bin_tree_t *tree, *branch = NULL;
2147 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149 return NULL;
2150
2151 while (token->type == OP_ALT)
2152 {
2153 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154 if (token->type != OP_ALT && token->type != END_OF_RE
2155 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156 {
2157 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159 return NULL;
2160 }
2161 else
2162 branch = NULL;
2163 tree = create_tree (dfa, tree, branch, OP_ALT);
2164 if (BE (tree == NULL, 0))
2165 {
2166 *err = REG_ESPACE;
2167 return NULL;
2168 }
2169 }
2170 return tree;
2171}
2172
2173/* This function build the following tree, from regular expression
2174 <exp1><exp2>:
2175 CAT
2176 / \
2177 / \
2178 <exp1> <exp2>
2179
2180 CAT means concatenation. */
2181
2182static bin_tree_t *
2183parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2185{
2186 bin_tree_t *tree, *exp;
2187 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 return NULL;
2191
2192 while (token->type != OP_ALT && token->type != END_OF_RE
2193 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194 {
2195 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2196 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2197 {
2198 return NULL;
2199 }
2200 if (tree != NULL && exp != NULL)
2201 {
2202 tree = create_tree (dfa, tree, exp, CONCAT);
2203 if (tree == NULL)
2204 {
2205 *err = REG_ESPACE;
2206 return NULL;
2207 }
2208 }
2209 else if (tree == NULL)
2210 tree = exp;
2211 /* Otherwise exp == NULL, we don't need to create new tree. */
2212 }
2213 return tree;
2214}
2215
2216/* This function build the following tree, from regular expression a*:
2217 *
2218 |
2219 a
2220*/
2221
2222static bin_tree_t *
2223parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2225{
2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227 bin_tree_t *tree;
2228 switch (token->type)
2229 {
2230 case CHARACTER:
2231 tree = create_token_tree (dfa, NULL, NULL, token);
2232 if (BE (tree == NULL, 0))
2233 {
2234 *err = REG_ESPACE;
2235 return NULL;
2236 }
2237#ifdef RE_ENABLE_I18N
2238 if (dfa->mb_cur_max > 1)
2239 {
2240 while (!re_string_eoi (regexp)
2241 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242 {
2243 bin_tree_t *mbc_remain;
2244 fetch_token (token, regexp, syntax);
2245 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247 if (BE (mbc_remain == NULL || tree == NULL, 0))
2248 {
2249 *err = REG_ESPACE;
2250 return NULL;
2251 }
2252 }
2253 }
2254#endif
2255 break;
2256 case OP_OPEN_SUBEXP:
2257 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259 return NULL;
2260 break;
2261 case OP_OPEN_BRACKET:
2262 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264 return NULL;
2265 break;
2266 case OP_BACK_REF:
2267 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268 {
2269 *err = REG_ESUBREG;
2270 return NULL;
2271 }
2272 dfa->used_bkref_map |= 1 << token->opr.idx;
2273 tree = create_token_tree (dfa, NULL, NULL, token);
2274 if (BE (tree == NULL, 0))
2275 {
2276 *err = REG_ESPACE;
2277 return NULL;
2278 }
2279 ++dfa->nbackref;
2280 dfa->has_mb_node = 1;
2281 break;
2282 case OP_OPEN_DUP_NUM:
2283 if (syntax & RE_CONTEXT_INVALID_DUP)
2284 {
2285 *err = REG_BADRPT;
2286 return NULL;
2287 }
2288 /* FALLTHROUGH */
2289 case OP_DUP_ASTERISK:
2290 case OP_DUP_PLUS:
2291 case OP_DUP_QUESTION:
2292 if (syntax & RE_CONTEXT_INVALID_OPS)
2293 {
2294 *err = REG_BADRPT;
2295 return NULL;
2296 }
2297 else if (syntax & RE_CONTEXT_INDEP_OPS)
2298 {
2299 fetch_token (token, regexp, syntax);
2300 return parse_expression (regexp, preg, token, syntax, nest, err);
2301 }
2302 /* else fall through */
2303 case OP_CLOSE_SUBEXP:
2304 if ((token->type == OP_CLOSE_SUBEXP) &&
2305 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306 {
2307 *err = REG_ERPAREN;
2308 return NULL;
2309 }
2310 /* else fall through */
2311 case OP_CLOSE_DUP_NUM:
2312 /* We treat it as a normal character. */
2313
2314 /* Then we can these characters as normal characters. */
2315 token->type = CHARACTER;
2316 /* mb_partial and word_char bits should be initialized already
2317 by peek_token. */
2318 tree = create_token_tree (dfa, NULL, NULL, token);
2319 if (BE (tree == NULL, 0))
2320 {
2321 *err = REG_ESPACE;
2322 return NULL;
2323 }
2324 break;
2325 case ANCHOR:
2326 if ((token->opr.ctx_type
2327 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328 && dfa->word_ops_used == 0)
2329 init_word_char (dfa);
2330 if (token->opr.ctx_type == WORD_DELIM
2331 || token->opr.ctx_type == NOT_WORD_DELIM)
2332 {
2333 bin_tree_t *tree_first, *tree_last;
2334 if (token->opr.ctx_type == WORD_DELIM)
2335 {
2336 token->opr.ctx_type = WORD_FIRST;
2337 tree_first = create_token_tree (dfa, NULL, NULL, token);
2338 token->opr.ctx_type = WORD_LAST;
2339 }
2340 else
2341 {
2342 token->opr.ctx_type = INSIDE_WORD;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = INSIDE_NOTWORD;
2345 }
2346 tree_last = create_token_tree (dfa, NULL, NULL, token);
2347 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349 {
2350 *err = REG_ESPACE;
2351 return NULL;
2352 }
2353 }
2354 else
2355 {
2356 tree = create_token_tree (dfa, NULL, NULL, token);
2357 if (BE (tree == NULL, 0))
2358 {
2359 *err = REG_ESPACE;
2360 return NULL;
2361 }
2362 }
2363 /* We must return here, since ANCHORs can't be followed
2364 by repetition operators.
2365 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2367 fetch_token (token, regexp, syntax);
2368 return tree;
2369 case OP_PERIOD:
2370 tree = create_token_tree (dfa, NULL, NULL, token);
2371 if (BE (tree == NULL, 0))
2372 {
2373 *err = REG_ESPACE;
2374 return NULL;
2375 }
2376 if (dfa->mb_cur_max > 1)
2377 dfa->has_mb_node = 1;
2378 break;
2379 case OP_WORD:
2380 case OP_NOTWORD:
2381 tree = build_charclass_op (dfa, regexp->trans,
2382 "alnum",
2383 "_",
2384 token->type == OP_NOTWORD, err);
2385 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386 return NULL;
2387 break;
2388 case OP_SPACE:
2389 case OP_NOTSPACE:
2390 tree = build_charclass_op (dfa, regexp->trans,
2391 "space",
2392 "",
2393 token->type == OP_NOTSPACE, err);
2394 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395 return NULL;
2396 break;
2397 case OP_ALT:
2398 case END_OF_RE:
2399 return NULL;
2400 case BACK_SLASH:
2401 *err = REG_EESCAPE;
2402 return NULL;
2403 default:
2404 /* Must not happen? */
2405#ifdef DEBUG
2406 assert (0);
2407#endif
2408 return NULL;
2409 }
2410 fetch_token (token, regexp, syntax);
2411
2412 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414 {
2415 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417 return NULL;
2418 /* In BRE consecutive duplications are not allowed. */
2419 if ((syntax & RE_CONTEXT_INVALID_DUP)
2420 && (token->type == OP_DUP_ASTERISK
2421 || token->type == OP_OPEN_DUP_NUM))
2422 {
2423 *err = REG_BADRPT;
2424 return NULL;
2425 }
2426 }
2427
2428 return tree;
2429}
2430
2431/* This function build the following tree, from regular expression
2432 (<reg_exp>):
2433 SUBEXP
2434 |
2435 <reg_exp>
2436*/
2437
2438static bin_tree_t *
2439parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2441{
2442 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443 bin_tree_t *tree;
2444 size_t cur_nsub;
2445 cur_nsub = preg->re_nsub++;
2446
2447 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
2449 /* The subexpression may be a null string. */
2450 if (token->type == OP_CLOSE_SUBEXP)
2451 tree = NULL;
2452 else
2453 {
2454 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456 *err = REG_EPAREN;
2457 if (BE (*err != REG_NOERROR, 0))
2458 return NULL;
2459 }
2460
2461 if (cur_nsub <= '9' - '1')
2462 dfa->completed_bkref_map |= 1 << cur_nsub;
2463
2464 tree = create_tree (dfa, tree, NULL, SUBEXP);
2465 if (BE (tree == NULL, 0))
2466 {
2467 *err = REG_ESPACE;
2468 return NULL;
2469 }
2470 tree->token.opr.idx = cur_nsub;
2471 return tree;
2472}
2473
2474/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2475
2476static bin_tree_t *
2477parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479{
2480 bin_tree_t *tree = NULL, *old_tree = NULL;
2481 int i, start, end, start_idx = re_string_cur_idx (regexp);
2482#ifndef RE_TOKEN_INIT_BUG
2483 re_token_t start_token = *token;
2484#else
2485 re_token_t start_token;
2486
2487 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2488#endif
2489
2490 if (token->type == OP_OPEN_DUP_NUM)
2491 {
2492 end = 0;
2493 start = fetch_number (regexp, token, syntax);
2494 if (start == -1)
2495 {
2496 if (token->type == CHARACTER && token->opr.c == ',')
2497 start = 0; /* We treat "{,m}" as "{0,m}". */
2498 else
2499 {
2500 *err = REG_BADBR; /* <re>{} is invalid. */
2501 return NULL;
2502 }
2503 }
2504 if (BE (start != -2, 1))
2505 {
2506 /* We treat "{n}" as "{n,n}". */
2507 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2508 : ((token->type == CHARACTER && token->opr.c == ',')
2509 ? fetch_number (regexp, token, syntax) : -2));
2510 }
2511 if (BE (start == -2 || end == -2, 0))
2512 {
2513 /* Invalid sequence. */
2514 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2515 {
2516 if (token->type == END_OF_RE)
2517 *err = REG_EBRACE;
2518 else
2519 *err = REG_BADBR;
2520
2521 return NULL;
2522 }
2523
2524 /* If the syntax bit is set, rollback. */
2525 re_string_set_index (regexp, start_idx);
2526 *token = start_token;
2527 token->type = CHARACTER;
2528 /* mb_partial and word_char bits should be already initialized by
2529 peek_token. */
2530 return elem;
2531 }
2532
2533 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2534 {
2535 /* First number greater than second. */
2536 *err = REG_BADBR;
2537 return NULL;
2538 }
2539 }
2540 else
2541 {
2542 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2543 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2544 }
2545
2546 fetch_token (token, regexp, syntax);
2547
2548 if (BE (elem == NULL, 0))
2549 return NULL;
2550 if (BE (start == 0 && end == 0, 0))
2551 {
2552 postorder (elem, free_tree, NULL);
2553 return NULL;
2554 }
2555
2556 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2557 if (BE (start > 0, 0))
2558 {
2559 tree = elem;
2560 for (i = 2; i <= start; ++i)
2561 {
2562 elem = duplicate_tree (elem, dfa);
2563 tree = create_tree (dfa, tree, elem, CONCAT);
2564 if (BE (elem == NULL || tree == NULL, 0))
2565 goto parse_dup_op_espace;
2566 }
2567
2568 if (start == end)
2569 return tree;
2570
2571 /* Duplicate ELEM before it is marked optional. */
2572 elem = duplicate_tree (elem, dfa);
2573 old_tree = tree;
2574 }
2575 else
2576 old_tree = NULL;
2577
2578 if (elem->token.type == SUBEXP)
2579 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2580
2581 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2582 if (BE (tree == NULL, 0))
2583 goto parse_dup_op_espace;
2584
2585 /* This loop is actually executed only when end != -1,
2586 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2587 already created the start+1-th copy. */
2588 for (i = start + 2; i <= end; ++i)
2589 {
2590 elem = duplicate_tree (elem, dfa);
2591 tree = create_tree (dfa, tree, elem, CONCAT);
2592 if (BE (elem == NULL || tree == NULL, 0))
2593 goto parse_dup_op_espace;
2594
2595 tree = create_tree (dfa, tree, NULL, OP_ALT);
2596 if (BE (tree == NULL, 0))
2597 goto parse_dup_op_espace;
2598 }
2599
2600 if (old_tree)
2601 tree = create_tree (dfa, old_tree, tree, CONCAT);
2602
2603 return tree;
2604
2605 parse_dup_op_espace:
2606 *err = REG_ESPACE;
2607 return NULL;
2608}
2609
2610/* Size of the names for collating symbol/equivalence_class/character_class.
2611 I'm not sure, but maybe enough. */
2612#define BRACKET_NAME_BUF_SIZE 32
2613
2614#ifndef _LIBC
2615 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616 Build the range expression which starts from START_ELEM, and ends
2617 at END_ELEM. The result are written to MBCSET and SBCSET.
2618 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619 mbcset->range_ends, is a pointer argument since we may
2620 update it. */
2621
2622static reg_errcode_t
2623internal_function
2624# ifdef RE_ENABLE_I18N
2625build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2626 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2627# else /* not RE_ENABLE_I18N */
2628build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2629 bracket_elem_t *end_elem)
2630# endif /* not RE_ENABLE_I18N */
2631{
2632 unsigned int start_ch, end_ch;
2633 /* Equivalence Classes and Character Classes can't be a range start/end. */
2634 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2635 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2636 0))
2637 return REG_ERANGE;
2638
2639 /* We can handle no multi character collating elements without libc
2640 support. */
2641 if (BE ((start_elem->type == COLL_SYM
2642 && strlen ((char *) start_elem->opr.name) > 1)
2643 || (end_elem->type == COLL_SYM
2644 && strlen ((char *) end_elem->opr.name) > 1), 0))
2645 return REG_ECOLLATE;
2646
2647# ifdef RE_ENABLE_I18N
2648 {
2649 wchar_t wc;
2650 wint_t start_wc;
2651 wint_t end_wc;
2652 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2653
2654 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2655 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2656 : 0));
2657 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2658 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2659 : 0));
2660#ifdef GAWK
2661 /*
2662 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2663 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2664 * unsigned, so we don't have sign extension problems.
2665 */
2666 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667 ? start_ch : start_elem->opr.wch);
2668 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669 ? end_ch : end_elem->opr.wch);
2670#else
2671 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2672 ? __btowc (start_ch) : start_elem->opr.wch);
2673 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2674 ? __btowc (end_ch) : end_elem->opr.wch);
2675#endif
2676 if (start_wc == WEOF || end_wc == WEOF)
2677 return REG_ECOLLATE;
2678 cmp_buf[0] = start_wc;
2679 cmp_buf[4] = end_wc;
2680 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2681 return REG_ERANGE;
2682
2683 /* Got valid collation sequence values, add them as a new entry.
2684 However, for !_LIBC we have no collation elements: if the
2685 character set is single byte, the single byte character set
2686 that we build below suffices. parse_bracket_exp passes
2687 no MBCSET if dfa->mb_cur_max == 1. */
2688 if (mbcset)
2689 {
2690 /* Check the space of the arrays. */
2691 if (BE (*range_alloc == mbcset->nranges, 0))
2692 {
2693 /* There is not enough space, need realloc. */
2694 wchar_t *new_array_start, *new_array_end;
2695 int new_nranges;
2696
2697 /* +1 in case of mbcset->nranges is 0. */
2698 new_nranges = 2 * mbcset->nranges + 1;
2699 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2700 are NULL if *range_alloc == 0. */
2701 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2702 new_nranges);
2703 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2704 new_nranges);
2705
2706 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2707 return REG_ESPACE;
2708
2709 mbcset->range_starts = new_array_start;
2710 mbcset->range_ends = new_array_end;
2711 *range_alloc = new_nranges;
2712 }
2713
2714 mbcset->range_starts[mbcset->nranges] = start_wc;
2715 mbcset->range_ends[mbcset->nranges++] = end_wc;
2716 }
2717
2718 /* Build the table for single byte characters. */
2719 for (wc = 0; wc < SBC_MAX; ++wc)
2720 {
2721 cmp_buf[2] = wc;
2722 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2723 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2724 bitset_set (sbcset, wc);
2725 }
2726 }
2727# else /* not RE_ENABLE_I18N */
2728 {
2729 unsigned int ch;
2730 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2731 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2732 : 0));
2733 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2734 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2735 : 0));
2736 if (start_ch > end_ch)
2737 return REG_ERANGE;
2738 /* Build the table for single byte characters. */
2739 for (ch = 0; ch < SBC_MAX; ++ch)
2740 if (start_ch <= ch && ch <= end_ch)
2741 bitset_set (sbcset, ch);
2742 }
2743# endif /* not RE_ENABLE_I18N */
2744 return REG_NOERROR;
2745}
2746#endif /* not _LIBC */
2747
2748#ifndef _LIBC
2749/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2750 Build the collating element which is represented by NAME.
2751 The result are written to MBCSET and SBCSET.
2752 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2753 pointer argument since we may update it. */
2754
2755static reg_errcode_t
2756internal_function
2757# ifdef RE_ENABLE_I18N
2758build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2759 int *coll_sym_alloc, const unsigned char *name)
2760# else /* not RE_ENABLE_I18N */
2761build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2762# endif /* not RE_ENABLE_I18N */
2763{
2764 size_t name_len = strlen ((const char *) name);
2765 if (BE (name_len != 1, 0))
2766 return REG_ECOLLATE;
2767 else
2768 {
2769 bitset_set (sbcset, name[0]);
2770 return REG_NOERROR;
2771 }
2772}
2773#endif /* not _LIBC */
2774
2775/* This function parse bracket expression like "[abc]", "[a-c]",
2776 "[[.a-a.]]" etc. */
2777
2778static bin_tree_t *
2779parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2780 reg_syntax_t syntax, reg_errcode_t *err)
2781{
2782#ifdef _LIBC
2783 const unsigned char *collseqmb;
2784 const char *collseqwc;
2785 uint32_t nrules;
2786 int32_t table_size;
2787 const int32_t *symb_table;
2788 const unsigned char *extra;
2789
2790 /* Local function for parse_bracket_exp used in _LIBC environment.
2791 Seek the collating symbol entry correspondings to NAME.
2792 Return the index of the symbol in the SYMB_TABLE. */
2793
2794 auto inline int32_t
2795 __attribute ((always_inline))
2796 seek_collating_symbol_entry (name, name_len)
2797 const unsigned char *name;
2798 size_t name_len;
2799 {
2800 int32_t hash = elem_hash ((const char *) name, name_len);
2801 int32_t elem = hash % table_size;
2802 if (symb_table[2 * elem] != 0)
2803 {
2804 int32_t second = hash % (table_size - 2) + 1;
2805
2806 do
2807 {
2808 /* First compare the hashing value. */
2809 if (symb_table[2 * elem] == hash
2810 /* Compare the length of the name. */
2811 && name_len == extra[symb_table[2 * elem + 1]]
2812 /* Compare the name. */
2813 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2814 name_len) == 0)
2815 {
2816 /* Yep, this is the entry. */
2817 break;
2818 }
2819
2820 /* Next entry. */
2821 elem += second;
2822 }
2823 while (symb_table[2 * elem] != 0);
2824 }
2825 return elem;
2826 }
2827
2828 /* Local function for parse_bracket_exp used in _LIBC environment.
2829 Look up the collation sequence value of BR_ELEM.
2830 Return the value if succeeded, UINT_MAX otherwise. */
2831
2832 auto inline unsigned int
2833 __attribute ((always_inline))
2834 lookup_collation_sequence_value (br_elem)
2835 bracket_elem_t *br_elem;
2836 {
2837 if (br_elem->type == SB_CHAR)
2838 {
2839 /*
2840 if (MB_CUR_MAX == 1)
2841 */
2842 if (nrules == 0)
2843 return collseqmb[br_elem->opr.ch];
2844 else
2845 {
2846 wint_t wc = __btowc (br_elem->opr.ch);
2847 return __collseq_table_lookup (collseqwc, wc);
2848 }
2849 }
2850 else if (br_elem->type == MB_CHAR)
2851 {
2852 if (nrules != 0)
2853 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2854 }
2855 else if (br_elem->type == COLL_SYM)
2856 {
2857 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2858 if (nrules != 0)
2859 {
2860 int32_t elem, idx;
2861 elem = seek_collating_symbol_entry (br_elem->opr.name,
2862 sym_name_len);
2863 if (symb_table[2 * elem] != 0)
2864 {
2865 /* We found the entry. */
2866 idx = symb_table[2 * elem + 1];
2867 /* Skip the name of collating element name. */
2868 idx += 1 + extra[idx];
2869 /* Skip the byte sequence of the collating element. */
2870 idx += 1 + extra[idx];
2871 /* Adjust for the alignment. */
2872 idx = (idx + 3) & ~3;
2873 /* Skip the multibyte collation sequence value. */
2874 idx += sizeof (unsigned int);
2875 /* Skip the wide char sequence of the collating element. */
2876 idx += sizeof (unsigned int) *
2877 (1 + *(unsigned int *) (extra + idx));
2878 /* Return the collation sequence value. */
2879 return *(unsigned int *) (extra + idx);
2880 }
2881 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2882 {
2883 /* No valid character. Match it as a single byte
2884 character. */
2885 return collseqmb[br_elem->opr.name[0]];
2886 }
2887 }
2888 else if (sym_name_len == 1)
2889 return collseqmb[br_elem->opr.name[0]];
2890 }
2891 return UINT_MAX;
2892 }
2893
2894 /* Local function for parse_bracket_exp used in _LIBC environment.
2895 Build the range expression which starts from START_ELEM, and ends
2896 at END_ELEM. The result are written to MBCSET and SBCSET.
2897 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2898 mbcset->range_ends, is a pointer argument since we may
2899 update it. */
2900
2901 auto inline reg_errcode_t
2902 __attribute ((always_inline))
2903 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2904 re_charset_t *mbcset;
2905 int *range_alloc;
2906 bitset_t sbcset;
2907 bracket_elem_t *start_elem, *end_elem;
2908 {
2909 unsigned int ch;
2910 uint32_t start_collseq;
2911 uint32_t end_collseq;
2912
2913 /* Equivalence Classes and Character Classes can't be a range
2914 start/end. */
2915 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2916 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2917 0))
2918 return REG_ERANGE;
2919
2920 start_collseq = lookup_collation_sequence_value (start_elem);
2921 end_collseq = lookup_collation_sequence_value (end_elem);
2922 /* Check start/end collation sequence values. */
2923 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2924 return REG_ECOLLATE;
2925 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2926 return REG_ERANGE;
2927
2928 /* Got valid collation sequence values, add them as a new entry.
2929 However, if we have no collation elements, and the character set
2930 is single byte, the single byte character set that we
2931 build below suffices. */
2932 if (nrules > 0 || dfa->mb_cur_max > 1)
2933 {
2934 /* Check the space of the arrays. */
2935 if (BE (*range_alloc == mbcset->nranges, 0))
2936 {
2937 /* There is not enough space, need realloc. */
2938 uint32_t *new_array_start;
2939 uint32_t *new_array_end;
2940 int new_nranges;
2941
2942 /* +1 in case of mbcset->nranges is 0. */
2943 new_nranges = 2 * mbcset->nranges + 1;
2944 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2945 new_nranges);
2946 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2947 new_nranges);
2948
2949 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2950 return REG_ESPACE;
2951
2952 mbcset->range_starts = new_array_start;
2953 mbcset->range_ends = new_array_end;
2954 *range_alloc = new_nranges;
2955 }
2956
2957 mbcset->range_starts[mbcset->nranges] = start_collseq;
2958 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2959 }
2960
2961 /* Build the table for single byte characters. */
2962 for (ch = 0; ch < SBC_MAX; ch++)
2963 {
2964 uint32_t ch_collseq;
2965 /*
2966 if (MB_CUR_MAX == 1)
2967 */
2968 if (nrules == 0)
2969 ch_collseq = collseqmb[ch];
2970 else
2971 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2972 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2973 bitset_set (sbcset, ch);
2974 }
2975 return REG_NOERROR;
2976 }
2977
2978 /* Local function for parse_bracket_exp used in _LIBC environment.
2979 Build the collating element which is represented by NAME.
2980 The result are written to MBCSET and SBCSET.
2981 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2982 pointer argument since we may update it. */
2983
2984 auto inline reg_errcode_t
2985 __attribute ((always_inline))
2986 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2987 re_charset_t *mbcset;
2988 int *coll_sym_alloc;
2989 bitset_t sbcset;
2990 const unsigned char *name;
2991 {
2992 int32_t elem, idx;
2993 size_t name_len = strlen ((const char *) name);
2994 if (nrules != 0)
2995 {
2996 elem = seek_collating_symbol_entry (name, name_len);
2997 if (symb_table[2 * elem] != 0)
2998 {
2999 /* We found the entry. */
3000 idx = symb_table[2 * elem + 1];
3001 /* Skip the name of collating element name. */
3002 idx += 1 + extra[idx];
3003 }
3004 else if (symb_table[2 * elem] == 0 && name_len == 1)
3005 {
3006 /* No valid character, treat it as a normal
3007 character. */
3008 bitset_set (sbcset, name[0]);
3009 return REG_NOERROR;
3010 }
3011 else
3012 return REG_ECOLLATE;
3013
3014 /* Got valid collation sequence, add it as a new entry. */
3015 /* Check the space of the arrays. */
3016 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3017 {
3018 /* Not enough, realloc it. */
3019 /* +1 in case of mbcset->ncoll_syms is 0. */
3020 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3021 /* Use realloc since mbcset->coll_syms is NULL
3022 if *alloc == 0. */
3023 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3024 new_coll_sym_alloc);
3025 if (BE (new_coll_syms == NULL, 0))
3026 return REG_ESPACE;
3027 mbcset->coll_syms = new_coll_syms;
3028 *coll_sym_alloc = new_coll_sym_alloc;
3029 }
3030 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3031 return REG_NOERROR;
3032 }
3033 else
3034 {
3035 if (BE (name_len != 1, 0))
3036 return REG_ECOLLATE;
3037 else
3038 {
3039 bitset_set (sbcset, name[0]);
3040 return REG_NOERROR;
3041 }
3042 }
3043 }
3044#endif
3045
3046 re_token_t br_token;
3047 re_bitset_ptr_t sbcset;
3048#ifdef RE_ENABLE_I18N
3049 re_charset_t *mbcset;
3050 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3051 int equiv_class_alloc = 0, char_class_alloc = 0;
3052#endif /* not RE_ENABLE_I18N */
3053 int non_match = 0;
3054 bin_tree_t *work_tree;
3055 int token_len;
3056 int first_round = 1;
3057#ifdef _LIBC
3058 collseqmb = (const unsigned char *)
3059 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3060 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3061 if (nrules)
3062 {
3063 /*
3064 if (MB_CUR_MAX > 1)
3065 */
3066 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3067 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3068 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3069 _NL_COLLATE_SYMB_TABLEMB);
3070 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3071 _NL_COLLATE_SYMB_EXTRAMB);
3072 }
3073#endif
3074 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3075#ifdef RE_ENABLE_I18N
3076 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3077#endif /* RE_ENABLE_I18N */
3078#ifdef RE_ENABLE_I18N
3079 if (BE (sbcset == NULL || mbcset == NULL, 0))
3080#else
3081 if (BE (sbcset == NULL, 0))
3082#endif /* RE_ENABLE_I18N */
3083 {
3084 *err = REG_ESPACE;
3085 return NULL;
3086 }
3087
3088 token_len = peek_token_bracket (token, regexp, syntax);
3089 if (BE (token->type == END_OF_RE, 0))
3090 {
3091 *err = REG_BADPAT;
3092 goto parse_bracket_exp_free_return;
3093 }
3094 if (token->type == OP_NON_MATCH_LIST)
3095 {
3096#ifdef RE_ENABLE_I18N
3097 mbcset->non_match = 1;
3098#endif /* not RE_ENABLE_I18N */
3099 non_match = 1;
3100 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3101 bitset_set (sbcset, '\n');
3102 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3103 token_len = peek_token_bracket (token, regexp, syntax);
3104 if (BE (token->type == END_OF_RE, 0))
3105 {
3106 *err = REG_BADPAT;
3107 goto parse_bracket_exp_free_return;
3108 }
3109 }
3110
3111 /* We treat the first ']' as a normal character. */
3112 if (token->type == OP_CLOSE_BRACKET)
3113 token->type = CHARACTER;
3114
3115 while (1)
3116 {
3117 bracket_elem_t start_elem, end_elem;
3118 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3119 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3120 reg_errcode_t ret;
3121 int token_len2 = 0, is_range_exp = 0;
3122 re_token_t token2;
3123
3124 start_elem.opr.name = start_name_buf;
3125 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3126 syntax, first_round);
3127 if (BE (ret != REG_NOERROR, 0))
3128 {
3129 *err = ret;
3130 goto parse_bracket_exp_free_return;
3131 }
3132 first_round = 0;
3133
3134 /* Get information about the next token. We need it in any case. */
3135 token_len = peek_token_bracket (token, regexp, syntax);
3136
3137 /* Do not check for ranges if we know they are not allowed. */
3138 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3139 {
3140 if (BE (token->type == END_OF_RE, 0))
3141 {
3142 *err = REG_EBRACK;
3143 goto parse_bracket_exp_free_return;
3144 }
3145 if (token->type == OP_CHARSET_RANGE)
3146 {
3147 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3148 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3149 if (BE (token2.type == END_OF_RE, 0))
3150 {
3151 *err = REG_EBRACK;
3152 goto parse_bracket_exp_free_return;
3153 }
3154 if (token2.type == OP_CLOSE_BRACKET)
3155 {
3156 /* We treat the last '-' as a normal character. */
3157 re_string_skip_bytes (regexp, -token_len);
3158 token->type = CHARACTER;
3159 }
3160 else
3161 is_range_exp = 1;
3162 }
3163 }
3164
3165 if (is_range_exp == 1)
3166 {
3167 end_elem.opr.name = end_name_buf;
3168 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3169 dfa, syntax, 1);
3170 if (BE (ret != REG_NOERROR, 0))
3171 {
3172 *err = ret;
3173 goto parse_bracket_exp_free_return;
3174 }
3175
3176 token_len = peek_token_bracket (token, regexp, syntax);
3177
3178#ifdef _LIBC
3179 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3180 &start_elem, &end_elem);
3181#else
3182# ifdef RE_ENABLE_I18N
3183 *err = build_range_exp (sbcset,
3184 dfa->mb_cur_max > 1 ? mbcset : NULL,
3185 &range_alloc, &start_elem, &end_elem);
3186# else
3187 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3188# endif
3189#endif /* RE_ENABLE_I18N */
3190 if (BE (*err != REG_NOERROR, 0))
3191 goto parse_bracket_exp_free_return;
3192 }
3193 else
3194 {
3195 switch (start_elem.type)
3196 {
3197 case SB_CHAR:
3198 bitset_set (sbcset, start_elem.opr.ch);
3199 break;
3200#ifdef RE_ENABLE_I18N
3201 case MB_CHAR:
3202 /* Check whether the array has enough space. */
3203 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3204 {
3205 wchar_t *new_mbchars;
3206 /* Not enough, realloc it. */
3207 /* +1 in case of mbcset->nmbchars is 0. */
3208 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3209 /* Use realloc since array is NULL if *alloc == 0. */
3210 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3211 mbchar_alloc);
3212 if (BE (new_mbchars == NULL, 0))
3213 goto parse_bracket_exp_espace;
3214 mbcset->mbchars = new_mbchars;
3215 }
3216 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3217 break;
3218#endif /* RE_ENABLE_I18N */
3219 case EQUIV_CLASS:
3220 *err = build_equiv_class (sbcset,
3221#ifdef RE_ENABLE_I18N
3222 mbcset, &equiv_class_alloc,
3223#endif /* RE_ENABLE_I18N */
3224 start_elem.opr.name);
3225 if (BE (*err != REG_NOERROR, 0))
3226 goto parse_bracket_exp_free_return;
3227 break;
3228 case COLL_SYM:
3229 *err = build_collating_symbol (sbcset,
3230#ifdef RE_ENABLE_I18N
3231 mbcset, &coll_sym_alloc,
3232#endif /* RE_ENABLE_I18N */
3233 start_elem.opr.name);
3234 if (BE (*err != REG_NOERROR, 0))
3235 goto parse_bracket_exp_free_return;
3236 break;
3237 case CHAR_CLASS:
3238 *err = build_charclass (regexp->trans, sbcset,
3239#ifdef RE_ENABLE_I18N
3240 mbcset, &char_class_alloc,
3241#endif /* RE_ENABLE_I18N */
3242 (const char *) start_elem.opr.name, syntax);
3243 if (BE (*err != REG_NOERROR, 0))
3244 goto parse_bracket_exp_free_return;
3245 break;
3246 default:
3247 assert (0);
3248 break;
3249 }
3250 }
3251 if (BE (token->type == END_OF_RE, 0))
3252 {
3253 *err = REG_EBRACK;
3254 goto parse_bracket_exp_free_return;
3255 }
3256 if (token->type == OP_CLOSE_BRACKET)
3257 break;
3258 }
3259
3260 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3261
3262 /* If it is non-matching list. */
3263 if (non_match)
3264 bitset_not (sbcset);
3265
3266#ifdef RE_ENABLE_I18N
3267 /* Ensure only single byte characters are set. */
3268 if (dfa->mb_cur_max > 1)
3269 bitset_mask (sbcset, dfa->sb_char);
3270
3271 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3272 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3273 || mbcset->non_match)))
3274 {
3275 bin_tree_t *mbc_tree;
3276 int sbc_idx;
3277 /* Build a tree for complex bracket. */
3278 dfa->has_mb_node = 1;
3279 br_token.type = COMPLEX_BRACKET;
3280 br_token.opr.mbcset = mbcset;
3281 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282 if (BE (mbc_tree == NULL, 0))
3283 goto parse_bracket_exp_espace;
3284 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3285 if (sbcset[sbc_idx])
3286 break;
3287 /* If there are no bits set in sbcset, there is no point
3288 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3289 if (sbc_idx < BITSET_WORDS)
3290 {
3291 /* Build a tree for simple bracket. */
3292 br_token.type = SIMPLE_BRACKET;
3293 br_token.opr.sbcset = sbcset;
3294 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3295 if (BE (work_tree == NULL, 0))
3296 goto parse_bracket_exp_espace;
3297
3298 /* Then join them by ALT node. */
3299 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3300 if (BE (work_tree == NULL, 0))
3301 goto parse_bracket_exp_espace;
3302 }
3303 else
3304 {
3305 re_free (sbcset);
3306 work_tree = mbc_tree;
3307 }
3308 }
3309 else
3310#endif /* not RE_ENABLE_I18N */
3311 {
3312#ifdef RE_ENABLE_I18N
3313 free_charset (mbcset);
3314#endif
3315 /* Build a tree for simple bracket. */
3316 br_token.type = SIMPLE_BRACKET;
3317 br_token.opr.sbcset = sbcset;
3318 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3319 if (BE (work_tree == NULL, 0))
3320 goto parse_bracket_exp_espace;
3321 }
3322 return work_tree;
3323
3324 parse_bracket_exp_espace:
3325 *err = REG_ESPACE;
3326 parse_bracket_exp_free_return:
3327 re_free (sbcset);
3328#ifdef RE_ENABLE_I18N
3329 free_charset (mbcset);
3330#endif /* RE_ENABLE_I18N */
3331 return NULL;
3332}
3333
3334/* Parse an element in the bracket expression. */
3335
3336static reg_errcode_t
3337parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3338 re_token_t *token, int token_len, re_dfa_t *dfa,
3339 reg_syntax_t syntax, int accept_hyphen)
3340{
3341#ifdef RE_ENABLE_I18N
3342 int cur_char_size;
3343 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3344 if (cur_char_size > 1)
3345 {
3346 elem->type = MB_CHAR;
3347 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3348 re_string_skip_bytes (regexp, cur_char_size);
3349 return REG_NOERROR;
3350 }
3351#endif /* RE_ENABLE_I18N */
3352 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3353 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3354 || token->type == OP_OPEN_EQUIV_CLASS)
3355 return parse_bracket_symbol (elem, regexp, token);
3356 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3357 {
3358 /* A '-' must only appear as anything but a range indicator before
3359 the closing bracket. Everything else is an error. */
3360 re_token_t token2;
3361 (void) peek_token_bracket (&token2, regexp, syntax);
3362 if (token2.type != OP_CLOSE_BRACKET)
3363 /* The actual error value is not standardized since this whole
3364 case is undefined. But ERANGE makes good sense. */
3365 return REG_ERANGE;
3366 }
3367 elem->type = SB_CHAR;
3368 elem->opr.ch = token->opr.c;
3369 return REG_NOERROR;
3370}
3371
3372/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3373 such as [:<character_class>:], [.<collating_element>.], and
3374 [=<equivalent_class>=]. */
3375
3376static reg_errcode_t
3377parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3378 re_token_t *token)
3379{
3380 unsigned char ch, delim = token->opr.c;
3381 int i = 0;
3382 if (re_string_eoi(regexp))
3383 return REG_EBRACK;
3384 for (;; ++i)
3385 {
3386 if (i >= BRACKET_NAME_BUF_SIZE)
3387 return REG_EBRACK;
3388 if (token->type == OP_OPEN_CHAR_CLASS)
3389 ch = re_string_fetch_byte_case (regexp);
3390 else
3391 ch = re_string_fetch_byte (regexp);
3392 if (re_string_eoi(regexp))
3393 return REG_EBRACK;
3394 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3395 break;
3396 elem->opr.name[i] = ch;
3397 }
3398 re_string_skip_bytes (regexp, 1);
3399 elem->opr.name[i] = '\0';
3400 switch (token->type)
3401 {
3402 case OP_OPEN_COLL_ELEM:
3403 elem->type = COLL_SYM;
3404 break;
3405 case OP_OPEN_EQUIV_CLASS:
3406 elem->type = EQUIV_CLASS;
3407 break;
3408 case OP_OPEN_CHAR_CLASS:
3409 elem->type = CHAR_CLASS;
3410 break;
3411 default:
3412 break;
3413 }
3414 return REG_NOERROR;
3415}
3416
3417 /* Helper function for parse_bracket_exp.
3418 Build the equivalence class which is represented by NAME.
3419 The result are written to MBCSET and SBCSET.
3420 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3421 is a pointer argument since we may update it. */
3422
3423static reg_errcode_t
3424#ifdef RE_ENABLE_I18N
3425build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3426 int *equiv_class_alloc, const unsigned char *name)
3427#else /* not RE_ENABLE_I18N */
3428build_equiv_class (bitset_t sbcset, const unsigned char *name)
3429#endif /* not RE_ENABLE_I18N */
3430{
3431#ifdef _LIBC
3432 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3433 if (nrules != 0)
3434 {
3435 const int32_t *table, *indirect;
3436 const unsigned char *weights, *extra, *cp;
3437 unsigned char char_buf[2];
3438 int32_t idx1, idx2;
3439 unsigned int ch;
3440 size_t len;
3441 /* This #include defines a local function! */
3442# include <locale/weight.h>
3443 /* Calculate the index for equivalence class. */
3444 cp = name;
3445 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3446 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3447 _NL_COLLATE_WEIGHTMB);
3448 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3449 _NL_COLLATE_EXTRAMB);
3450 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3451 _NL_COLLATE_INDIRECTMB);
3452 idx1 = findidx (&cp);
3453 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3454 /* This isn't a valid character. */
3455 return REG_ECOLLATE;
3456
3457 /* Build single byte matcing table for this equivalence class. */
3458 char_buf[1] = (unsigned char) '\0';
3459 len = weights[idx1 & 0xffffff];
3460 for (ch = 0; ch < SBC_MAX; ++ch)
3461 {
3462 char_buf[0] = ch;
3463 cp = char_buf;
3464 idx2 = findidx (&cp);
3465/*
3466 idx2 = table[ch];
3467*/
3468 if (idx2 == 0)
3469 /* This isn't a valid character. */
3470 continue;
3471 /* Compare only if the length matches and the collation rule
3472 index is the same. */
3473 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3474 {
3475 int cnt = 0;
3476
3477 while (cnt <= len &&
3478 weights[(idx1 & 0xffffff) + 1 + cnt]
3479 == weights[(idx2 & 0xffffff) + 1 + cnt])
3480 ++cnt;
3481
3482 if (cnt > len)
3483 bitset_set (sbcset, ch);
3484 }
3485 }
3486 /* Check whether the array has enough space. */
3487 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3488 {
3489 /* Not enough, realloc it. */
3490 /* +1 in case of mbcset->nequiv_classes is 0. */
3491 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3492 /* Use realloc since the array is NULL if *alloc == 0. */
3493 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3494 int32_t,
3495 new_equiv_class_alloc);
3496 if (BE (new_equiv_classes == NULL, 0))
3497 return REG_ESPACE;
3498 mbcset->equiv_classes = new_equiv_classes;
3499 *equiv_class_alloc = new_equiv_class_alloc;
3500 }
3501 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3502 }
3503 else
3504#endif /* _LIBC */
3505 {
3506 if (BE (strlen ((const char *) name) != 1, 0))
3507 return REG_ECOLLATE;
3508 bitset_set (sbcset, *name);
3509 }
3510 return REG_NOERROR;
3511}
3512
3513 /* Helper function for parse_bracket_exp.
3514 Build the character class which is represented by NAME.
3515 The result are written to MBCSET and SBCSET.
3516 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3517 is a pointer argument since we may update it. */
3518
3519static reg_errcode_t
3520#ifdef RE_ENABLE_I18N
3521build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522 re_charset_t *mbcset, int *char_class_alloc,
3523 const char *class_name, reg_syntax_t syntax)
3524#else /* not RE_ENABLE_I18N */
3525build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3526 const char *class_name, reg_syntax_t syntax)
3527#endif /* not RE_ENABLE_I18N */
3528{
3529 int i;
3530
3531 /* In case of REG_ICASE "upper" and "lower" match the both of
3532 upper and lower cases. */
3533 if ((syntax & RE_ICASE)
3534 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3535 class_name = "alpha";
3536
3537#ifdef RE_ENABLE_I18N
3538 /* Check the space of the arrays. */
3539 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3540 {
3541 /* Not enough, realloc it. */
3542 /* +1 in case of mbcset->nchar_classes is 0. */
3543 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3544 /* Use realloc since array is NULL if *alloc == 0. */
3545 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3546 new_char_class_alloc);
3547 if (BE (new_char_classes == NULL, 0))
3548 return REG_ESPACE;
3549 mbcset->char_classes = new_char_classes;
3550 *char_class_alloc = new_char_class_alloc;
3551 }
3552 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3553#endif /* RE_ENABLE_I18N */
3554
3555#define BUILD_CHARCLASS_LOOP(ctype_func) \
3556 do { \
3557 if (BE (trans != NULL, 0)) \
3558 { \
3559 for (i = 0; i < SBC_MAX; ++i) \
3560 if (ctype_func (i)) \
3561 bitset_set (sbcset, trans[i]); \
3562 } \
3563 else \
3564 { \
3565 for (i = 0; i < SBC_MAX; ++i) \
3566 if (ctype_func (i)) \
3567 bitset_set (sbcset, i); \
3568 } \
3569 } while (0)
3570
3571 if (strcmp (class_name, "alnum") == 0)
3572 BUILD_CHARCLASS_LOOP (isalnum);
3573 else if (strcmp (class_name, "cntrl") == 0)
3574 BUILD_CHARCLASS_LOOP (iscntrl);
3575 else if (strcmp (class_name, "lower") == 0)
3576 BUILD_CHARCLASS_LOOP (islower);
3577 else if (strcmp (class_name, "space") == 0)
3578 BUILD_CHARCLASS_LOOP (isspace);
3579 else if (strcmp (class_name, "alpha") == 0)
3580 BUILD_CHARCLASS_LOOP (isalpha);
3581 else if (strcmp (class_name, "digit") == 0)
3582 BUILD_CHARCLASS_LOOP (isdigit);
3583 else if (strcmp (class_name, "print") == 0)
3584 BUILD_CHARCLASS_LOOP (isprint);
3585 else if (strcmp (class_name, "upper") == 0)
3586 BUILD_CHARCLASS_LOOP (isupper);
3587 else if (strcmp (class_name, "blank") == 0)
3588#ifndef GAWK
3589 BUILD_CHARCLASS_LOOP (isblank);
3590#else
3591 /* see comments above */
3592 BUILD_CHARCLASS_LOOP (is_blank);
3593#endif
3594 else if (strcmp (class_name, "graph") == 0)
3595 BUILD_CHARCLASS_LOOP (isgraph);
3596 else if (strcmp (class_name, "punct") == 0)
3597 BUILD_CHARCLASS_LOOP (ispunct);
3598 else if (strcmp (class_name, "xdigit") == 0)
3599 BUILD_CHARCLASS_LOOP (isxdigit);
3600 else
3601 return REG_ECTYPE;
3602
3603 return REG_NOERROR;
3604}
3605
3606static bin_tree_t *
3607build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3608 const char *class_name,
3609 const char *extra, int non_match,
3610 reg_errcode_t *err)
3611{
3612 re_bitset_ptr_t sbcset;
3613#ifdef RE_ENABLE_I18N
3614 re_charset_t *mbcset;
3615 int alloc = 0;
3616#endif /* not RE_ENABLE_I18N */
3617 reg_errcode_t ret;
3618 re_token_t br_token;
3619 bin_tree_t *tree;
3620
3621 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3622#ifdef RE_ENABLE_I18N
3623 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3624#endif /* RE_ENABLE_I18N */
3625
3626#ifdef RE_ENABLE_I18N
3627 if (BE (sbcset == NULL || mbcset == NULL, 0))
3628#else /* not RE_ENABLE_I18N */
3629 if (BE (sbcset == NULL, 0))
3630#endif /* not RE_ENABLE_I18N */
3631 {
3632 *err = REG_ESPACE;
3633 return NULL;
3634 }
3635
3636 if (non_match)
3637 {
3638#ifdef RE_ENABLE_I18N
3639 mbcset->non_match = 1;
3640#endif /* not RE_ENABLE_I18N */
3641 }
3642
3643 /* We don't care the syntax in this case. */
3644 ret = build_charclass (trans, sbcset,
3645#ifdef RE_ENABLE_I18N
3646 mbcset, &alloc,
3647#endif /* RE_ENABLE_I18N */
3648 class_name, 0);
3649
3650 if (BE (ret != REG_NOERROR, 0))
3651 {
3652 re_free (sbcset);
3653#ifdef RE_ENABLE_I18N
3654 free_charset (mbcset);
3655#endif /* RE_ENABLE_I18N */
3656 *err = ret;
3657 return NULL;
3658 }
3659 /* \w match '_' also. */
3660 for (; *extra; extra++)
3661 bitset_set (sbcset, *extra);
3662
3663 /* If it is non-matching list. */
3664 if (non_match)
3665 bitset_not (sbcset);
3666
3667#ifdef RE_ENABLE_I18N
3668 /* Ensure only single byte characters are set. */
3669 if (dfa->mb_cur_max > 1)
3670 bitset_mask (sbcset, dfa->sb_char);
3671#endif
3672
3673 /* Build a tree for simple bracket. */
3674 br_token.type = SIMPLE_BRACKET;
3675 br_token.opr.sbcset = sbcset;
3676 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3677 if (BE (tree == NULL, 0))
3678 goto build_word_op_espace;
3679
3680#ifdef RE_ENABLE_I18N
3681 if (dfa->mb_cur_max > 1)
3682 {
3683 bin_tree_t *mbc_tree;
3684 /* Build a tree for complex bracket. */
3685 br_token.type = COMPLEX_BRACKET;
3686 br_token.opr.mbcset = mbcset;
3687 dfa->has_mb_node = 1;
3688 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3689 if (BE (mbc_tree == NULL, 0))
3690 goto build_word_op_espace;
3691 /* Then join them by ALT node. */
3692 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3693 if (BE (mbc_tree != NULL, 1))
3694 return tree;
3695 }
3696 else
3697 {
3698 free_charset (mbcset);
3699 return tree;
3700 }
3701#else /* not RE_ENABLE_I18N */
3702 return tree;
3703#endif /* not RE_ENABLE_I18N */
3704
3705 build_word_op_espace:
3706 re_free (sbcset);
3707#ifdef RE_ENABLE_I18N
3708 free_charset (mbcset);
3709#endif /* RE_ENABLE_I18N */
3710 *err = REG_ESPACE;
3711 return NULL;
3712}
3713
3714/* This is intended for the expressions like "a{1,3}".
3715 Fetch a number from `input', and return the number.
3716 Return -1, if the number field is empty like "{,1}".
3717 Return -2, if an error has occurred. */
3718
3719static int
3720fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3721{
3722 int num = -1;
3723 unsigned char c;
3724 while (1)
3725 {
3726 fetch_token (token, input, syntax);
3727 c = token->opr.c;
3728 if (BE (token->type == END_OF_RE, 0))
3729 return -2;
3730 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3731 break;
3732 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3733 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3734 num = (num > RE_DUP_MAX) ? -2 : num;
3735 }
3736 return num;
3737}
3738\f
3739#ifdef RE_ENABLE_I18N
3740static void
3741free_charset (re_charset_t *cset)
3742{
3743 re_free (cset->mbchars);
3744# ifdef _LIBC
3745 re_free (cset->coll_syms);
3746 re_free (cset->equiv_classes);
3747 re_free (cset->range_starts);
3748 re_free (cset->range_ends);
3749# endif
3750 re_free (cset->char_classes);
3751 re_free (cset);
3752}
3753#endif /* RE_ENABLE_I18N */
3754\f
3755/* Functions for binary tree operation. */
3756
3757/* Create a tree node. */
3758
3759static bin_tree_t *
3760create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3761 re_token_type_t type)
3762{
3763 re_token_t t;
3764 t.type = type;
3765 return create_token_tree (dfa, left, right, &t);
3766}
3767
3768static bin_tree_t *
3769create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3770 const re_token_t *token)
3771{
3772 bin_tree_t *tree;
3773 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3774 {
3775 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3776
3777 if (storage == NULL)
3778 return NULL;
3779 storage->next = dfa->str_tree_storage;
3780 dfa->str_tree_storage = storage;
3781 dfa->str_tree_storage_idx = 0;
3782 }
3783 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3784
3785 tree->parent = NULL;
3786 tree->left = left;
3787 tree->right = right;
3788 tree->token = *token;
3789 tree->token.duplicated = 0;
3790 tree->token.opt_subexp = 0;
3791 tree->first = NULL;
3792 tree->next = NULL;
3793 tree->node_idx = -1;
3794
3795 if (left != NULL)
3796 left->parent = tree;
3797 if (right != NULL)
3798 right->parent = tree;
3799 return tree;
3800}
3801
3802/* Mark the tree SRC as an optional subexpression.
3803 To be called from preorder or postorder. */
3804
3805static reg_errcode_t
3806mark_opt_subexp (void *extra, bin_tree_t *node)
3807{
3808 int idx = (int) (intptr_t) extra;
3809 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3810 node->token.opt_subexp = 1;
3811
3812 return REG_NOERROR;
3813}
3814
3815/* Free the allocated memory inside NODE. */
3816
3817static void
3818free_token (re_token_t *node)
3819{
3820#ifdef RE_ENABLE_I18N
3821 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3822 free_charset (node->opr.mbcset);
3823 else
3824#endif /* RE_ENABLE_I18N */
3825 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3826 re_free (node->opr.sbcset);
3827}
3828
3829/* Worker function for tree walking. Free the allocated memory inside NODE
3830 and its children. */
3831
3832static reg_errcode_t
3833free_tree (void *extra, bin_tree_t *node)
3834{
3835 free_token (&node->token);
3836 return REG_NOERROR;
3837}
3838
3839
3840/* Duplicate the node SRC, and return new node. This is a preorder
3841 visit similar to the one implemented by the generic visitor, but
3842 we need more infrastructure to maintain two parallel trees --- so,
3843 it's easier to duplicate. */
3844
3845static bin_tree_t *
3846duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3847{
3848 const bin_tree_t *node;
3849 bin_tree_t *dup_root;
3850 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3851
3852 for (node = root; ; )
3853 {
3854 /* Create a new tree and link it back to the current parent. */
3855 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3856 if (*p_new == NULL)
3857 return NULL;
3858 (*p_new)->parent = dup_node;
3859 (*p_new)->token.duplicated = 1;
3860 dup_node = *p_new;
3861
3862 /* Go to the left node, or up and to the right. */
3863 if (node->left)
3864 {
3865 node = node->left;
3866 p_new = &dup_node->left;
3867 }
3868 else
3869 {
3870 const bin_tree_t *prev = NULL;
3871 while (node->right == prev || node->right == NULL)
3872 {
3873 prev = node;
3874 node = node->parent;
3875 dup_node = dup_node->parent;
3876 if (!node)
3877 return dup_root;
3878 }
3879 node = node->right;
3880 p_new = &dup_node->right;
3881 }
3882 }
3883}