1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21#include <stdint.h>
22
23static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 size_t length, reg_syntax_t syntax);
25static void re_compile_fastmap_iter (regex_t *bufp,
26 const re_dfastate_t *init_state,
27 char *fastmap);
28static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29#ifdef RE_ENABLE_I18N
30static void free_charset (re_charset_t *cset);
31#endif /* RE_ENABLE_I18N */
32static void free_workarea_compile (regex_t *preg);
33static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34#ifdef RE_ENABLE_I18N
35static void optimize_utf8 (re_dfa_t *dfa);
36#endif
37static reg_errcode_t analyze (regex_t *preg);
38static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
52static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
53 unsigned int constraint);
54static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 int node, int root);
57static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58static int fetch_number (re_string_t *input, re_token_t *token,
59 reg_syntax_t syntax);
60static int peek_token (re_token_t *token, re_string_t *input,
61 reg_syntax_t syntax) internal_function;
62static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 reg_syntax_t syntax, reg_errcode_t *err);
64static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 re_dfa_t *dfa, re_token_t *token,
78 reg_syntax_t syntax, reg_errcode_t *err);
79static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 re_token_t *token, reg_syntax_t syntax,
81 reg_errcode_t *err);
82static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_string_t *regexp,
84 re_token_t *token, int token_len,
85 re_dfa_t *dfa,
86 reg_syntax_t syntax,
87 int accept_hyphen);
88static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token);
91#ifdef RE_ENABLE_I18N
92static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 re_charset_t *mbcset,
94 int *equiv_class_alloc,
95 const unsigned char *name);
96static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 bitset_t sbcset,
98 re_charset_t *mbcset,
99 int *char_class_alloc,
100 const char *class_name,
101 reg_syntax_t syntax);
102#else /* not RE_ENABLE_I18N */
103static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 const unsigned char *name);
105static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 bitset_t sbcset,
107 const char *class_name,
108 reg_syntax_t syntax);
109#endif /* not RE_ENABLE_I18N */
110static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 RE_TRANSLATE_TYPE trans,
112 const char *class_name,
113 const char *extra,
114 int non_match, reg_errcode_t *err);
115static bin_tree_t *create_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 re_token_type_t type);
118static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 const re_token_t *token);
121static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122static void free_token (re_token_t *node);
123static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125\f
126/* This table gives an error message for each of the error codes listed
127 in regex.h. Obviously the order here has to be same as there.
128 POSIX doesn't require that we do anything for REG_NOERROR,
129 but why not be nice? */
130
131const char __re_error_msgid[] attribute_hidden =
132 {
133#define REG_NOERROR_IDX 0
134 gettext_noop ("Success") /* REG_NOERROR */
135 "\0"
136#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 gettext_noop ("No match") /* REG_NOMATCH */
138 "\0"
139#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 "\0"
142#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 "\0"
145#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 "\0"
148#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 "\0"
151#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 "\0"
154#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 "\0"
157#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 "\0"
160#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 "\0"
163#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 "\0"
166#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 "\0"
169#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 "\0"
172#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 "\0"
175#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 "\0"
178#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 "\0"
181#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 };
184
185const size_t __re_error_msgid_idx[] attribute_hidden =
186 {
187 REG_NOERROR_IDX,
188 REG_NOMATCH_IDX,
189 REG_BADPAT_IDX,
190 REG_ECOLLATE_IDX,
191 REG_ECTYPE_IDX,
192 REG_EESCAPE_IDX,
193 REG_ESUBREG_IDX,
194 REG_EBRACK_IDX,
195 REG_EPAREN_IDX,
196 REG_EBRACE_IDX,
197 REG_BADBR_IDX,
198 REG_ERANGE_IDX,
199 REG_ESPACE_IDX,
200 REG_BADRPT_IDX,
201 REG_EEND_IDX,
202 REG_ESIZE_IDX,
203 REG_ERPAREN_IDX
204 };
205\f
206/* Entry points for GNU code. */
207
208
209#ifdef ZOS_USS
210
211/* For ZOS USS we must define btowc */
212
213wchar_t
214btowc (int c)
215{
216 wchar_t wtmp[2];
217 char tmp[2];
218
219 tmp[0] = c;
220 tmp[1] = 0;
221
222 mbtowc (wtmp, tmp, 1);
223 return wtmp[0];
224}
225#endif
226
227/* re_compile_pattern is the GNU regular expression compiler: it
228 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
229 Returns 0 if the pattern was valid, otherwise an error string.
230
231 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
232 are set in BUFP on entry. */
233
234const char *
235re_compile_pattern (const char *pattern,
236 size_t length,
237 struct re_pattern_buffer *bufp)
238{
239 reg_errcode_t ret;
240
241 /* And GNU code determines whether or not to get register information
242 by passing null for the REGS argument to re_match, etc., not by
243 setting no_sub, unless RE_NO_SUB is set. */
244 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
245
246 /* Match anchors at newline. */
247 bufp->newline_anchor = 1;
248
249 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
250
251 if (!ret)
252 return NULL;
253 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
254}
255#ifdef _LIBC
256weak_alias (__re_compile_pattern, re_compile_pattern)
257#endif
258
259/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
260 also be assigned to arbitrarily: each pattern buffer stores its own
261 syntax, so it can be changed between regex compilations. */
262/* This has no initializer because initialized variables in Emacs
263 become read-only after dumping. */
264reg_syntax_t re_syntax_options;
265
266
267/* Specify the precise syntax of regexps for compilation. This provides
268 for compatibility for various utilities which historically have
269 different, incompatible syntaxes.
270
271 The argument SYNTAX is a bit mask comprised of the various bits
272 defined in regex.h. We return the old syntax. */
273
274reg_syntax_t
275re_set_syntax (reg_syntax_t syntax)
276{
277 reg_syntax_t ret = re_syntax_options;
278
279 re_syntax_options = syntax;
280 return ret;
281}
282#ifdef _LIBC
283weak_alias (__re_set_syntax, re_set_syntax)
284#endif
285
286int
287re_compile_fastmap (struct re_pattern_buffer *bufp)
288{
289 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
290 char *fastmap = bufp->fastmap;
291
292 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
293 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
294 if (dfa->init_state != dfa->init_state_word)
295 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
296 if (dfa->init_state != dfa->init_state_nl)
297 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
298 if (dfa->init_state != dfa->init_state_begbuf)
299 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
300 bufp->fastmap_accurate = 1;
301 return 0;
302}
303#ifdef _LIBC
304weak_alias (__re_compile_fastmap, re_compile_fastmap)
305#endif
306
307static inline void
308__attribute ((always_inline))
309re_set_fastmap (char *fastmap, int icase, int ch)
310{
311 fastmap[ch] = 1;
312 if (icase)
313 fastmap[tolower (ch)] = 1;
314}
315
316/* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
318
319static void
320re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
321 char *fastmap)
322{
323 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
324 int node_cnt;
325 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
326 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
327 {
328 int node = init_state->nodes.elems[node_cnt];
329 re_token_type_t type = dfa->nodes[node].type;
330
331 if (type == CHARACTER)
332 {
333 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
334#ifdef RE_ENABLE_I18N
335 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
336 {
337 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
338 wchar_t wc;
339 mbstate_t state;
340
341 p = buf;
342 *p++ = dfa->nodes[node].opr.c;
343 while (++node < dfa->nodes_len
344 && dfa->nodes[node].type == CHARACTER
345 && dfa->nodes[node].mb_partial)
346 *p++ = dfa->nodes[node].opr.c;
347 memset (&state, '\0', sizeof (state));
348 if (__mbrtowc (&wc, (const char *) buf, p - buf,
349 &state) == p - buf
350 && (__wcrtomb ((char *) buf, towlower (wc), &state)
351 != (size_t) -1))
352 re_set_fastmap (fastmap, 0, buf[0]);
353 re_free (buf);
354 }
355#endif
356 }
357 else if (type == SIMPLE_BRACKET)
358 {
359 int i, ch;
360 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
361 {
362 int j;
363 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
364 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
365 if (w & ((bitset_word_t) 1 << j))
366 re_set_fastmap (fastmap, icase, ch);
367 }
368 }
369#ifdef RE_ENABLE_I18N
370 else if (type == COMPLEX_BRACKET)
371 {
372 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
373 int i;
374
375# ifdef _LIBC
376 /* See if we have to try all bytes which start multiple collation
377 elements.
378 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
379 collation element, and don't catch 'b' since 'b' is
380 the only collation element which starts from 'b' (and
381 it is caught by SIMPLE_BRACKET). */
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
383 && (cset->ncoll_syms || cset->nranges))
384 {
385 const int32_t *table = (const int32_t *)
386 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
387 for (i = 0; i < SBC_MAX; ++i)
388 if (table[i] < 0)
389 re_set_fastmap (fastmap, icase, i);
390 }
391# endif /* _LIBC */
392
393 /* See if we have to start the match at all multibyte characters,
394 i.e. where we would not find an invalid sequence. This only
395 applies to multibyte character sets; for single byte character
396 sets, the SIMPLE_BRACKET again suffices. */
397 if (dfa->mb_cur_max > 1
398 && (cset->nchar_classes || cset->non_match || cset->nranges
399# ifdef _LIBC
400 || cset->nequiv_classes
401# endif /* _LIBC */
402 ))
403 {
404 unsigned char c = 0;
405 do
406 {
407 mbstate_t mbs;
408 memset (&mbs, 0, sizeof (mbs));
409 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
410 re_set_fastmap (fastmap, false, (int) c);
411 }
412 while (++c != 0);
413 }
414
415 else
416 {
417 /* ... Else catch all bytes which can start the mbchars. */
418 for (i = 0; i < cset->nmbchars; ++i)
419 {
420 char buf[256];
421 mbstate_t state;
422 memset (&state, '\0', sizeof (state));
423 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
424 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
425 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
426 {
427 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
428 != (size_t) -1)
429 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
430 }
431 }
432 }
433 }
434#endif /* RE_ENABLE_I18N */
435 else if (type == OP_PERIOD
436#ifdef RE_ENABLE_I18N
437 || type == OP_UTF8_PERIOD
438#endif /* RE_ENABLE_I18N */
439 || type == END_OF_RE)
440 {
441 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
442 if (type == END_OF_RE)
443 bufp->can_be_null = 1;
444 return;
445 }
446 }
447}
448\f
449/* Entry point for POSIX code. */
450/* regcomp takes a regular expression as a string and compiles it.
451
452 PREG is a regex_t *. We do not expect any fields to be initialized,
453 since POSIX says we shouldn't. Thus, we set
454
455 `buffer' to the compiled pattern;
456 `used' to the length of the compiled pattern;
457 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
458 REG_EXTENDED bit in CFLAGS is set; otherwise, to
459 RE_SYNTAX_POSIX_BASIC;
460 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
461 `fastmap' to an allocated space for the fastmap;
462 `fastmap_accurate' to zero;
463 `re_nsub' to the number of subexpressions in PATTERN.
464
465 PATTERN is the address of the pattern string.
466
467 CFLAGS is a series of bits which affect compilation.
468
469 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
470 use POSIX basic syntax.
471
472 If REG_NEWLINE is set, then . and [^...] don't match newline.
473 Also, regexec will try a match beginning after every newline.
474
475 If REG_ICASE is set, then we considers upper- and lowercase
476 versions of letters to be equivalent when matching.
477
478 If REG_NOSUB is set, then when PREG is passed to regexec, that
479 routine will report only success or failure, and nothing about the
480 registers.
481
482 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
483 the return codes and their meanings.) */
484
485int
486regcomp (regex_t *__restrict preg,
487 const char *__restrict pattern,
488 int cflags)
489{
490 reg_errcode_t ret;
491 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
492 : RE_SYNTAX_POSIX_BASIC);
493
494 preg->buffer = NULL;
495 preg->allocated = 0;
496 preg->used = 0;
497
498 /* Try to allocate space for the fastmap. */
499 preg->fastmap = re_malloc (char, SBC_MAX);
500 if (BE (preg->fastmap == NULL, 0))
501 return REG_ESPACE;
502
503 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
504
505 /* If REG_NEWLINE is set, newlines are treated differently. */
506 if (cflags & REG_NEWLINE)
507 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
508 syntax &= ~RE_DOT_NEWLINE;
509 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
510 /* It also changes the matching behavior. */
511 preg->newline_anchor = 1;
512 }
513 else
514 preg->newline_anchor = 0;
515 preg->no_sub = !!(cflags & REG_NOSUB);
516 preg->translate = NULL;
517
518 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
519
520 /* POSIX doesn't distinguish between an unmatched open-group and an
521 unmatched close-group: both are REG_EPAREN. */
522 if (ret == REG_ERPAREN)
523 ret = REG_EPAREN;
524
525 /* We have already checked preg->fastmap != NULL. */
526 if (BE (ret == REG_NOERROR, 1))
527 /* Compute the fastmap now, since regexec cannot modify the pattern
528 buffer. This function never fails in this implementation. */
529 (void) re_compile_fastmap (preg);
530 else
531 {
532 /* Some error occurred while compiling the expression. */
533 re_free (preg->fastmap);
534 preg->fastmap = NULL;
535 }
536
537 return (int) ret;
538}
539#ifdef _LIBC
540weak_alias (__regcomp, regcomp)
541#endif
542
543/* Returns a message corresponding to an error code, ERRCODE, returned
544 from either regcomp or regexec. We don't use PREG here. */
545
546size_t
547regerror(int errcode, const regex_t *__restrict preg,
548 char *__restrict errbuf, size_t errbuf_size)
549{
550 const char *msg;
551 size_t msg_size;
552
553 if (BE (errcode < 0
554 || errcode >= (int) (sizeof (__re_error_msgid_idx)
555 / sizeof (__re_error_msgid_idx[0])), 0))
556 /* Only error codes returned by the rest of the code should be passed
557 to this routine. If we are given anything else, or if other regex
558 code generates an invalid error code, then the program has a bug.
559 Dump core so we can fix it. */
560 abort ();
561
562 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563
564 msg_size = strlen (msg) + 1; /* Includes the null. */
565
566 if (BE (errbuf_size != 0, 1))
567 {
568 if (BE (msg_size > errbuf_size, 0))
569 {
570 memcpy (errbuf, msg, errbuf_size - 1);
571 errbuf[errbuf_size - 1] = 0;
572 }
573 else
574 memcpy (errbuf, msg, msg_size);
575 }
576
577 return msg_size;
578}
579#ifdef _LIBC
580weak_alias (__regerror, regerror)
581#endif
582
583
584#ifdef RE_ENABLE_I18N
585/* This static array is used for the map to single-byte characters when
586 UTF-8 is used. Otherwise we would allocate memory just to initialize
587 it the same all the time. UTF-8 is the preferred encoding so this is
588 a worthwhile optimization. */
589#if __GNUC__ >= 3
590static const bitset_t utf8_sb_map = {
591 /* Set the first 128 bits. */
592 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
593};
594#else /* ! (__GNUC__ >= 3) */
595static bitset_t utf8_sb_map;
596#endif /* __GNUC__ >= 3 */
597#endif /* RE_ENABLE_I18N */
598
599
600static void
601free_dfa_content (re_dfa_t *dfa)
602{
603 int i, j;
604
605 if (dfa->nodes)
606 for (i = 0; i < dfa->nodes_len; ++i)
607 free_token (dfa->nodes + i);
608 re_free (dfa->nexts);
609 for (i = 0; i < dfa->nodes_len; ++i)
610 {
611 if (dfa->eclosures != NULL)
612 re_node_set_free (dfa->eclosures + i);
613 if (dfa->inveclosures != NULL)
614 re_node_set_free (dfa->inveclosures + i);
615 if (dfa->edests != NULL)
616 re_node_set_free (dfa->edests + i);
617 }
618 re_free (dfa->edests);
619 re_free (dfa->eclosures);
620 re_free (dfa->inveclosures);
621 re_free (dfa->nodes);
622
623 if (dfa->state_table)
624 for (i = 0; i <= dfa->state_hash_mask; ++i)
625 {
626 struct re_state_table_entry *entry = dfa->state_table + i;
627 for (j = 0; j < entry->num; ++j)
628 {
629 re_dfastate_t *state = entry->array[j];
630 free_state (state);
631 }
632 re_free (entry->array);
633 }
634 re_free (dfa->state_table);
635#ifdef RE_ENABLE_I18N
636 if (dfa->sb_char != utf8_sb_map)
637 re_free (dfa->sb_char);
638#endif
639 re_free (dfa->subexp_map);
640#ifdef DEBUG
641 re_free (dfa->re_str);
642#endif
643
644 re_free (dfa);
645}
646
647
648/* Free dynamically allocated space used by PREG. */
649
650void
651regfree (regex_t *preg)
652{
653 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
654 if (BE (dfa != NULL, 1))
655 free_dfa_content (dfa);
656 preg->buffer = NULL;
657 preg->allocated = 0;
658
659 re_free (preg->fastmap);
660 preg->fastmap = NULL;
661
662 re_free (preg->translate);
663 preg->translate = NULL;
664}
665#ifdef _LIBC
666weak_alias (__regfree, regfree)
667#endif
668\f
669/* Entry points compatible with 4.2 BSD regex library. We don't define
670 them unless specifically requested. */
671
672#if defined _REGEX_RE_COMP || defined _LIBC
673
674/* BSD has one and only one pattern buffer. */
675static struct re_pattern_buffer re_comp_buf;
676
677char *
678# ifdef _LIBC
679/* Make these definitions weak in libc, so POSIX programs can redefine
680 these names if they don't use our functions, and still use
681 regcomp/regexec above without link errors. */
682weak_function
683# endif
684re_comp (s)
685 const char *s;
686{
687 reg_errcode_t ret;
688 char *fastmap;
689
690 if (!s)
691 {
692 if (!re_comp_buf.buffer)
693 return gettext ("No previous regular expression");
694 return 0;
695 }
696
697 if (re_comp_buf.buffer)
698 {
699 fastmap = re_comp_buf.fastmap;
700 re_comp_buf.fastmap = NULL;
701 __regfree (&re_comp_buf);
702 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
703 re_comp_buf.fastmap = fastmap;
704 }
705
706 if (re_comp_buf.fastmap == NULL)
707 {
708 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
709 if (re_comp_buf.fastmap == NULL)
710 return (char *) gettext (__re_error_msgid
711 + __re_error_msgid_idx[(int) REG_ESPACE]);
712 }
713
714 /* Since `re_exec' always passes NULL for the `regs' argument, we
715 don't need to initialize the pattern buffer fields which affect it. */
716
717 /* Match anchors at newlines. */
718 re_comp_buf.newline_anchor = 1;
719
720 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
721
722 if (!ret)
723 return NULL;
724
725 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
726 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
727}
728
729#ifdef _LIBC
730libc_freeres_fn (free_mem)
731{
732 __regfree (&re_comp_buf);
733}
734#endif
735
736#endif /* _REGEX_RE_COMP */
737\f
738/* Internal entry point.
739 Compile the regular expression PATTERN, whose length is LENGTH.
740 SYNTAX indicate regular expression's syntax. */
741
742static reg_errcode_t
743re_compile_internal (regex_t *preg, const char * pattern, size_t length,
744 reg_syntax_t syntax)
745{
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
749
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
758
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
762 {
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
772 }
773 preg->used = sizeof (re_dfa_t);
774
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
777 {
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
782 }
783#ifdef DEBUG
784 /* Note: length+1 will not overflow since it is checked in init_dfa. */
785 dfa->re_str = re_malloc (char, length + 1);
786 strncpy (dfa->re_str, pattern, length + 1);
787#endif
788
789 __libc_lock_init (dfa->lock);
790
791 err = re_string_construct (®exp, pattern, length, preg->translate,
792 syntax & RE_ICASE, dfa);
793 if (BE (err != REG_NOERROR, 0))
794 {
795 re_compile_internal_free_return:
796 free_workarea_compile (preg);
797 re_string_destruct (®exp);
798 free_dfa_content (dfa);
799 preg->buffer = NULL;
800 preg->allocated = 0;
801 return err;
802 }
803
804 /* Parse the regular expression, and build a structure tree. */
805 preg->re_nsub = 0;
806 dfa->str_tree = parse (®exp, preg, syntax, &err);
807 if (BE (dfa->str_tree == NULL, 0))
808 goto re_compile_internal_free_return;
809
810 /* Analyze the tree and create the nfa. */
811 err = analyze (preg);
812 if (BE (err != REG_NOERROR, 0))
813 goto re_compile_internal_free_return;
814
815#ifdef RE_ENABLE_I18N
816 /* If possible, do searching in single byte encoding to speed things up. */
817 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
818 optimize_utf8 (dfa);
819#endif
820
821 /* Then create the initial state of the dfa. */
822 err = create_initial_state (dfa);
823
824 /* Release work areas. */
825 free_workarea_compile (preg);
826 re_string_destruct (®exp);
827
828 if (BE (err != REG_NOERROR, 0))
829 {
830 free_dfa_content (dfa);
831 preg->buffer = NULL;
832 preg->allocated = 0;
833 }
834
835 return err;
836}
837
838/* Initialize DFA. We use the length of the regular expression PAT_LEN
839 as the initial length of some arrays. */
840
841static reg_errcode_t
842init_dfa (re_dfa_t *dfa, size_t pat_len)
843{
844 unsigned int table_size;
845#ifndef _LIBC
846 char *codeset_name;
847#endif
848
849 memset (dfa, '\0', sizeof (re_dfa_t));
850
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
853
854 /* Avoid overflows. */
855 if (pat_len == SIZE_MAX)
856 return REG_ESPACE;
857
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
865
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
868
869 dfa->mb_cur_max = MB_CUR_MAX;
870#ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876#else
877# ifdef HAVE_LANGINFO_CODESET
878 codeset_name = nl_langinfo (CODESET);
879# else
880 codeset_name = getenv ("LC_ALL");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LC_CTYPE");
883 if (codeset_name == NULL || codeset_name[0] == '\0')
884 codeset_name = getenv ("LANG");
885 if (codeset_name == NULL)
886 codeset_name = "";
887 else if (strchr (codeset_name, '.') != NULL)
888 codeset_name = strchr (codeset_name, '.') + 1;
889# endif
890
891 /* strcasecmp isn't a standard interface. brute force check */
892#if 0
893 if (strcasecmp (codeset_name, "UTF-8") == 0
894 || strcasecmp (codeset_name, "UTF8") == 0)
895 dfa->is_utf8 = 1;
896#else
897 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
898 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
899 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
900 && (codeset_name[3] == '-'
901 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
902 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
903 dfa->is_utf8 = 1;
904#endif
905
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
909#endif
910
911#ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
913 {
914 if (dfa->is_utf8)
915 {
916#if !defined(__GNUC__) || __GNUC__ < 3
917 static short utf8_sb_map_inited = 0;
918
919 if (! utf8_sb_map_inited)
920 {
921 int i;
922
923 utf8_sb_map_inited = 0;
924 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
925 utf8_sb_map[i] = BITSET_WORD_MAX;
926 }
927#endif
928 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
929 }
930 else
931 {
932 int i, j, ch;
933
934 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
935 if (BE (dfa->sb_char == NULL, 0))
936 return REG_ESPACE;
937
938 /* Set the bits corresponding to single byte chars. */
939 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
940 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
941 {
942 wint_t wch = __btowc (ch);
943 if (wch != WEOF)
944 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
945# ifndef _LIBC
946 if (isascii (ch) && wch != ch)
947 dfa->map_notascii = 1;
948# endif
949 }
950 }
951 }
952#endif
953
954 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
955 return REG_ESPACE;
956 return REG_NOERROR;
957}
958
959/* Initialize WORD_CHAR table, which indicate which character is
960 "word". In this case "word" means that it is the word construction
961 character used by some operators like "\<", "\>", etc. */
962
963static void
964internal_function
965init_word_char (re_dfa_t *dfa)
966{
967 int i, j, ch;
968 dfa->word_ops_used = 1;
969 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971 if (isalnum (ch) || ch == '_')
972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973}
974
975/* Free the work area which are only used while compiling. */
976
977static void
978free_workarea_compile (regex_t *preg)
979{
980 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
983 {
984 next = storage->next;
985 re_free (storage);
986 }
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989 dfa->str_tree = NULL;
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
992}
993
994/* Create initial states for all contexts. */
995
996static reg_errcode_t
997create_initial_state (re_dfa_t *dfa)
998{
999 int first, i;
1000 reg_errcode_t err;
1001 re_node_set init_nodes;
1002
1003 /* Initial states have the epsilon closure of the node which is
1004 the first node of the regular expression. */
1005 first = dfa->str_tree->first->node_idx;
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008 if (BE (err != REG_NOERROR, 0))
1009 return err;
1010
1011 /* The back-references which are in initial states can epsilon transit,
1012 since in this case all of the subexpressions can be null.
1013 Then we add epsilon closures of the nodes which are the next nodes of
1014 the back-references. */
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1017 {
1018 int node_idx = init_nodes.elems[i];
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021 int clexp_idx;
1022 if (type != OP_BACK_REF)
1023 continue;
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 {
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 break;
1031 }
1032 if (clexp_idx == init_nodes.nelem)
1033 continue;
1034
1035 if (type == OP_BACK_REF)
1036 {
1037 int dest_idx = dfa->edests[node_idx].elems[0];
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1039 {
1040 reg_errcode_t err = re_node_set_merge (&init_nodes,
1041 dfa->eclosures
1042 + dest_idx);
1043 if (err != REG_NOERROR)
1044 return err;
1045 i = 0;
1046 }
1047 }
1048 }
1049
1050 /* It must be the first time to invoke acquire_state. */
1051 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1052 /* We don't check ERR here, since the initial state must not be NULL. */
1053 if (BE (dfa->init_state == NULL, 0))
1054 return err;
1055 if (dfa->init_state->has_constraint)
1056 {
1057 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058 CONTEXT_WORD);
1059 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060 CONTEXT_NEWLINE);
1061 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1062 &init_nodes,
1063 CONTEXT_NEWLINE
1064 | CONTEXT_BEGBUF);
1065 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1066 || dfa->init_state_begbuf == NULL, 0))
1067 return err;
1068 }
1069 else
1070 dfa->init_state_word = dfa->init_state_nl
1071 = dfa->init_state_begbuf = dfa->init_state;
1072
1073 re_node_set_free (&init_nodes);
1074 return REG_NOERROR;
1075}
1076\f
1077#ifdef RE_ENABLE_I18N
1078/* If it is possible to do searching in single byte encoding instead of UTF-8
1079 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080 DFA nodes where needed. */
1081
1082static void
1083optimize_utf8 (re_dfa_t *dfa)
1084{
1085 int node, i, mb_chars = 0, has_period = 0;
1086
1087 for (node = 0; node < dfa->nodes_len; ++node)
1088 switch (dfa->nodes[node].type)
1089 {
1090 case CHARACTER:
1091 if (dfa->nodes[node].opr.c >= 0x80)
1092 mb_chars = 1;
1093 break;
1094 case ANCHOR:
1095 switch (dfa->nodes[node].opr.ctx_type)
1096 {
1097 case LINE_FIRST:
1098 case LINE_LAST:
1099 case BUF_FIRST:
1100 case BUF_LAST:
1101 break;
1102 default:
1103 /* Word anchors etc. cannot be handled. It's okay to test
1104 opr.ctx_type since constraints (for all DFA nodes) are
1105 created by ORing one or more opr.ctx_type values. */
1106 return;
1107 }
1108 break;
1109 case OP_PERIOD:
1110 has_period = 1;
1111 break;
1112 case OP_BACK_REF:
1113 case OP_ALT:
1114 case END_OF_RE:
1115 case OP_DUP_ASTERISK:
1116 case OP_OPEN_SUBEXP:
1117 case OP_CLOSE_SUBEXP:
1118 break;
1119 case COMPLEX_BRACKET:
1120 return;
1121 case SIMPLE_BRACKET:
1122 /* Just double check. The non-ASCII range starts at 0x80. */
1123 assert (0x80 % BITSET_WORD_BITS == 0);
1124 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1125 if (dfa->nodes[node].opr.sbcset[i])
1126 return;
1127 break;
1128 default:
1129 abort ();
1130 }
1131
1132 if (mb_chars || has_period)
1133 for (node = 0; node < dfa->nodes_len; ++node)
1134 {
1135 if (dfa->nodes[node].type == CHARACTER
1136 && dfa->nodes[node].opr.c >= 0x80)
1137 dfa->nodes[node].mb_partial = 0;
1138 else if (dfa->nodes[node].type == OP_PERIOD)
1139 dfa->nodes[node].type = OP_UTF8_PERIOD;
1140 }
1141
1142 /* The search can be in single byte locale. */
1143 dfa->mb_cur_max = 1;
1144 dfa->is_utf8 = 0;
1145 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1146}
1147#endif
1148\f
1149/* Analyze the structure tree, and calculate "first", "next", "edest",
1150 "eclosure", and "inveclosure". */
1151
1152static reg_errcode_t
1153analyze (regex_t *preg)
1154{
1155 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1156 reg_errcode_t ret;
1157
1158 /* Allocate arrays. */
1159 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1160 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1161 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1162 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1163 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1164 || dfa->eclosures == NULL, 0))
1165 return REG_ESPACE;
1166
1167 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1168 if (dfa->subexp_map != NULL)
1169 {
1170 int i;
1171 for (i = 0; i < preg->re_nsub; i++)
1172 dfa->subexp_map[i] = i;
1173 preorder (dfa->str_tree, optimize_subexps, dfa);
1174 for (i = 0; i < preg->re_nsub; i++)
1175 if (dfa->subexp_map[i] != i)
1176 break;
1177 if (i == preg->re_nsub)
1178 {
1179 free (dfa->subexp_map);
1180 dfa->subexp_map = NULL;
1181 }
1182 }
1183
1184 ret = postorder (dfa->str_tree, lower_subexps, preg);
1185 if (BE (ret != REG_NOERROR, 0))
1186 return ret;
1187 ret = postorder (dfa->str_tree, calc_first, dfa);
1188 if (BE (ret != REG_NOERROR, 0))
1189 return ret;
1190 preorder (dfa->str_tree, calc_next, dfa);
1191 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1192 if (BE (ret != REG_NOERROR, 0))
1193 return ret;
1194 ret = calc_eclosure (dfa);
1195 if (BE (ret != REG_NOERROR, 0))
1196 return ret;
1197
1198 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1199 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1200 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1201 || dfa->nbackref)
1202 {
1203 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1204 if (BE (dfa->inveclosures == NULL, 0))
1205 return REG_ESPACE;
1206 ret = calc_inveclosure (dfa);
1207 }
1208
1209 return ret;
1210}
1211
1212/* Our parse trees are very unbalanced, so we cannot use a stack to
1213 implement parse tree visits. Instead, we use parent pointers and
1214 some hairy code in these two functions. */
1215static reg_errcode_t
1216postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217 void *extra)
1218{
1219 bin_tree_t *node, *prev;
1220
1221 for (node = root; ; )
1222 {
1223 /* Descend down the tree, preferably to the left (or to the right
1224 if that's the only child). */
1225 while (node->left || node->right)
1226 if (node->left)
1227 node = node->left;
1228 else
1229 node = node->right;
1230
1231 do
1232 {
1233 reg_errcode_t err = fn (extra, node);
1234 if (BE (err != REG_NOERROR, 0))
1235 return err;
1236 if (node->parent == NULL)
1237 return REG_NOERROR;
1238 prev = node;
1239 node = node->parent;
1240 }
1241 /* Go up while we have a node that is reached from the right. */
1242 while (node->right == prev || node->right == NULL);
1243 node = node->right;
1244 }
1245}
1246
1247static reg_errcode_t
1248preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249 void *extra)
1250{
1251 bin_tree_t *node;
1252
1253 for (node = root; ; )
1254 {
1255 reg_errcode_t err = fn (extra, node);
1256 if (BE (err != REG_NOERROR, 0))
1257 return err;
1258
1259 /* Go to the left node, or up and to the right. */
1260 if (node->left)
1261 node = node->left;
1262 else
1263 {
1264 bin_tree_t *prev = NULL;
1265 while (node->right == prev || node->right == NULL)
1266 {
1267 prev = node;
1268 node = node->parent;
1269 if (!node)
1270 return REG_NOERROR;
1271 }
1272 node = node->right;
1273 }
1274 }
1275}
1276
1277/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1278 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1279 backreferences as well. Requires a preorder visit. */
1280static reg_errcode_t
1281optimize_subexps (void *extra, bin_tree_t *node)
1282{
1283 re_dfa_t *dfa = (re_dfa_t *) extra;
1284
1285 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1286 {
1287 int idx = node->token.opr.idx;
1288 node->token.opr.idx = dfa->subexp_map[idx];
1289 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1290 }
1291
1292 else if (node->token.type == SUBEXP
1293 && node->left && node->left->token.type == SUBEXP)
1294 {
1295 int other_idx = node->left->token.opr.idx;
1296
1297 node->left = node->left->left;
1298 if (node->left)
1299 node->left->parent = node;
1300
1301 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1302 if (other_idx < BITSET_WORD_BITS)
1303 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1304 }
1305
1306 return REG_NOERROR;
1307}
1308
1309/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1310 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1311static reg_errcode_t
1312lower_subexps (void *extra, bin_tree_t *node)
1313{
1314 regex_t *preg = (regex_t *) extra;
1315 reg_errcode_t err = REG_NOERROR;
1316
1317 if (node->left && node->left->token.type == SUBEXP)
1318 {
1319 node->left = lower_subexp (&err, preg, node->left);
1320 if (node->left)
1321 node->left->parent = node;
1322 }
1323 if (node->right && node->right->token.type == SUBEXP)
1324 {
1325 node->right = lower_subexp (&err, preg, node->right);
1326 if (node->right)
1327 node->right->parent = node;
1328 }
1329
1330 return err;
1331}
1332
1333static bin_tree_t *
1334lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1335{
1336 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1337 bin_tree_t *body = node->left;
1338 bin_tree_t *op, *cls, *tree1, *tree;
1339
1340 if (preg->no_sub
1341 /* We do not optimize empty subexpressions, because otherwise we may
1342 have bad CONCAT nodes with NULL children. This is obviously not
1343 very common, so we do not lose much. An example that triggers
1344 this case is the sed "script" /\(\)/x. */
1345 && node->left != NULL
1346 && (node->token.opr.idx >= BITSET_WORD_BITS
1347 || !(dfa->used_bkref_map
1348 & ((bitset_word_t) 1 << node->token.opr.idx))))
1349 return node->left;
1350
1351 /* Convert the SUBEXP node to the concatenation of an
1352 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1353 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1354 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1355 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1356 tree = create_tree (dfa, op, tree1, CONCAT);
1357 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1358 {
1359 *err = REG_ESPACE;
1360 return NULL;
1361 }
1362
1363 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1364 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1365 return tree;
1366}
1367
1368/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1369 nodes. Requires a postorder visit. */
1370static reg_errcode_t
1371calc_first (void *extra, bin_tree_t *node)
1372{
1373 re_dfa_t *dfa = (re_dfa_t *) extra;
1374 if (node->token.type == CONCAT)
1375 {
1376 node->first = node->left->first;
1377 node->node_idx = node->left->node_idx;
1378 }
1379 else
1380 {
1381 node->first = node;
1382 node->node_idx = re_dfa_add_node (dfa, node->token);
1383 if (BE (node->node_idx == -1, 0))
1384 return REG_ESPACE;
1385 if (node->token.type == ANCHOR)
1386 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1387 }
1388 return REG_NOERROR;
1389}
1390
1391/* Pass 2: compute NEXT on the tree. Preorder visit. */
1392static reg_errcode_t
1393calc_next (void *extra, bin_tree_t *node)
1394{
1395 switch (node->token.type)
1396 {
1397 case OP_DUP_ASTERISK:
1398 node->left->next = node;
1399 break;
1400 case CONCAT:
1401 node->left->next = node->right->first;
1402 node->right->next = node->next;
1403 break;
1404 default:
1405 if (node->left)
1406 node->left->next = node->next;
1407 if (node->right)
1408 node->right->next = node->next;
1409 break;
1410 }
1411 return REG_NOERROR;
1412}
1413
1414/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1415static reg_errcode_t
1416link_nfa_nodes (void *extra, bin_tree_t *node)
1417{
1418 re_dfa_t *dfa = (re_dfa_t *) extra;
1419 int idx = node->node_idx;
1420 reg_errcode_t err = REG_NOERROR;
1421
1422 switch (node->token.type)
1423 {
1424 case CONCAT:
1425 break;
1426
1427 case END_OF_RE:
1428 assert (node->next == NULL);
1429 break;
1430
1431 case OP_DUP_ASTERISK:
1432 case OP_ALT:
1433 {
1434 int left, right;
1435 dfa->has_plural_match = 1;
1436 if (node->left != NULL)
1437 left = node->left->first->node_idx;
1438 else
1439 left = node->next->node_idx;
1440 if (node->right != NULL)
1441 right = node->right->first->node_idx;
1442 else
1443 right = node->next->node_idx;
1444 assert (left > -1);
1445 assert (right > -1);
1446 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1447 }
1448 break;
1449
1450 case ANCHOR:
1451 case OP_OPEN_SUBEXP:
1452 case OP_CLOSE_SUBEXP:
1453 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1454 break;
1455
1456 case OP_BACK_REF:
1457 dfa->nexts[idx] = node->next->node_idx;
1458 if (node->token.type == OP_BACK_REF)
1459 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1460 break;
1461
1462 default:
1463 assert (!IS_EPSILON_NODE (node->token.type));
1464 dfa->nexts[idx] = node->next->node_idx;
1465 break;
1466 }
1467
1468 return err;
1469}
1470
1471/* Duplicate the epsilon closure of the node ROOT_NODE.
1472 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1473 to their own constraint. */
1474
1475static reg_errcode_t
1476internal_function
1477duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1478 int root_node, unsigned int init_constraint)
1479{
1480 int org_node, clone_node, ret;
1481 unsigned int constraint = init_constraint;
1482 for (org_node = top_org_node, clone_node = top_clone_node;;)
1483 {
1484 int org_dest, clone_dest;
1485 if (dfa->nodes[org_node].type == OP_BACK_REF)
1486 {
1487 /* If the back reference epsilon-transit, its destination must
1488 also have the constraint. Then duplicate the epsilon closure
1489 of the destination of the back reference, and store it in
1490 edests of the back reference. */
1491 org_dest = dfa->nexts[org_node];
1492 re_node_set_empty (dfa->edests + clone_node);
1493 clone_dest = duplicate_node (dfa, org_dest, constraint);
1494 if (BE (clone_dest == -1, 0))
1495 return REG_ESPACE;
1496 dfa->nexts[clone_node] = dfa->nexts[org_node];
1497 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498 if (BE (ret < 0, 0))
1499 return REG_ESPACE;
1500 }
1501 else if (dfa->edests[org_node].nelem == 0)
1502 {
1503 /* In case of the node can't epsilon-transit, don't duplicate the
1504 destination and store the original destination as the
1505 destination of the node. */
1506 dfa->nexts[clone_node] = dfa->nexts[org_node];
1507 break;
1508 }
1509 else if (dfa->edests[org_node].nelem == 1)
1510 {
1511 /* In case of the node can epsilon-transit, and it has only one
1512 destination. */
1513 org_dest = dfa->edests[org_node].elems[0];
1514 re_node_set_empty (dfa->edests + clone_node);
1515 /* If the node is root_node itself, it means the epsilon clsoure
1516 has a loop. Then tie it to the destination of the root_node. */
1517 if (org_node == root_node && clone_node != org_node)
1518 {
1519 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1520 if (BE (ret < 0, 0))
1521 return REG_ESPACE;
1522 break;
1523 }
1524 /* In case of the node has another constraint, add it. */
1525 constraint |= dfa->nodes[org_node].constraint;
1526 clone_dest = duplicate_node (dfa, org_dest, constraint);
1527 if (BE (clone_dest == -1, 0))
1528 return REG_ESPACE;
1529 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1530 if (BE (ret < 0, 0))
1531 return REG_ESPACE;
1532 }
1533 else /* dfa->edests[org_node].nelem == 2 */
1534 {
1535 /* In case of the node can epsilon-transit, and it has two
1536 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1537 org_dest = dfa->edests[org_node].elems[0];
1538 re_node_set_empty (dfa->edests + clone_node);
1539 /* Search for a duplicated node which satisfies the constraint. */
1540 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1541 if (clone_dest == -1)
1542 {
1543 /* There is no such duplicated node, create a new one. */
1544 reg_errcode_t err;
1545 clone_dest = duplicate_node (dfa, org_dest, constraint);
1546 if (BE (clone_dest == -1, 0))
1547 return REG_ESPACE;
1548 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549 if (BE (ret < 0, 0))
1550 return REG_ESPACE;
1551 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1552 root_node, constraint);
1553 if (BE (err != REG_NOERROR, 0))
1554 return err;
1555 }
1556 else
1557 {
1558 /* There is a duplicated node which satisfies the constraint,
1559 use it to avoid infinite loop. */
1560 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561 if (BE (ret < 0, 0))
1562 return REG_ESPACE;
1563 }
1564
1565 org_dest = dfa->edests[org_node].elems[1];
1566 clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 if (BE (clone_dest == -1, 0))
1568 return REG_ESPACE;
1569 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570 if (BE (ret < 0, 0))
1571 return REG_ESPACE;
1572 }
1573 org_node = org_dest;
1574 clone_node = clone_dest;
1575 }
1576 return REG_NOERROR;
1577}
1578
1579/* Search for a node which is duplicated from the node ORG_NODE, and
1580 satisfies the constraint CONSTRAINT. */
1581
1582static int
1583search_duplicated_node (const re_dfa_t *dfa, int org_node,
1584 unsigned int constraint)
1585{
1586 int idx;
1587 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588 {
1589 if (org_node == dfa->org_indices[idx]
1590 && constraint == dfa->nodes[idx].constraint)
1591 return idx; /* Found. */
1592 }
1593 return -1; /* Not found. */
1594}
1595
1596/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1597 Return the index of the new node, or -1 if insufficient storage is
1598 available. */
1599
1600static int
1601duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1602{
1603 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604 if (BE (dup_idx != -1, 1))
1605 {
1606 dfa->nodes[dup_idx].constraint = constraint;
1607 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1608 dfa->nodes[dup_idx].duplicated = 1;
1609
1610 /* Store the index of the original node. */
1611 dfa->org_indices[dup_idx] = org_idx;
1612 }
1613 return dup_idx;
1614}
1615
1616static reg_errcode_t
1617calc_inveclosure (re_dfa_t *dfa)
1618{
1619 int src, idx, ret;
1620 for (idx = 0; idx < dfa->nodes_len; ++idx)
1621 re_node_set_init_empty (dfa->inveclosures + idx);
1622
1623 for (src = 0; src < dfa->nodes_len; ++src)
1624 {
1625 int *elems = dfa->eclosures[src].elems;
1626 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1627 {
1628 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1629 if (BE (ret == -1, 0))
1630 return REG_ESPACE;
1631 }
1632 }
1633
1634 return REG_NOERROR;
1635}
1636
1637/* Calculate "eclosure" for all the node in DFA. */
1638
1639static reg_errcode_t
1640calc_eclosure (re_dfa_t *dfa)
1641{
1642 int node_idx, incomplete;
1643#ifdef DEBUG
1644 assert (dfa->nodes_len > 0);
1645#endif
1646 incomplete = 0;
1647 /* For each nodes, calculate epsilon closure. */
1648 for (node_idx = 0; ; ++node_idx)
1649 {
1650 reg_errcode_t err;
1651 re_node_set eclosure_elem;
1652 if (node_idx == dfa->nodes_len)
1653 {
1654 if (!incomplete)
1655 break;
1656 incomplete = 0;
1657 node_idx = 0;
1658 }
1659
1660#ifdef DEBUG
1661 assert (dfa->eclosures[node_idx].nelem != -1);
1662#endif
1663
1664 /* If we have already calculated, skip it. */
1665 if (dfa->eclosures[node_idx].nelem != 0)
1666 continue;
1667 /* Calculate epsilon closure of `node_idx'. */
1668 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1669 if (BE (err != REG_NOERROR, 0))
1670 return err;
1671
1672 if (dfa->eclosures[node_idx].nelem == 0)
1673 {
1674 incomplete = 1;
1675 re_node_set_free (&eclosure_elem);
1676 }
1677 }
1678 return REG_NOERROR;
1679}
1680
1681/* Calculate epsilon closure of NODE. */
1682
1683static reg_errcode_t
1684calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1685{
1686 reg_errcode_t err;
1687 int i;
1688 re_node_set eclosure;
1689 int ret;
1690 int incomplete = 0;
1691 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1692 if (BE (err != REG_NOERROR, 0))
1693 return err;
1694
1695 /* This indicates that we are calculating this node now.
1696 We reference this value to avoid infinite loop. */
1697 dfa->eclosures[node].nelem = -1;
1698
1699 /* If the current node has constraints, duplicate all nodes
1700 since they must inherit the constraints. */
1701 if (dfa->nodes[node].constraint
1702 && dfa->edests[node].nelem
1703 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1704 {
1705 err = duplicate_node_closure (dfa, node, node, node,
1706 dfa->nodes[node].constraint);
1707 if (BE (err != REG_NOERROR, 0))
1708 return err;
1709 }
1710
1711 /* Expand each epsilon destination nodes. */
1712 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713 for (i = 0; i < dfa->edests[node].nelem; ++i)
1714 {
1715 re_node_set eclosure_elem;
1716 int edest = dfa->edests[node].elems[i];
1717 /* If calculating the epsilon closure of `edest' is in progress,
1718 return intermediate result. */
1719 if (dfa->eclosures[edest].nelem == -1)
1720 {
1721 incomplete = 1;
1722 continue;
1723 }
1724 /* If we haven't calculated the epsilon closure of `edest' yet,
1725 calculate now. Otherwise use calculated epsilon closure. */
1726 if (dfa->eclosures[edest].nelem == 0)
1727 {
1728 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1731 }
1732 else
1733 eclosure_elem = dfa->eclosures[edest];
1734 /* Merge the epsilon closure of `edest'. */
1735 err = re_node_set_merge (&eclosure, &eclosure_elem);
1736 if (BE (err != REG_NOERROR, 0))
1737 return err;
1738 /* If the epsilon closure of `edest' is incomplete,
1739 the epsilon closure of this node is also incomplete. */
1740 if (dfa->eclosures[edest].nelem == 0)
1741 {
1742 incomplete = 1;
1743 re_node_set_free (&eclosure_elem);
1744 }
1745 }
1746
1747 /* An epsilon closure includes itself. */
1748 ret = re_node_set_insert (&eclosure, node);
1749 if (BE (ret < 0, 0))
1750 return REG_ESPACE;
1751 if (incomplete && !root)
1752 dfa->eclosures[node].nelem = 0;
1753 else
1754 dfa->eclosures[node] = eclosure;
1755 *new_set = eclosure;
1756 return REG_NOERROR;
1757}
1758\f
1759/* Functions for token which are used in the parser. */
1760
1761/* Fetch a token from INPUT.
1762 We must not use this function inside bracket expressions. */
1763
1764static void
1765internal_function
1766fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1767{
1768 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769}
1770
1771/* Peek a token from INPUT, and return the length of the token.
1772 We must not use this function inside bracket expressions. */
1773
1774static int
1775internal_function
1776peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1777{
1778 unsigned char c;
1779
1780 if (re_string_eoi (input))
1781 {
1782 token->type = END_OF_RE;
1783 return 0;
1784 }
1785
1786 c = re_string_peek_byte (input, 0);
1787 token->opr.c = c;
1788
1789 token->word_char = 0;
1790#ifdef RE_ENABLE_I18N
1791 token->mb_partial = 0;
1792 if (input->mb_cur_max > 1 &&
1793 !re_string_first_byte (input, re_string_cur_idx (input)))
1794 {
1795 token->type = CHARACTER;
1796 token->mb_partial = 1;
1797 return 1;
1798 }
1799#endif
1800 if (c == '\\')
1801 {
1802 unsigned char c2;
1803 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1804 {
1805 token->type = BACK_SLASH;
1806 return 1;
1807 }
1808
1809 c2 = re_string_peek_byte_case (input, 1);
1810 token->opr.c = c2;
1811 token->type = CHARACTER;
1812#ifdef RE_ENABLE_I18N
1813 if (input->mb_cur_max > 1)
1814 {
1815 wint_t wc = re_string_wchar_at (input,
1816 re_string_cur_idx (input) + 1);
1817 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1818 }
1819 else
1820#endif
1821 token->word_char = IS_WORD_CHAR (c2) != 0;
1822
1823 switch (c2)
1824 {
1825 case '|':
1826 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1827 token->type = OP_ALT;
1828 break;
1829 case '1': case '2': case '3': case '4': case '5':
1830 case '6': case '7': case '8': case '9':
1831 if (!(syntax & RE_NO_BK_REFS))
1832 {
1833 token->type = OP_BACK_REF;
1834 token->opr.idx = c2 - '1';
1835 }
1836 break;
1837 case '<':
1838 if (!(syntax & RE_NO_GNU_OPS))
1839 {
1840 token->type = ANCHOR;
1841 token->opr.ctx_type = WORD_FIRST;
1842 }
1843 break;
1844 case '>':
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 {
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = WORD_LAST;
1849 }
1850 break;
1851 case 'b':
1852 if (!(syntax & RE_NO_GNU_OPS))
1853 {
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = WORD_DELIM;
1856 }
1857 break;
1858 case 'B':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 {
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = NOT_WORD_DELIM;
1863 }
1864 break;
1865 case 'w':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = OP_WORD;
1868 break;
1869 case 'W':
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = OP_NOTWORD;
1872 break;
1873 case 's':
1874 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = OP_SPACE;
1876 break;
1877 case 'S':
1878 if (!(syntax & RE_NO_GNU_OPS))
1879 token->type = OP_NOTSPACE;
1880 break;
1881 case '`':
1882 if (!(syntax & RE_NO_GNU_OPS))
1883 {
1884 token->type = ANCHOR;
1885 token->opr.ctx_type = BUF_FIRST;
1886 }
1887 break;
1888 case '\'':
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 {
1891 token->type = ANCHOR;
1892 token->opr.ctx_type = BUF_LAST;
1893 }
1894 break;
1895 case '(':
1896 if (!(syntax & RE_NO_BK_PARENS))
1897 token->type = OP_OPEN_SUBEXP;
1898 break;
1899 case ')':
1900 if (!(syntax & RE_NO_BK_PARENS))
1901 token->type = OP_CLOSE_SUBEXP;
1902 break;
1903 case '+':
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905 token->type = OP_DUP_PLUS;
1906 break;
1907 case '?':
1908 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1909 token->type = OP_DUP_QUESTION;
1910 break;
1911 case '{':
1912 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913 token->type = OP_OPEN_DUP_NUM;
1914 break;
1915 case '}':
1916 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1917 token->type = OP_CLOSE_DUP_NUM;
1918 break;
1919 default:
1920 break;
1921 }
1922 return 2;
1923 }
1924
1925 token->type = CHARACTER;
1926#ifdef RE_ENABLE_I18N
1927 if (input->mb_cur_max > 1)
1928 {
1929 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1930 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1931 }
1932 else
1933#endif
1934 token->word_char = IS_WORD_CHAR (token->opr.c);
1935
1936 switch (c)
1937 {
1938 case '\n':
1939 if (syntax & RE_NEWLINE_ALT)
1940 token->type = OP_ALT;
1941 break;
1942 case '|':
1943 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1944 token->type = OP_ALT;
1945 break;
1946 case '*':
1947 token->type = OP_DUP_ASTERISK;
1948 break;
1949 case '+':
1950 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951 token->type = OP_DUP_PLUS;
1952 break;
1953 case '?':
1954 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1955 token->type = OP_DUP_QUESTION;
1956 break;
1957 case '{':
1958 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959 token->type = OP_OPEN_DUP_NUM;
1960 break;
1961 case '}':
1962 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1963 token->type = OP_CLOSE_DUP_NUM;
1964 break;
1965 case '(':
1966 if (syntax & RE_NO_BK_PARENS)
1967 token->type = OP_OPEN_SUBEXP;
1968 break;
1969 case ')':
1970 if (syntax & RE_NO_BK_PARENS)
1971 token->type = OP_CLOSE_SUBEXP;
1972 break;
1973 case '[':
1974 token->type = OP_OPEN_BRACKET;
1975 break;
1976 case '.':
1977 token->type = OP_PERIOD;
1978 break;
1979 case '^':
1980 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1981 re_string_cur_idx (input) != 0)
1982 {
1983 char prev = re_string_peek_byte (input, -1);
1984 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985 break;
1986 }
1987 token->type = ANCHOR;
1988 token->opr.ctx_type = LINE_FIRST;
1989 break;
1990 case '$':
1991 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1992 re_string_cur_idx (input) + 1 != re_string_length (input))
1993 {
1994 re_token_t next;
1995 re_string_skip_bytes (input, 1);
1996 peek_token (&next, input, syntax);
1997 re_string_skip_bytes (input, -1);
1998 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999 break;
2000 }
2001 token->type = ANCHOR;
2002 token->opr.ctx_type = LINE_LAST;
2003 break;
2004 default:
2005 break;
2006 }
2007 return 1;
2008}
2009
2010/* Peek a token from INPUT, and return the length of the token.
2011 We must not use this function out of bracket expressions. */
2012
2013static int
2014internal_function
2015peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016{
2017 unsigned char c;
2018 if (re_string_eoi (input))
2019 {
2020 token->type = END_OF_RE;
2021 return 0;
2022 }
2023 c = re_string_peek_byte (input, 0);
2024 token->opr.c = c;
2025
2026#ifdef RE_ENABLE_I18N
2027 if (input->mb_cur_max > 1 &&
2028 !re_string_first_byte (input, re_string_cur_idx (input)))
2029 {
2030 token->type = CHARACTER;
2031 return 1;
2032 }
2033#endif /* RE_ENABLE_I18N */
2034
2035 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2036 && re_string_cur_idx (input) + 1 < re_string_length (input))
2037 {
2038 /* In this case, '\' escape a character. */
2039 unsigned char c2;
2040 re_string_skip_bytes (input, 1);
2041 c2 = re_string_peek_byte (input, 0);
2042 token->opr.c = c2;
2043 token->type = CHARACTER;
2044 return 1;
2045 }
2046 if (c == '[') /* '[' is a special char in a bracket exps. */
2047 {
2048 unsigned char c2;
2049 int token_len;
2050 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2051 c2 = re_string_peek_byte (input, 1);
2052 else
2053 c2 = 0;
2054 token->opr.c = c2;
2055 token_len = 2;
2056 switch (c2)
2057 {
2058 case '.':
2059 token->type = OP_OPEN_COLL_ELEM;
2060 break;
2061 case '=':
2062 token->type = OP_OPEN_EQUIV_CLASS;
2063 break;
2064 case ':':
2065 if (syntax & RE_CHAR_CLASSES)
2066 {
2067 token->type = OP_OPEN_CHAR_CLASS;
2068 break;
2069 }
2070 /* else fall through. */
2071 default:
2072 token->type = CHARACTER;
2073 token->opr.c = c;
2074 token_len = 1;
2075 break;
2076 }
2077 return token_len;
2078 }
2079 switch (c)
2080 {
2081 case '-':
2082 token->type = OP_CHARSET_RANGE;
2083 break;
2084 case ']':
2085 token->type = OP_CLOSE_BRACKET;
2086 break;
2087 case '^':
2088 token->type = OP_NON_MATCH_LIST;
2089 break;
2090 default:
2091 token->type = CHARACTER;
2092 }
2093 return 1;
2094}
2095\f
2096/* Functions for parser. */
2097
2098/* Entry point of the parser.
2099 Parse the regular expression REGEXP and return the structure tree.
2100 If an error has occurred, ERR is set by error code, and return NULL.
2101 This function build the following tree, from regular expression <reg_exp>:
2102 CAT
2103 / \
2104 / \
2105 <reg_exp> EOR
2106
2107 CAT means concatenation.
2108 EOR means end of regular expression. */
2109
2110static bin_tree_t *
2111parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112 reg_errcode_t *err)
2113{
2114 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2115 bin_tree_t *tree, *eor, *root;
2116 re_token_t current_token;
2117 dfa->syntax = syntax;
2118 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2119 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2120 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121 return NULL;
2122 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2123 if (tree != NULL)
2124 root = create_tree (dfa, tree, eor, CONCAT);
2125 else
2126 root = eor;
2127 if (BE (eor == NULL || root == NULL, 0))
2128 {
2129 *err = REG_ESPACE;
2130 return NULL;
2131 }
2132 return root;
2133}
2134
2135/* This function build the following tree, from regular expression
2136 <branch1>|<branch2>:
2137 ALT
2138 / \
2139 / \
2140 <branch1> <branch2>
2141
2142 ALT means alternative, which represents the operator `|'. */
2143
2144static bin_tree_t *
2145parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2146 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2147{
2148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2149 bin_tree_t *tree, *branch = NULL;
2150 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2151 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 return NULL;
2153
2154 while (token->type == OP_ALT)
2155 {
2156 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2157 if (token->type != OP_ALT && token->type != END_OF_RE
2158 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159 {
2160 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2161 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162 return NULL;
2163 }
2164 else
2165 branch = NULL;
2166 tree = create_tree (dfa, tree, branch, OP_ALT);
2167 if (BE (tree == NULL, 0))
2168 {
2169 *err = REG_ESPACE;
2170 return NULL;
2171 }
2172 }
2173 return tree;
2174}
2175
2176/* This function build the following tree, from regular expression
2177 <exp1><exp2>:
2178 CAT
2179 / \
2180 / \
2181 <exp1> <exp2>
2182
2183 CAT means concatenation. */
2184
2185static bin_tree_t *
2186parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2187 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2188{
2189 bin_tree_t *tree, *exp;
2190 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2191 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 return NULL;
2194
2195 while (token->type != OP_ALT && token->type != END_OF_RE
2196 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2197 {
2198 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2199 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2200 {
2201 return NULL;
2202 }
2203 if (tree != NULL && exp != NULL)
2204 {
2205 tree = create_tree (dfa, tree, exp, CONCAT);
2206 if (tree == NULL)
2207 {
2208 *err = REG_ESPACE;
2209 return NULL;
2210 }
2211 }
2212 else if (tree == NULL)
2213 tree = exp;
2214 /* Otherwise exp == NULL, we don't need to create new tree. */
2215 }
2216 return tree;
2217}
2218
2219/* This function build the following tree, from regular expression a*:
2220 *
2221 |
2222 a
2223*/
2224
2225static bin_tree_t *
2226parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2228{
2229 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2230 bin_tree_t *tree;
2231 switch (token->type)
2232 {
2233 case CHARACTER:
2234 tree = create_token_tree (dfa, NULL, NULL, token);
2235 if (BE (tree == NULL, 0))
2236 {
2237 *err = REG_ESPACE;
2238 return NULL;
2239 }
2240#ifdef RE_ENABLE_I18N
2241 if (dfa->mb_cur_max > 1)
2242 {
2243 while (!re_string_eoi (regexp)
2244 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2245 {
2246 bin_tree_t *mbc_remain;
2247 fetch_token (token, regexp, syntax);
2248 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2249 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2250 if (BE (mbc_remain == NULL || tree == NULL, 0))
2251 {
2252 *err = REG_ESPACE;
2253 return NULL;
2254 }
2255 }
2256 }
2257#endif
2258 break;
2259 case OP_OPEN_SUBEXP:
2260 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2261 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 return NULL;
2263 break;
2264 case OP_OPEN_BRACKET:
2265 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2266 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 return NULL;
2268 break;
2269 case OP_BACK_REF:
2270 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271 {
2272 *err = REG_ESUBREG;
2273 return NULL;
2274 }
2275 dfa->used_bkref_map |= 1 << token->opr.idx;
2276 tree = create_token_tree (dfa, NULL, NULL, token);
2277 if (BE (tree == NULL, 0))
2278 {
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
2282 ++dfa->nbackref;
2283 dfa->has_mb_node = 1;
2284 break;
2285 case OP_OPEN_DUP_NUM:
2286 if (syntax & RE_CONTEXT_INVALID_DUP)
2287 {
2288 *err = REG_BADRPT;
2289 return NULL;
2290 }
2291 /* FALLTHROUGH */
2292 case OP_DUP_ASTERISK:
2293 case OP_DUP_PLUS:
2294 case OP_DUP_QUESTION:
2295 if (syntax & RE_CONTEXT_INVALID_OPS)
2296 {
2297 *err = REG_BADRPT;
2298 return NULL;
2299 }
2300 else if (syntax & RE_CONTEXT_INDEP_OPS)
2301 {
2302 fetch_token (token, regexp, syntax);
2303 return parse_expression (regexp, preg, token, syntax, nest, err);
2304 }
2305 /* else fall through */
2306 case OP_CLOSE_SUBEXP:
2307 if ((token->type == OP_CLOSE_SUBEXP) &&
2308 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309 {
2310 *err = REG_ERPAREN;
2311 return NULL;
2312 }
2313 /* else fall through */
2314 case OP_CLOSE_DUP_NUM:
2315 /* We treat it as a normal character. */
2316
2317 /* Then we can these characters as normal characters. */
2318 token->type = CHARACTER;
2319 /* mb_partial and word_char bits should be initialized already
2320 by peek_token. */
2321 tree = create_token_tree (dfa, NULL, NULL, token);
2322 if (BE (tree == NULL, 0))
2323 {
2324 *err = REG_ESPACE;
2325 return NULL;
2326 }
2327 break;
2328 case ANCHOR:
2329 if ((token->opr.ctx_type
2330 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2331 && dfa->word_ops_used == 0)
2332 init_word_char (dfa);
2333 if (token->opr.ctx_type == WORD_DELIM
2334 || token->opr.ctx_type == NOT_WORD_DELIM)
2335 {
2336 bin_tree_t *tree_first, *tree_last;
2337 if (token->opr.ctx_type == WORD_DELIM)
2338 {
2339 token->opr.ctx_type = WORD_FIRST;
2340 tree_first = create_token_tree (dfa, NULL, NULL, token);
2341 token->opr.ctx_type = WORD_LAST;
2342 }
2343 else
2344 {
2345 token->opr.ctx_type = INSIDE_WORD;
2346 tree_first = create_token_tree (dfa, NULL, NULL, token);
2347 token->opr.ctx_type = INSIDE_NOTWORD;
2348 }
2349 tree_last = create_token_tree (dfa, NULL, NULL, token);
2350 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2352 {
2353 *err = REG_ESPACE;
2354 return NULL;
2355 }
2356 }
2357 else
2358 {
2359 tree = create_token_tree (dfa, NULL, NULL, token);
2360 if (BE (tree == NULL, 0))
2361 {
2362 *err = REG_ESPACE;
2363 return NULL;
2364 }
2365 }
2366 /* We must return here, since ANCHORs can't be followed
2367 by repetition operators.
2368 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2369 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2370 fetch_token (token, regexp, syntax);
2371 return tree;
2372 case OP_PERIOD:
2373 tree = create_token_tree (dfa, NULL, NULL, token);
2374 if (BE (tree == NULL, 0))
2375 {
2376 *err = REG_ESPACE;
2377 return NULL;
2378 }
2379 if (dfa->mb_cur_max > 1)
2380 dfa->has_mb_node = 1;
2381 break;
2382 case OP_WORD:
2383 case OP_NOTWORD:
2384 tree = build_charclass_op (dfa, regexp->trans,
2385 "alnum",
2386 "_",
2387 token->type == OP_NOTWORD, err);
2388 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 return NULL;
2390 break;
2391 case OP_SPACE:
2392 case OP_NOTSPACE:
2393 tree = build_charclass_op (dfa, regexp->trans,
2394 "space",
2395 "",
2396 token->type == OP_NOTSPACE, err);
2397 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398 return NULL;
2399 break;
2400 case OP_ALT:
2401 case END_OF_RE:
2402 return NULL;
2403 case BACK_SLASH:
2404 *err = REG_EESCAPE;
2405 return NULL;
2406 default:
2407 /* Must not happen? */
2408#ifdef DEBUG
2409 assert (0);
2410#endif
2411 return NULL;
2412 }
2413 fetch_token (token, regexp, syntax);
2414
2415 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2416 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2417 {
2418 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2419 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420 return NULL;
2421 /* In BRE consecutive duplications are not allowed. */
2422 if ((syntax & RE_CONTEXT_INVALID_DUP)
2423 && (token->type == OP_DUP_ASTERISK
2424 || token->type == OP_OPEN_DUP_NUM))
2425 {
2426 *err = REG_BADRPT;
2427 return NULL;
2428 }
2429 }
2430
2431 return tree;
2432}
2433
2434/* This function build the following tree, from regular expression
2435 (<reg_exp>):
2436 SUBEXP
2437 |
2438 <reg_exp>
2439*/
2440
2441static bin_tree_t *
2442parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2443 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2444{
2445 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446 bin_tree_t *tree;
2447 size_t cur_nsub;
2448 cur_nsub = preg->re_nsub++;
2449
2450 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2451
2452 /* The subexpression may be a null string. */
2453 if (token->type == OP_CLOSE_SUBEXP)
2454 tree = NULL;
2455 else
2456 {
2457 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2459 *err = REG_EPAREN;
2460 if (BE (*err != REG_NOERROR, 0))
2461 return NULL;
2462 }
2463
2464 if (cur_nsub <= '9' - '1')
2465 dfa->completed_bkref_map |= 1 << cur_nsub;
2466
2467 tree = create_tree (dfa, tree, NULL, SUBEXP);
2468 if (BE (tree == NULL, 0))
2469 {
2470 *err = REG_ESPACE;
2471 return NULL;
2472 }
2473 tree->token.opr.idx = cur_nsub;
2474 return tree;
2475}
2476
2477/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2478
2479static bin_tree_t *
2480parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2481 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2482{
2483 bin_tree_t *tree = NULL, *old_tree = NULL;
2484 int i, start, end, start_idx = re_string_cur_idx (regexp);
2485#ifndef RE_TOKEN_INIT_BUG
2486 re_token_t start_token = *token;
2487#else
2488 re_token_t start_token;
2489
2490 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2491#endif
2492
2493 if (token->type == OP_OPEN_DUP_NUM)
2494 {
2495 end = 0;
2496 start = fetch_number (regexp, token, syntax);
2497 if (start == -1)
2498 {
2499 if (token->type == CHARACTER && token->opr.c == ',')
2500 start = 0; /* We treat "{,m}" as "{0,m}". */
2501 else
2502 {
2503 *err = REG_BADBR; /* <re>{} is invalid. */
2504 return NULL;
2505 }
2506 }
2507 if (BE (start != -2, 1))
2508 {
2509 /* We treat "{n}" as "{n,n}". */
2510 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2511 : ((token->type == CHARACTER && token->opr.c == ',')
2512 ? fetch_number (regexp, token, syntax) : -2));
2513 }
2514 if (BE (start == -2 || end == -2, 0))
2515 {
2516 /* Invalid sequence. */
2517 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2518 {
2519 if (token->type == END_OF_RE)
2520 *err = REG_EBRACE;
2521 else
2522 *err = REG_BADBR;
2523
2524 return NULL;
2525 }
2526
2527 /* If the syntax bit is set, rollback. */
2528 re_string_set_index (regexp, start_idx);
2529 *token = start_token;
2530 token->type = CHARACTER;
2531 /* mb_partial and word_char bits should be already initialized by
2532 peek_token. */
2533 return elem;
2534 }
2535
2536 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2537 {
2538 /* First number greater than second. */
2539 *err = REG_BADBR;
2540 return NULL;
2541 }
2542 }
2543 else
2544 {
2545 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2546 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2547 }
2548
2549 fetch_token (token, regexp, syntax);
2550
2551 if (BE (elem == NULL, 0))
2552 return NULL;
2553 if (BE (start == 0 && end == 0, 0))
2554 {
2555 postorder (elem, free_tree, NULL);
2556 return NULL;
2557 }
2558
2559 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2560 if (BE (start > 0, 0))
2561 {
2562 tree = elem;
2563 for (i = 2; i <= start; ++i)
2564 {
2565 elem = duplicate_tree (elem, dfa);
2566 tree = create_tree (dfa, tree, elem, CONCAT);
2567 if (BE (elem == NULL || tree == NULL, 0))
2568 goto parse_dup_op_espace;
2569 }
2570
2571 if (start == end)
2572 return tree;
2573
2574 /* Duplicate ELEM before it is marked optional. */
2575 elem = duplicate_tree (elem, dfa);
2576 old_tree = tree;
2577 }
2578 else
2579 old_tree = NULL;
2580
2581 if (elem->token.type == SUBEXP)
2582 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2583
2584 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2585 if (BE (tree == NULL, 0))
2586 goto parse_dup_op_espace;
2587
2588 /* This loop is actually executed only when end != -1,
2589 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2590 already created the start+1-th copy. */
2591 for (i = start + 2; i <= end; ++i)
2592 {
2593 elem = duplicate_tree (elem, dfa);
2594 tree = create_tree (dfa, tree, elem, CONCAT);
2595 if (BE (elem == NULL || tree == NULL, 0))
2596 goto parse_dup_op_espace;
2597
2598 tree = create_tree (dfa, tree, NULL, OP_ALT);
2599 if (BE (tree == NULL, 0))
2600 goto parse_dup_op_espace;
2601 }
2602
2603 if (old_tree)
2604 tree = create_tree (dfa, old_tree, tree, CONCAT);
2605
2606 return tree;
2607
2608 parse_dup_op_espace:
2609 *err = REG_ESPACE;
2610 return NULL;
2611}
2612
2613/* Size of the names for collating symbol/equivalence_class/character_class.
2614 I'm not sure, but maybe enough. */
2615#define BRACKET_NAME_BUF_SIZE 32
2616
2617#ifndef _LIBC
2618 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2619 Build the range expression which starts from START_ELEM, and ends
2620 at END_ELEM. The result are written to MBCSET and SBCSET.
2621 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2622 mbcset->range_ends, is a pointer argument since we may
2623 update it. */
2624
2625static reg_errcode_t
2626internal_function
2627# ifdef RE_ENABLE_I18N
2628build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2629 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2630# else /* not RE_ENABLE_I18N */
2631build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2632 bracket_elem_t *end_elem)
2633# endif /* not RE_ENABLE_I18N */
2634{
2635 unsigned int start_ch, end_ch;
2636 /* Equivalence Classes and Character Classes can't be a range start/end. */
2637 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2638 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2639 0))
2640 return REG_ERANGE;
2641
2642 /* We can handle no multi character collating elements without libc
2643 support. */
2644 if (BE ((start_elem->type == COLL_SYM
2645 && strlen ((char *) start_elem->opr.name) > 1)
2646 || (end_elem->type == COLL_SYM
2647 && strlen ((char *) end_elem->opr.name) > 1), 0))
2648 return REG_ECOLLATE;
2649
2650# ifdef RE_ENABLE_I18N
2651 {
2652 wchar_t wc;
2653 wint_t start_wc;
2654 wint_t end_wc;
2655 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2656
2657 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2658 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2659 : 0));
2660 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2661 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2662 : 0));
2663#ifdef GAWK
2664 /*
2665 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2666 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2667 * unsigned, so we don't have sign extension problems.
2668 */
2669 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2670 ? start_ch : start_elem->opr.wch);
2671 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2672 ? end_ch : end_elem->opr.wch);
2673#else
2674 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675 ? __btowc (start_ch) : start_elem->opr.wch);
2676 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677 ? __btowc (end_ch) : end_elem->opr.wch);
2678#endif
2679 if (start_wc == WEOF || end_wc == WEOF)
2680 return REG_ECOLLATE;
2681 cmp_buf[0] = start_wc;
2682 cmp_buf[4] = end_wc;
2683 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2684 return REG_ERANGE;
2685
2686 /* Got valid collation sequence values, add them as a new entry.
2687 However, for !_LIBC we have no collation elements: if the
2688 character set is single byte, the single byte character set
2689 that we build below suffices. parse_bracket_exp passes
2690 no MBCSET if dfa->mb_cur_max == 1. */
2691 if (mbcset)
2692 {
2693 /* Check the space of the arrays. */
2694 if (BE (*range_alloc == mbcset->nranges, 0))
2695 {
2696 /* There is not enough space, need realloc. */
2697 wchar_t *new_array_start, *new_array_end;
2698 int new_nranges;
2699
2700 /* +1 in case of mbcset->nranges is 0. */
2701 new_nranges = 2 * mbcset->nranges + 1;
2702 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2703 are NULL if *range_alloc == 0. */
2704 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2705 new_nranges);
2706 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2707 new_nranges);
2708
2709 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2710 return REG_ESPACE;
2711
2712 mbcset->range_starts = new_array_start;
2713 mbcset->range_ends = new_array_end;
2714 *range_alloc = new_nranges;
2715 }
2716
2717 mbcset->range_starts[mbcset->nranges] = start_wc;
2718 mbcset->range_ends[mbcset->nranges++] = end_wc;
2719 }
2720
2721 /* Build the table for single byte characters. */
2722 for (wc = 0; wc < SBC_MAX; ++wc)
2723 {
2724 cmp_buf[2] = wc;
2725 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2726 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2727 bitset_set (sbcset, wc);
2728 }
2729 }
2730# else /* not RE_ENABLE_I18N */
2731 {
2732 unsigned int ch;
2733 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2734 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2735 : 0));
2736 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2737 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2738 : 0));
2739 if (start_ch > end_ch)
2740 return REG_ERANGE;
2741 /* Build the table for single byte characters. */
2742 for (ch = 0; ch < SBC_MAX; ++ch)
2743 if (start_ch <= ch && ch <= end_ch)
2744 bitset_set (sbcset, ch);
2745 }
2746# endif /* not RE_ENABLE_I18N */
2747 return REG_NOERROR;
2748}
2749#endif /* not _LIBC */
2750
2751#ifndef _LIBC
2752/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2753 Build the collating element which is represented by NAME.
2754 The result are written to MBCSET and SBCSET.
2755 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2756 pointer argument since we may update it. */
2757
2758static reg_errcode_t
2759internal_function
2760# ifdef RE_ENABLE_I18N
2761build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2762 int *coll_sym_alloc, const unsigned char *name)
2763# else /* not RE_ENABLE_I18N */
2764build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2765# endif /* not RE_ENABLE_I18N */
2766{
2767 size_t name_len = strlen ((const char *) name);
2768 if (BE (name_len != 1, 0))
2769 return REG_ECOLLATE;
2770 else
2771 {
2772 bitset_set (sbcset, name[0]);
2773 return REG_NOERROR;
2774 }
2775}
2776#endif /* not _LIBC */
2777
2778/* This function parse bracket expression like "[abc]", "[a-c]",
2779 "[[.a-a.]]" etc. */
2780
2781static bin_tree_t *
2782parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2783 reg_syntax_t syntax, reg_errcode_t *err)
2784{
2785#ifdef _LIBC
2786 const unsigned char *collseqmb;
2787 const char *collseqwc;
2788 uint32_t nrules;
2789 int32_t table_size;
2790 const int32_t *symb_table;
2791 const unsigned char *extra;
2792
2793 /* Local function for parse_bracket_exp used in _LIBC environment.
2794 Seek the collating symbol entry correspondings to NAME.
2795 Return the index of the symbol in the SYMB_TABLE. */
2796
2797 auto inline int32_t
2798 __attribute ((always_inline))
2799 seek_collating_symbol_entry (name, name_len)
2800 const unsigned char *name;
2801 size_t name_len;
2802 {
2803 int32_t hash = elem_hash ((const char *) name, name_len);
2804 int32_t elem = hash % table_size;
2805 if (symb_table[2 * elem] != 0)
2806 {
2807 int32_t second = hash % (table_size - 2) + 1;
2808
2809 do
2810 {
2811 /* First compare the hashing value. */
2812 if (symb_table[2 * elem] == hash
2813 /* Compare the length of the name. */
2814 && name_len == extra[symb_table[2 * elem + 1]]
2815 /* Compare the name. */
2816 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2817 name_len) == 0)
2818 {
2819 /* Yep, this is the entry. */
2820 break;
2821 }
2822
2823 /* Next entry. */
2824 elem += second;
2825 }
2826 while (symb_table[2 * elem] != 0);
2827 }
2828 return elem;
2829 }
2830
2831 /* Local function for parse_bracket_exp used in _LIBC environment.
2832 Look up the collation sequence value of BR_ELEM.
2833 Return the value if succeeded, UINT_MAX otherwise. */
2834
2835 auto inline unsigned int
2836 __attribute ((always_inline))
2837 lookup_collation_sequence_value (br_elem)
2838 bracket_elem_t *br_elem;
2839 {
2840 if (br_elem->type == SB_CHAR)
2841 {
2842 /*
2843 if (MB_CUR_MAX == 1)
2844 */
2845 if (nrules == 0)
2846 return collseqmb[br_elem->opr.ch];
2847 else
2848 {
2849 wint_t wc = __btowc (br_elem->opr.ch);
2850 return __collseq_table_lookup (collseqwc, wc);
2851 }
2852 }
2853 else if (br_elem->type == MB_CHAR)
2854 {
2855 if (nrules != 0)
2856 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2857 }
2858 else if (br_elem->type == COLL_SYM)
2859 {
2860 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2861 if (nrules != 0)
2862 {
2863 int32_t elem, idx;
2864 elem = seek_collating_symbol_entry (br_elem->opr.name,
2865 sym_name_len);
2866 if (symb_table[2 * elem] != 0)
2867 {
2868 /* We found the entry. */
2869 idx = symb_table[2 * elem + 1];
2870 /* Skip the name of collating element name. */
2871 idx += 1 + extra[idx];
2872 /* Skip the byte sequence of the collating element. */
2873 idx += 1 + extra[idx];
2874 /* Adjust for the alignment. */
2875 idx = (idx + 3) & ~3;
2876 /* Skip the multibyte collation sequence value. */
2877 idx += sizeof (unsigned int);
2878 /* Skip the wide char sequence of the collating element. */
2879 idx += sizeof (unsigned int) *
2880 (1 + *(unsigned int *) (extra + idx));
2881 /* Return the collation sequence value. */
2882 return *(unsigned int *) (extra + idx);
2883 }
2884 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2885 {
2886 /* No valid character. Match it as a single byte
2887 character. */
2888 return collseqmb[br_elem->opr.name[0]];
2889 }
2890 }
2891 else if (sym_name_len == 1)
2892 return collseqmb[br_elem->opr.name[0]];
2893 }
2894 return UINT_MAX;
2895 }
2896
2897 /* Local function for parse_bracket_exp used in _LIBC environment.
2898 Build the range expression which starts from START_ELEM, and ends
2899 at END_ELEM. The result are written to MBCSET and SBCSET.
2900 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2901 mbcset->range_ends, is a pointer argument since we may
2902 update it. */
2903
2904 auto inline reg_errcode_t
2905 __attribute ((always_inline))
2906 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2907 re_charset_t *mbcset;
2908 int *range_alloc;
2909 bitset_t sbcset;
2910 bracket_elem_t *start_elem, *end_elem;
2911 {
2912 unsigned int ch;
2913 uint32_t start_collseq;
2914 uint32_t end_collseq;
2915
2916 /* Equivalence Classes and Character Classes can't be a range
2917 start/end. */
2918 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2919 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2920 0))
2921 return REG_ERANGE;
2922
2923 start_collseq = lookup_collation_sequence_value (start_elem);
2924 end_collseq = lookup_collation_sequence_value (end_elem);
2925 /* Check start/end collation sequence values. */
2926 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2927 return REG_ECOLLATE;
2928 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2929 return REG_ERANGE;
2930
2931 /* Got valid collation sequence values, add them as a new entry.
2932 However, if we have no collation elements, and the character set
2933 is single byte, the single byte character set that we
2934 build below suffices. */
2935 if (nrules > 0 || dfa->mb_cur_max > 1)
2936 {
2937 /* Check the space of the arrays. */
2938 if (BE (*range_alloc == mbcset->nranges, 0))
2939 {
2940 /* There is not enough space, need realloc. */
2941 uint32_t *new_array_start;
2942 uint32_t *new_array_end;
2943 int new_nranges;
2944
2945 /* +1 in case of mbcset->nranges is 0. */
2946 new_nranges = 2 * mbcset->nranges + 1;
2947 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2948 new_nranges);
2949 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2950 new_nranges);
2951
2952 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2953 return REG_ESPACE;
2954
2955 mbcset->range_starts = new_array_start;
2956 mbcset->range_ends = new_array_end;
2957 *range_alloc = new_nranges;
2958 }
2959
2960 mbcset->range_starts[mbcset->nranges] = start_collseq;
2961 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2962 }
2963
2964 /* Build the table for single byte characters. */
2965 for (ch = 0; ch < SBC_MAX; ch++)
2966 {
2967 uint32_t ch_collseq;
2968 /*
2969 if (MB_CUR_MAX == 1)
2970 */
2971 if (nrules == 0)
2972 ch_collseq = collseqmb[ch];
2973 else
2974 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2975 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2976 bitset_set (sbcset, ch);
2977 }
2978 return REG_NOERROR;
2979 }
2980
2981 /* Local function for parse_bracket_exp used in _LIBC environment.
2982 Build the collating element which is represented by NAME.
2983 The result are written to MBCSET and SBCSET.
2984 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2985 pointer argument since we may update it. */
2986
2987 auto inline reg_errcode_t
2988 __attribute ((always_inline))
2989 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2990 re_charset_t *mbcset;
2991 int *coll_sym_alloc;
2992 bitset_t sbcset;
2993 const unsigned char *name;
2994 {
2995 int32_t elem, idx;
2996 size_t name_len = strlen ((const char *) name);
2997 if (nrules != 0)
2998 {
2999 elem = seek_collating_symbol_entry (name, name_len);
3000 if (symb_table[2 * elem] != 0)
3001 {
3002 /* We found the entry. */
3003 idx = symb_table[2 * elem + 1];
3004 /* Skip the name of collating element name. */
3005 idx += 1 + extra[idx];
3006 }
3007 else if (symb_table[2 * elem] == 0 && name_len == 1)
3008 {
3009 /* No valid character, treat it as a normal
3010 character. */
3011 bitset_set (sbcset, name[0]);
3012 return REG_NOERROR;
3013 }
3014 else
3015 return REG_ECOLLATE;
3016
3017 /* Got valid collation sequence, add it as a new entry. */
3018 /* Check the space of the arrays. */
3019 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3020 {
3021 /* Not enough, realloc it. */
3022 /* +1 in case of mbcset->ncoll_syms is 0. */
3023 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3024 /* Use realloc since mbcset->coll_syms is NULL
3025 if *alloc == 0. */
3026 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3027 new_coll_sym_alloc);
3028 if (BE (new_coll_syms == NULL, 0))
3029 return REG_ESPACE;
3030 mbcset->coll_syms = new_coll_syms;
3031 *coll_sym_alloc = new_coll_sym_alloc;
3032 }
3033 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3034 return REG_NOERROR;
3035 }
3036 else
3037 {
3038 if (BE (name_len != 1, 0))
3039 return REG_ECOLLATE;
3040 else
3041 {
3042 bitset_set (sbcset, name[0]);
3043 return REG_NOERROR;
3044 }
3045 }
3046 }
3047#endif
3048
3049 re_token_t br_token;
3050 re_bitset_ptr_t sbcset;
3051#ifdef RE_ENABLE_I18N
3052 re_charset_t *mbcset;
3053 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3054 int equiv_class_alloc = 0, char_class_alloc = 0;
3055#endif /* not RE_ENABLE_I18N */
3056 int non_match = 0;
3057 bin_tree_t *work_tree;
3058 int token_len;
3059 int first_round = 1;
3060#ifdef _LIBC
3061 collseqmb = (const unsigned char *)
3062 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3063 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3064 if (nrules)
3065 {
3066 /*
3067 if (MB_CUR_MAX > 1)
3068 */
3069 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3070 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3071 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3072 _NL_COLLATE_SYMB_TABLEMB);
3073 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3074 _NL_COLLATE_SYMB_EXTRAMB);
3075 }
3076#endif
3077 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3078#ifdef RE_ENABLE_I18N
3079 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3080#endif /* RE_ENABLE_I18N */
3081#ifdef RE_ENABLE_I18N
3082 if (BE (sbcset == NULL || mbcset == NULL, 0))
3083#else
3084 if (BE (sbcset == NULL, 0))
3085#endif /* RE_ENABLE_I18N */
3086 {
3087 *err = REG_ESPACE;
3088 return NULL;
3089 }
3090
3091 token_len = peek_token_bracket (token, regexp, syntax);
3092 if (BE (token->type == END_OF_RE, 0))
3093 {
3094 *err = REG_BADPAT;
3095 goto parse_bracket_exp_free_return;
3096 }
3097 if (token->type == OP_NON_MATCH_LIST)
3098 {
3099#ifdef RE_ENABLE_I18N
3100 mbcset->non_match = 1;
3101#endif /* not RE_ENABLE_I18N */
3102 non_match = 1;
3103 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3104 bitset_set (sbcset, '\n');
3105 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3106 token_len = peek_token_bracket (token, regexp, syntax);
3107 if (BE (token->type == END_OF_RE, 0))
3108 {
3109 *err = REG_BADPAT;
3110 goto parse_bracket_exp_free_return;
3111 }
3112 }
3113
3114 /* We treat the first ']' as a normal character. */
3115 if (token->type == OP_CLOSE_BRACKET)
3116 token->type = CHARACTER;
3117
3118 while (1)
3119 {
3120 bracket_elem_t start_elem, end_elem;
3121 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3122 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3123 reg_errcode_t ret;
3124 int token_len2 = 0, is_range_exp = 0;
3125 re_token_t token2;
3126
3127 start_elem.opr.name = start_name_buf;
3128 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3129 syntax, first_round);
3130 if (BE (ret != REG_NOERROR, 0))
3131 {
3132 *err = ret;
3133 goto parse_bracket_exp_free_return;
3134 }
3135 first_round = 0;
3136
3137 /* Get information about the next token. We need it in any case. */
3138 token_len = peek_token_bracket (token, regexp, syntax);
3139
3140 /* Do not check for ranges if we know they are not allowed. */
3141 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3142 {
3143 if (BE (token->type == END_OF_RE, 0))
3144 {
3145 *err = REG_EBRACK;
3146 goto parse_bracket_exp_free_return;
3147 }
3148 if (token->type == OP_CHARSET_RANGE)
3149 {
3150 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3151 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3152 if (BE (token2.type == END_OF_RE, 0))
3153 {
3154 *err = REG_EBRACK;
3155 goto parse_bracket_exp_free_return;
3156 }
3157 if (token2.type == OP_CLOSE_BRACKET)
3158 {
3159 /* We treat the last '-' as a normal character. */
3160 re_string_skip_bytes (regexp, -token_len);
3161 token->type = CHARACTER;
3162 }
3163 else
3164 is_range_exp = 1;
3165 }
3166 }
3167
3168 if (is_range_exp == 1)
3169 {
3170 end_elem.opr.name = end_name_buf;
3171 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3172 dfa, syntax, 1);
3173 if (BE (ret != REG_NOERROR, 0))
3174 {
3175 *err = ret;
3176 goto parse_bracket_exp_free_return;
3177 }
3178
3179 token_len = peek_token_bracket (token, regexp, syntax);
3180
3181#ifdef _LIBC
3182 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3183 &start_elem, &end_elem);
3184#else
3185# ifdef RE_ENABLE_I18N
3186 *err = build_range_exp (sbcset,
3187 dfa->mb_cur_max > 1 ? mbcset : NULL,
3188 &range_alloc, &start_elem, &end_elem);
3189# else
3190 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3191# endif
3192#endif /* RE_ENABLE_I18N */
3193 if (BE (*err != REG_NOERROR, 0))
3194 goto parse_bracket_exp_free_return;
3195 }
3196 else
3197 {
3198 switch (start_elem.type)
3199 {
3200 case SB_CHAR:
3201 bitset_set (sbcset, start_elem.opr.ch);
3202 break;
3203#ifdef RE_ENABLE_I18N
3204 case MB_CHAR:
3205 /* Check whether the array has enough space. */
3206 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3207 {
3208 wchar_t *new_mbchars;
3209 /* Not enough, realloc it. */
3210 /* +1 in case of mbcset->nmbchars is 0. */
3211 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3212 /* Use realloc since array is NULL if *alloc == 0. */
3213 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3214 mbchar_alloc);
3215 if (BE (new_mbchars == NULL, 0))
3216 goto parse_bracket_exp_espace;
3217 mbcset->mbchars = new_mbchars;
3218 }
3219 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3220 break;
3221#endif /* RE_ENABLE_I18N */
3222 case EQUIV_CLASS:
3223 *err = build_equiv_class (sbcset,
3224#ifdef RE_ENABLE_I18N
3225 mbcset, &equiv_class_alloc,
3226#endif /* RE_ENABLE_I18N */
3227 start_elem.opr.name);
3228 if (BE (*err != REG_NOERROR, 0))
3229 goto parse_bracket_exp_free_return;
3230 break;
3231 case COLL_SYM:
3232 *err = build_collating_symbol (sbcset,
3233#ifdef RE_ENABLE_I18N
3234 mbcset, &coll_sym_alloc,
3235#endif /* RE_ENABLE_I18N */
3236 start_elem.opr.name);
3237 if (BE (*err != REG_NOERROR, 0))
3238 goto parse_bracket_exp_free_return;
3239 break;
3240 case CHAR_CLASS:
3241 *err = build_charclass (regexp->trans, sbcset,
3242#ifdef RE_ENABLE_I18N
3243 mbcset, &char_class_alloc,
3244#endif /* RE_ENABLE_I18N */
3245 (const char *) start_elem.opr.name, syntax);
3246 if (BE (*err != REG_NOERROR, 0))
3247 goto parse_bracket_exp_free_return;
3248 break;
3249 default:
3250 assert (0);
3251 break;
3252 }
3253 }
3254 if (BE (token->type == END_OF_RE, 0))
3255 {
3256 *err = REG_EBRACK;
3257 goto parse_bracket_exp_free_return;
3258 }
3259 if (token->type == OP_CLOSE_BRACKET)
3260 break;
3261 }
3262
3263 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3264
3265 /* If it is non-matching list. */
3266 if (non_match)
3267 bitset_not (sbcset);
3268
3269#ifdef RE_ENABLE_I18N
3270 /* Ensure only single byte characters are set. */
3271 if (dfa->mb_cur_max > 1)
3272 bitset_mask (sbcset, dfa->sb_char);
3273
3274 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3275 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3276 || mbcset->non_match)))
3277 {
3278 bin_tree_t *mbc_tree;
3279 int sbc_idx;
3280 /* Build a tree for complex bracket. */
3281 dfa->has_mb_node = 1;
3282 br_token.type = COMPLEX_BRACKET;
3283 br_token.opr.mbcset = mbcset;
3284 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3285 if (BE (mbc_tree == NULL, 0))
3286 goto parse_bracket_exp_espace;
3287 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3288 if (sbcset[sbc_idx])
3289 break;
3290 /* If there are no bits set in sbcset, there is no point
3291 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3292 if (sbc_idx < BITSET_WORDS)
3293 {
3294 /* Build a tree for simple bracket. */
3295 br_token.type = SIMPLE_BRACKET;
3296 br_token.opr.sbcset = sbcset;
3297 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3298 if (BE (work_tree == NULL, 0))
3299 goto parse_bracket_exp_espace;
3300
3301 /* Then join them by ALT node. */
3302 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3303 if (BE (work_tree == NULL, 0))
3304 goto parse_bracket_exp_espace;
3305 }
3306 else
3307 {
3308 re_free (sbcset);
3309 work_tree = mbc_tree;
3310 }
3311 }
3312 else
3313#endif /* not RE_ENABLE_I18N */
3314 {
3315#ifdef RE_ENABLE_I18N
3316 free_charset (mbcset);
3317#endif
3318 /* Build a tree for simple bracket. */
3319 br_token.type = SIMPLE_BRACKET;
3320 br_token.opr.sbcset = sbcset;
3321 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322 if (BE (work_tree == NULL, 0))
3323 goto parse_bracket_exp_espace;
3324 }
3325 return work_tree;
3326
3327 parse_bracket_exp_espace:
3328 *err = REG_ESPACE;
3329 parse_bracket_exp_free_return:
3330 re_free (sbcset);
3331#ifdef RE_ENABLE_I18N
3332 free_charset (mbcset);
3333#endif /* RE_ENABLE_I18N */
3334 return NULL;
3335}
3336
3337/* Parse an element in the bracket expression. */
3338
3339static reg_errcode_t
3340parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3341 re_token_t *token, int token_len, re_dfa_t *dfa,
3342 reg_syntax_t syntax, int accept_hyphen)
3343{
3344#ifdef RE_ENABLE_I18N
3345 int cur_char_size;
3346 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3347 if (cur_char_size > 1)
3348 {
3349 elem->type = MB_CHAR;
3350 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3351 re_string_skip_bytes (regexp, cur_char_size);
3352 return REG_NOERROR;
3353 }
3354#endif /* RE_ENABLE_I18N */
3355 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3356 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3357 || token->type == OP_OPEN_EQUIV_CLASS)
3358 return parse_bracket_symbol (elem, regexp, token);
3359 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3360 {
3361 /* A '-' must only appear as anything but a range indicator before
3362 the closing bracket. Everything else is an error. */
3363 re_token_t token2;
3364 (void) peek_token_bracket (&token2, regexp, syntax);
3365 if (token2.type != OP_CLOSE_BRACKET)
3366 /* The actual error value is not standardized since this whole
3367 case is undefined. But ERANGE makes good sense. */
3368 return REG_ERANGE;
3369 }
3370 elem->type = SB_CHAR;
3371 elem->opr.ch = token->opr.c;
3372 return REG_NOERROR;
3373}
3374
3375/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3376 such as [:<character_class>:], [.<collating_element>.], and
3377 [=<equivalent_class>=]. */
3378
3379static reg_errcode_t
3380parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3381 re_token_t *token)
3382{
3383 unsigned char ch, delim = token->opr.c;
3384 int i = 0;
3385 if (re_string_eoi(regexp))
3386 return REG_EBRACK;
3387 for (;; ++i)
3388 {
3389 if (i >= BRACKET_NAME_BUF_SIZE)
3390 return REG_EBRACK;
3391 if (token->type == OP_OPEN_CHAR_CLASS)
3392 ch = re_string_fetch_byte_case (regexp);
3393 else
3394 ch = re_string_fetch_byte (regexp);
3395 if (re_string_eoi(regexp))
3396 return REG_EBRACK;
3397 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3398 break;
3399 elem->opr.name[i] = ch;
3400 }
3401 re_string_skip_bytes (regexp, 1);
3402 elem->opr.name[i] = '\0';
3403 switch (token->type)
3404 {
3405 case OP_OPEN_COLL_ELEM:
3406 elem->type = COLL_SYM;
3407 break;
3408 case OP_OPEN_EQUIV_CLASS:
3409 elem->type = EQUIV_CLASS;
3410 break;
3411 case OP_OPEN_CHAR_CLASS:
3412 elem->type = CHAR_CLASS;
3413 break;
3414 default:
3415 break;
3416 }
3417 return REG_NOERROR;
3418}
3419
3420 /* Helper function for parse_bracket_exp.
3421 Build the equivalence class which is represented by NAME.
3422 The result are written to MBCSET and SBCSET.
3423 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3424 is a pointer argument since we may update it. */
3425
3426static reg_errcode_t
3427#ifdef RE_ENABLE_I18N
3428build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3429 int *equiv_class_alloc, const unsigned char *name)
3430#else /* not RE_ENABLE_I18N */
3431build_equiv_class (bitset_t sbcset, const unsigned char *name)
3432#endif /* not RE_ENABLE_I18N */
3433{
3434#ifdef _LIBC
3435 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3436 if (nrules != 0)
3437 {
3438 const int32_t *table, *indirect;
3439 const unsigned char *weights, *extra, *cp;
3440 unsigned char char_buf[2];
3441 int32_t idx1, idx2;
3442 unsigned int ch;
3443 size_t len;
3444 /* This #include defines a local function! */
3445# include <locale/weight.h>
3446 /* Calculate the index for equivalence class. */
3447 cp = name;
3448 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3449 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450 _NL_COLLATE_WEIGHTMB);
3451 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3452 _NL_COLLATE_EXTRAMB);
3453 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3454 _NL_COLLATE_INDIRECTMB);
3455 idx1 = findidx (&cp);
3456 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3457 /* This isn't a valid character. */
3458 return REG_ECOLLATE;
3459
3460 /* Build single byte matcing table for this equivalence class. */
3461 char_buf[1] = (unsigned char) '\0';
3462 len = weights[idx1 & 0xffffff];
3463 for (ch = 0; ch < SBC_MAX; ++ch)
3464 {
3465 char_buf[0] = ch;
3466 cp = char_buf;
3467 idx2 = findidx (&cp);
3468/*
3469 idx2 = table[ch];
3470*/
3471 if (idx2 == 0)
3472 /* This isn't a valid character. */
3473 continue;
3474 /* Compare only if the length matches and the collation rule
3475 index is the same. */
3476 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3477 {
3478 int cnt = 0;
3479
3480 while (cnt <= len &&
3481 weights[(idx1 & 0xffffff) + 1 + cnt]
3482 == weights[(idx2 & 0xffffff) + 1 + cnt])
3483 ++cnt;
3484
3485 if (cnt > len)
3486 bitset_set (sbcset, ch);
3487 }
3488 }
3489 /* Check whether the array has enough space. */
3490 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3491 {
3492 /* Not enough, realloc it. */
3493 /* +1 in case of mbcset->nequiv_classes is 0. */
3494 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3495 /* Use realloc since the array is NULL if *alloc == 0. */
3496 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3497 int32_t,
3498 new_equiv_class_alloc);
3499 if (BE (new_equiv_classes == NULL, 0))
3500 return REG_ESPACE;
3501 mbcset->equiv_classes = new_equiv_classes;
3502 *equiv_class_alloc = new_equiv_class_alloc;
3503 }
3504 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3505 }
3506 else
3507#endif /* _LIBC */
3508 {
3509 if (BE (strlen ((const char *) name) != 1, 0))
3510 return REG_ECOLLATE;
3511 bitset_set (sbcset, *name);
3512 }
3513 return REG_NOERROR;
3514}
3515
3516 /* Helper function for parse_bracket_exp.
3517 Build the character class which is represented by NAME.
3518 The result are written to MBCSET and SBCSET.
3519 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520 is a pointer argument since we may update it. */
3521
3522static reg_errcode_t
3523#ifdef RE_ENABLE_I18N
3524build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3525 re_charset_t *mbcset, int *char_class_alloc,
3526 const char *class_name, reg_syntax_t syntax)
3527#else /* not RE_ENABLE_I18N */
3528build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3529 const char *class_name, reg_syntax_t syntax)
3530#endif /* not RE_ENABLE_I18N */
3531{
3532 int i;
3533
3534 /* In case of REG_ICASE "upper" and "lower" match the both of
3535 upper and lower cases. */
3536 if ((syntax & RE_ICASE)
3537 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3538 class_name = "alpha";
3539
3540#ifdef RE_ENABLE_I18N
3541 /* Check the space of the arrays. */
3542 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3543 {
3544 /* Not enough, realloc it. */
3545 /* +1 in case of mbcset->nchar_classes is 0. */
3546 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3547 /* Use realloc since array is NULL if *alloc == 0. */
3548 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3549 new_char_class_alloc);
3550 if (BE (new_char_classes == NULL, 0))
3551 return REG_ESPACE;
3552 mbcset->char_classes = new_char_classes;
3553 *char_class_alloc = new_char_class_alloc;
3554 }
3555 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3556#endif /* RE_ENABLE_I18N */
3557
3558#define BUILD_CHARCLASS_LOOP(ctype_func) \
3559 do { \
3560 if (BE (trans != NULL, 0)) \
3561 { \
3562 for (i = 0; i < SBC_MAX; ++i) \
3563 if (ctype_func (i)) \
3564 bitset_set (sbcset, trans[i]); \
3565 } \
3566 else \
3567 { \
3568 for (i = 0; i < SBC_MAX; ++i) \
3569 if (ctype_func (i)) \
3570 bitset_set (sbcset, i); \
3571 } \
3572 } while (0)
3573
3574 if (strcmp (class_name, "alnum") == 0)
3575 BUILD_CHARCLASS_LOOP (isalnum);
3576 else if (strcmp (class_name, "cntrl") == 0)
3577 BUILD_CHARCLASS_LOOP (iscntrl);
3578 else if (strcmp (class_name, "lower") == 0)
3579 BUILD_CHARCLASS_LOOP (islower);
3580 else if (strcmp (class_name, "space") == 0)
3581 BUILD_CHARCLASS_LOOP (isspace);
3582 else if (strcmp (class_name, "alpha") == 0)
3583 BUILD_CHARCLASS_LOOP (isalpha);
3584 else if (strcmp (class_name, "digit") == 0)
3585 BUILD_CHARCLASS_LOOP (isdigit);
3586 else if (strcmp (class_name, "print") == 0)
3587 BUILD_CHARCLASS_LOOP (isprint);
3588 else if (strcmp (class_name, "upper") == 0)
3589 BUILD_CHARCLASS_LOOP (isupper);
3590 else if (strcmp (class_name, "blank") == 0)
3591#ifndef GAWK
3592 BUILD_CHARCLASS_LOOP (isblank);
3593#else
3594 /* see comments above */
3595 BUILD_CHARCLASS_LOOP (is_blank);
3596#endif
3597 else if (strcmp (class_name, "graph") == 0)
3598 BUILD_CHARCLASS_LOOP (isgraph);
3599 else if (strcmp (class_name, "punct") == 0)
3600 BUILD_CHARCLASS_LOOP (ispunct);
3601 else if (strcmp (class_name, "xdigit") == 0)
3602 BUILD_CHARCLASS_LOOP (isxdigit);
3603 else
3604 return REG_ECTYPE;
3605
3606 return REG_NOERROR;
3607}
3608
3609static bin_tree_t *
3610build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3611 const char *class_name,
3612 const char *extra, int non_match,
3613 reg_errcode_t *err)
3614{
3615 re_bitset_ptr_t sbcset;
3616#ifdef RE_ENABLE_I18N
3617 re_charset_t *mbcset;
3618 int alloc = 0;
3619#endif /* not RE_ENABLE_I18N */
3620 reg_errcode_t ret;
3621 re_token_t br_token;
3622 bin_tree_t *tree;
3623
3624 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3625#ifdef RE_ENABLE_I18N
3626 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3627#endif /* RE_ENABLE_I18N */
3628
3629#ifdef RE_ENABLE_I18N
3630 if (BE (sbcset == NULL || mbcset == NULL, 0))
3631#else /* not RE_ENABLE_I18N */
3632 if (BE (sbcset == NULL, 0))
3633#endif /* not RE_ENABLE_I18N */
3634 {
3635 *err = REG_ESPACE;
3636 return NULL;
3637 }
3638
3639 if (non_match)
3640 {
3641#ifdef RE_ENABLE_I18N
3642 mbcset->non_match = 1;
3643#endif /* not RE_ENABLE_I18N */
3644 }
3645
3646 /* We don't care the syntax in this case. */
3647 ret = build_charclass (trans, sbcset,
3648#ifdef RE_ENABLE_I18N
3649 mbcset, &alloc,
3650#endif /* RE_ENABLE_I18N */
3651 class_name, 0);
3652
3653 if (BE (ret != REG_NOERROR, 0))
3654 {
3655 re_free (sbcset);
3656#ifdef RE_ENABLE_I18N
3657 free_charset (mbcset);
3658#endif /* RE_ENABLE_I18N */
3659 *err = ret;
3660 return NULL;
3661 }
3662 /* \w match '_' also. */
3663 for (; *extra; extra++)
3664 bitset_set (sbcset, *extra);
3665
3666 /* If it is non-matching list. */
3667 if (non_match)
3668 bitset_not (sbcset);
3669
3670#ifdef RE_ENABLE_I18N
3671 /* Ensure only single byte characters are set. */
3672 if (dfa->mb_cur_max > 1)
3673 bitset_mask (sbcset, dfa->sb_char);
3674#endif
3675
3676 /* Build a tree for simple bracket. */
3677 br_token.type = SIMPLE_BRACKET;
3678 br_token.opr.sbcset = sbcset;
3679 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680 if (BE (tree == NULL, 0))
3681 goto build_word_op_espace;
3682
3683#ifdef RE_ENABLE_I18N
3684 if (dfa->mb_cur_max > 1)
3685 {
3686 bin_tree_t *mbc_tree;
3687 /* Build a tree for complex bracket. */
3688 br_token.type = COMPLEX_BRACKET;
3689 br_token.opr.mbcset = mbcset;
3690 dfa->has_mb_node = 1;
3691 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3692 if (BE (mbc_tree == NULL, 0))
3693 goto build_word_op_espace;
3694 /* Then join them by ALT node. */
3695 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3696 if (BE (mbc_tree != NULL, 1))
3697 return tree;
3698 }
3699 else
3700 {
3701 free_charset (mbcset);
3702 return tree;
3703 }
3704#else /* not RE_ENABLE_I18N */
3705 return tree;
3706#endif /* not RE_ENABLE_I18N */
3707
3708 build_word_op_espace:
3709 re_free (sbcset);
3710#ifdef RE_ENABLE_I18N
3711 free_charset (mbcset);
3712#endif /* RE_ENABLE_I18N */
3713 *err = REG_ESPACE;
3714 return NULL;
3715}
3716
3717/* This is intended for the expressions like "a{1,3}".
3718 Fetch a number from `input', and return the number.
3719 Return -1, if the number field is empty like "{,1}".
3720 Return -2, if an error has occurred. */
3721
3722static int
3723fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3724{
3725 int num = -1;
3726 unsigned char c;
3727 while (1)
3728 {
3729 fetch_token (token, input, syntax);
3730 c = token->opr.c;
3731 if (BE (token->type == END_OF_RE, 0))
3732 return -2;
3733 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3734 break;
3735 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3736 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3737 num = (num > RE_DUP_MAX) ? -2 : num;
3738 }
3739 return num;
3740}
3741\f
3742#ifdef RE_ENABLE_I18N
3743static void
3744free_charset (re_charset_t *cset)
3745{
3746 re_free (cset->mbchars);
3747# ifdef _LIBC
3748 re_free (cset->coll_syms);
3749 re_free (cset->equiv_classes);
3750 re_free (cset->range_starts);
3751 re_free (cset->range_ends);
3752# endif
3753 re_free (cset->char_classes);
3754 re_free (cset);
3755}
3756#endif /* RE_ENABLE_I18N */
3757\f
3758/* Functions for binary tree operation. */
3759
3760/* Create a tree node. */
3761
3762static bin_tree_t *
3763create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764 re_token_type_t type)
3765{
3766 re_token_t t;
3767 t.type = type;
3768 return create_token_tree (dfa, left, right, &t);
3769}
3770
3771static bin_tree_t *
3772create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3773 const re_token_t *token)
3774{
3775 bin_tree_t *tree;
3776 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3777 {
3778 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3779
3780 if (storage == NULL)
3781 return NULL;
3782 storage->next = dfa->str_tree_storage;
3783 dfa->str_tree_storage = storage;
3784 dfa->str_tree_storage_idx = 0;
3785 }
3786 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3787
3788 tree->parent = NULL;
3789 tree->left = left;
3790 tree->right = right;
3791 tree->token = *token;
3792 tree->token.duplicated = 0;
3793 tree->token.opt_subexp = 0;
3794 tree->first = NULL;
3795 tree->next = NULL;
3796 tree->node_idx = -1;
3797
3798 if (left != NULL)
3799 left->parent = tree;
3800 if (right != NULL)
3801 right->parent = tree;
3802 return tree;
3803}
3804
3805/* Mark the tree SRC as an optional subexpression.
3806 To be called from preorder or postorder. */
3807
3808static reg_errcode_t
3809mark_opt_subexp (void *extra, bin_tree_t *node)
3810{
3811 int idx = (int) (intptr_t) extra;
3812 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3813 node->token.opt_subexp = 1;
3814
3815 return REG_NOERROR;
3816}
3817
3818/* Free the allocated memory inside NODE. */
3819
3820static void
3821free_token (re_token_t *node)
3822{
3823#ifdef RE_ENABLE_I18N
3824 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3825 free_charset (node->opr.mbcset);
3826 else
3827#endif /* RE_ENABLE_I18N */
3828 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3829 re_free (node->opr.sbcset);
3830}
3831
3832/* Worker function for tree walking. Free the allocated memory inside NODE
3833 and its children. */
3834
3835static reg_errcode_t
3836free_tree (void *extra, bin_tree_t *node)
3837{
3838 free_token (&node->token);
3839 return REG_NOERROR;
3840}
3841
3842
3843/* Duplicate the node SRC, and return new node. This is a preorder
3844 visit similar to the one implemented by the generic visitor, but
3845 we need more infrastructure to maintain two parallel trees --- so,
3846 it's easier to duplicate. */
3847
3848static bin_tree_t *
3849duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3850{
3851 const bin_tree_t *node;
3852 bin_tree_t *dup_root;
3853 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3854
3855 for (node = root; ; )
3856 {
3857 /* Create a new tree and link it back to the current parent. */
3858 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3859 if (*p_new == NULL)
3860 return NULL;
3861 (*p_new)->parent = dup_node;
3862 (*p_new)->token.duplicated = 1;
3863 dup_node = *p_new;
3864
3865 /* Go to the left node, or up and to the right. */
3866 if (node->left)
3867 {
3868 node = node->left;
3869 p_new = &dup_node->left;
3870 }
3871 else
3872 {
3873 const bin_tree_t *prev = NULL;
3874 while (node->right == prev || node->right == NULL)
3875 {
3876 prev = node;
3877 node = node->parent;
3878 dup_node = dup_node->parent;
3879 if (!node)
3880 return dup_root;
3881 }
3882 node = node->right;
3883 p_new = &dup_node->right;
3884 }
3885 }
3886}