compat / regex / regcomp.con commit Adjust for 2.19.x series (18ad13e)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, see
  18   <http://www.gnu.org/licenses/>.  */
  19
  20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                          size_t length, reg_syntax_t syntax);
  22static void re_compile_fastmap_iter (regex_t *bufp,
  23                                     const re_dfastate_t *init_state,
  24                                     char *fastmap);
  25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26#ifdef RE_ENABLE_I18N
  27static void free_charset (re_charset_t *cset);
  28#endif /* RE_ENABLE_I18N */
  29static void free_workarea_compile (regex_t *preg);
  30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31#ifdef RE_ENABLE_I18N
  32static void optimize_utf8 (re_dfa_t *dfa);
  33#endif
  34static reg_errcode_t analyze (regex_t *preg);
  35static reg_errcode_t preorder (bin_tree_t *root,
  36                               reg_errcode_t (fn (void *, bin_tree_t *)),
  37                               void *extra);
  38static reg_errcode_t postorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                 bin_tree_t *node);
  45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  50                                   unsigned int constraint);
  51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                         int node, int root);
  54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55static int fetch_number (re_string_t *input, re_token_t *token,
  56                         reg_syntax_t syntax);
  57static int peek_token (re_token_t *token, re_string_t *input,
  58                        reg_syntax_t syntax) internal_function;
  59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                          reg_syntax_t syntax, reg_errcode_t *err);
  61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                  re_token_t *token, reg_syntax_t syntax,
  63                                  int nest, reg_errcode_t *err);
  64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                 re_token_t *token, reg_syntax_t syntax,
  66                                 int nest, reg_errcode_t *err);
  67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                     re_token_t *token, reg_syntax_t syntax,
  69                                     int nest, reg_errcode_t *err);
  70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                  re_token_t *token, reg_syntax_t syntax,
  72                                  int nest, reg_errcode_t *err);
  73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                 re_dfa_t *dfa, re_token_t *token,
  75                                 reg_syntax_t syntax, reg_errcode_t *err);
  76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                      re_token_t *token, reg_syntax_t syntax,
  78                                      reg_errcode_t *err);
  79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                            re_string_t *regexp,
  81                                            re_token_t *token, int token_len,
  82                                            re_dfa_t *dfa,
  83                                            reg_syntax_t syntax,
  84                                            int accept_hyphen);
  85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                          re_string_t *regexp,
  87                                          re_token_t *token);
  88#ifdef RE_ENABLE_I18N
  89static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                        re_charset_t *mbcset,
  91                                        int *equiv_class_alloc,
  92                                        const unsigned char *name);
  93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                      bitset_t sbcset,
  95                                      re_charset_t *mbcset,
  96                                      int *char_class_alloc,
  97                                      const char *class_name,
  98                                      reg_syntax_t syntax);
  99#else  /* not RE_ENABLE_I18N */
 100static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                        const unsigned char *name);
 102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                      bitset_t sbcset,
 104                                      const char *class_name,
 105                                      reg_syntax_t syntax);
 106#endif /* not RE_ENABLE_I18N */
 107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                       RE_TRANSLATE_TYPE trans,
 109                                       const char *class_name,
 110                                       const char *extra,
 111                                       int non_match, reg_errcode_t *err);
 112static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                bin_tree_t *left, bin_tree_t *right,
 114                                re_token_type_t type);
 115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                      bin_tree_t *left, bin_tree_t *right,
 117                                      const re_token_t *token);
 118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119static void free_token (re_token_t *node);
 120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122\f
 123/* This table gives an error message for each of the error codes listed
 124   in regex.h.  Obviously the order here has to be same as there.
 125   POSIX doesn't require that we do anything for REG_NOERROR,
 126   but why not be nice?  */
 127
 128const char __re_error_msgid[] attribute_hidden =
 129  {
 130#define REG_NOERROR_IDX 0
 131    gettext_noop ("Success")    /* REG_NOERROR */
 132    "\0"
 133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134    gettext_noop ("No match")   /* REG_NOMATCH */
 135    "\0"
 136#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138    "\0"
 139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141    "\0"
 142#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144    "\0"
 145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147    "\0"
 148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150    "\0"
 151#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152    gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153    "\0"
 154#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156    "\0"
 157#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159    "\0"
 160#define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162    "\0"
 163#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164    gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165    "\0"
 166#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168    "\0"
 169#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171    "\0"
 172#define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174    "\0"
 175#define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177    "\0"
 178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180  };
 181
 182const size_t __re_error_msgid_idx[] attribute_hidden =
 183  {
 184    REG_NOERROR_IDX,
 185    REG_NOMATCH_IDX,
 186    REG_BADPAT_IDX,
 187    REG_ECOLLATE_IDX,
 188    REG_ECTYPE_IDX,
 189    REG_EESCAPE_IDX,
 190    REG_ESUBREG_IDX,
 191    REG_EBRACK_IDX,
 192    REG_EPAREN_IDX,
 193    REG_EBRACE_IDX,
 194    REG_BADBR_IDX,
 195    REG_ERANGE_IDX,
 196    REG_ESPACE_IDX,
 197    REG_BADRPT_IDX,
 198    REG_EEND_IDX,
 199    REG_ESIZE_IDX,
 200    REG_ERPAREN_IDX
 201  };
 202\f
 203/* Entry points for GNU code.  */
 204
 205
 206#ifdef ZOS_USS
 207
 208/* For ZOS USS we must define btowc */
 209
 210wchar_t 
 211btowc (int c)
 212{
 213   wchar_t wtmp[2];
 214   char tmp[2];
 215
 216   tmp[0] = c;
 217   tmp[1] = 0;
 218
 219   mbtowc (wtmp, tmp, 1);
 220   return wtmp[0];
 221}
 222#endif
 223
 224/* re_compile_pattern is the GNU regular expression compiler: it
 225   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 226   Returns 0 if the pattern was valid, otherwise an error string.
 227
 228   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 229   are set in BUFP on entry.  */
 230
 231const char *
 232re_compile_pattern (const char *pattern,
 233                    size_t length,
 234                    struct re_pattern_buffer *bufp)
 235{
 236  reg_errcode_t ret;
 237
 238  /* And GNU code determines whether or not to get register information
 239     by passing null for the REGS argument to re_match, etc., not by
 240     setting no_sub, unless RE_NO_SUB is set.  */
 241  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 242
 243  /* Match anchors at newline.  */
 244  bufp->newline_anchor = 1;
 245
 246  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 247
 248  if (!ret)
 249    return NULL;
 250  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 251}
 252#ifdef _LIBC
 253weak_alias (__re_compile_pattern, re_compile_pattern)
 254#endif
 255
 256/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 257   also be assigned to arbitrarily: each pattern buffer stores its own
 258   syntax, so it can be changed between regex compilations.  */
 259/* This has no initializer because initialized variables in Emacs
 260   become read-only after dumping.  */
 261reg_syntax_t re_syntax_options;
 262
 263
 264/* Specify the precise syntax of regexps for compilation.  This provides
 265   for compatibility for various utilities which historically have
 266   different, incompatible syntaxes.
 267
 268   The argument SYNTAX is a bit mask comprised of the various bits
 269   defined in regex.h.  We return the old syntax.  */
 270
 271reg_syntax_t
 272re_set_syntax (reg_syntax_t syntax)
 273{
 274  reg_syntax_t ret = re_syntax_options;
 275
 276  re_syntax_options = syntax;
 277  return ret;
 278}
 279#ifdef _LIBC
 280weak_alias (__re_set_syntax, re_set_syntax)
 281#endif
 282
 283int
 284re_compile_fastmap (struct re_pattern_buffer *bufp)
 285{
 286  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 287  char *fastmap = bufp->fastmap;
 288
 289  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 290  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 291  if (dfa->init_state != dfa->init_state_word)
 292    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 293  if (dfa->init_state != dfa->init_state_nl)
 294    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 295  if (dfa->init_state != dfa->init_state_begbuf)
 296    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 297  bufp->fastmap_accurate = 1;
 298  return 0;
 299}
 300#ifdef _LIBC
 301weak_alias (__re_compile_fastmap, re_compile_fastmap)
 302#endif
 303
 304static inline void
 305__attribute ((always_inline))
 306re_set_fastmap (char *fastmap, int icase, int ch)
 307{
 308  fastmap[ch] = 1;
 309  if (icase)
 310    fastmap[tolower (ch)] = 1;
 311}
 312
 313/* Helper function for re_compile_fastmap.
 314   Compile fastmap for the initial_state INIT_STATE.  */
 315
 316static void
 317re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 318                         char *fastmap)
 319{
 320  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 321  int node_cnt;
 322  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 323  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 324    {
 325      int node = init_state->nodes.elems[node_cnt];
 326      re_token_type_t type = dfa->nodes[node].type;
 327
 328      if (type == CHARACTER)
 329        {
 330          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 331#ifdef RE_ENABLE_I18N
 332          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 333            {
 334              unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 335              wchar_t wc;
 336              mbstate_t state;
 337
 338              p = buf;
 339              *p++ = dfa->nodes[node].opr.c;
 340              while (++node < dfa->nodes_len
 341                     && dfa->nodes[node].type == CHARACTER
 342                     && dfa->nodes[node].mb_partial)
 343                *p++ = dfa->nodes[node].opr.c;
 344              memset (&state, '\0', sizeof (state));
 345              if (__mbrtowc (&wc, (const char *) buf, p - buf,
 346                             &state) == p - buf
 347                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 348                      != (size_t) -1))
 349                re_set_fastmap (fastmap, 0, buf[0]);
 350              re_free (buf);
 351            }
 352#endif
 353        }
 354      else if (type == SIMPLE_BRACKET)
 355        {
 356          int i, ch;
 357          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 358            {
 359              int j;
 360              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 361              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 362                if (w & ((bitset_word_t) 1 << j))
 363                  re_set_fastmap (fastmap, icase, ch);
 364            }
 365        }
 366#ifdef RE_ENABLE_I18N
 367      else if (type == COMPLEX_BRACKET)
 368        {
 369          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 370          int i;
 371
 372# ifdef _LIBC
 373          /* See if we have to try all bytes which start multiple collation
 374             elements.
 375             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 376                  collation element, and don't catch 'b' since 'b' is
 377                  the only collation element which starts from 'b' (and
 378                  it is caught by SIMPLE_BRACKET).  */
 379              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 380                  && (cset->ncoll_syms || cset->nranges))
 381                {
 382                  const int32_t *table = (const int32_t *)
 383                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 384                  for (i = 0; i < SBC_MAX; ++i)
 385                    if (table[i] < 0)
 386                      re_set_fastmap (fastmap, icase, i);
 387                }
 388# endif /* _LIBC */
 389
 390          /* See if we have to start the match at all multibyte characters,
 391             i.e. where we would not find an invalid sequence.  This only
 392             applies to multibyte character sets; for single byte character
 393             sets, the SIMPLE_BRACKET again suffices.  */
 394          if (dfa->mb_cur_max > 1
 395              && (cset->nchar_classes || cset->non_match || cset->nranges
 396# ifdef _LIBC
 397                  || cset->nequiv_classes
 398# endif /* _LIBC */
 399                 ))
 400            {
 401              unsigned char c = 0;
 402              do
 403                {
 404                  mbstate_t mbs;
 405                  memset (&mbs, 0, sizeof (mbs));
 406                  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 407                    re_set_fastmap (fastmap, false, (int) c);
 408                }
 409              while (++c != 0);
 410            }
 411
 412          else
 413            {
 414              /* ... Else catch all bytes which can start the mbchars.  */
 415              for (i = 0; i < cset->nmbchars; ++i)
 416                {
 417                  char buf[256];
 418                  mbstate_t state;
 419                  memset (&state, '\0', sizeof (state));
 420                  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 421                    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 422                  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 423                    {
 424                      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 425                          != (size_t) -1)
 426                        re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 427                    }
 428                }
 429            }
 430        }
 431#endif /* RE_ENABLE_I18N */
 432      else if (type == OP_PERIOD
 433#ifdef RE_ENABLE_I18N
 434               || type == OP_UTF8_PERIOD
 435#endif /* RE_ENABLE_I18N */
 436               || type == END_OF_RE)
 437        {
 438          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 439          if (type == END_OF_RE)
 440            bufp->can_be_null = 1;
 441          return;
 442        }
 443    }
 444}
 445\f
 446/* Entry point for POSIX code.  */
 447/* regcomp takes a regular expression as a string and compiles it.
 448
 449   PREG is a regex_t *.  We do not expect any fields to be initialized,
 450   since POSIX says we shouldn't.  Thus, we set
 451
 452     `buffer' to the compiled pattern;
 453     `used' to the length of the compiled pattern;
 454     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 455       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 456       RE_SYNTAX_POSIX_BASIC;
 457     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 458     `fastmap' to an allocated space for the fastmap;
 459     `fastmap_accurate' to zero;
 460     `re_nsub' to the number of subexpressions in PATTERN.
 461
 462   PATTERN is the address of the pattern string.
 463
 464   CFLAGS is a series of bits which affect compilation.
 465
 466     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 467     use POSIX basic syntax.
 468
 469     If REG_NEWLINE is set, then . and [^...] don't match newline.
 470     Also, regexec will try a match beginning after every newline.
 471
 472     If REG_ICASE is set, then we considers upper- and lowercase
 473     versions of letters to be equivalent when matching.
 474
 475     If REG_NOSUB is set, then when PREG is passed to regexec, that
 476     routine will report only success or failure, and nothing about the
 477     registers.
 478
 479   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 480   the return codes and their meanings.)  */
 481
 482int
 483regcomp (regex_t *__restrict preg,
 484         const char *__restrict pattern,
 485         int cflags)
 486{
 487  reg_errcode_t ret;
 488  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 489                         : RE_SYNTAX_POSIX_BASIC);
 490
 491  preg->buffer = NULL;
 492  preg->allocated = 0;
 493  preg->used = 0;
 494
 495  /* Try to allocate space for the fastmap.  */
 496  preg->fastmap = re_malloc (char, SBC_MAX);
 497  if (BE (preg->fastmap == NULL, 0))
 498    return REG_ESPACE;
 499
 500  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 501
 502  /* If REG_NEWLINE is set, newlines are treated differently.  */
 503  if (cflags & REG_NEWLINE)
 504    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 505      syntax &= ~RE_DOT_NEWLINE;
 506      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 507      /* It also changes the matching behavior.  */
 508      preg->newline_anchor = 1;
 509    }
 510  else
 511    preg->newline_anchor = 0;
 512  preg->no_sub = !!(cflags & REG_NOSUB);
 513  preg->translate = NULL;
 514
 515  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 516
 517  /* POSIX doesn't distinguish between an unmatched open-group and an
 518     unmatched close-group: both are REG_EPAREN.  */
 519  if (ret == REG_ERPAREN)
 520    ret = REG_EPAREN;
 521
 522  /* We have already checked preg->fastmap != NULL.  */
 523  if (BE (ret == REG_NOERROR, 1))
 524    /* Compute the fastmap now, since regexec cannot modify the pattern
 525       buffer.  This function never fails in this implementation.  */
 526    (void) re_compile_fastmap (preg);
 527  else
 528    {
 529      /* Some error occurred while compiling the expression.  */
 530      re_free (preg->fastmap);
 531      preg->fastmap = NULL;
 532    }
 533
 534  return (int) ret;
 535}
 536#ifdef _LIBC
 537weak_alias (__regcomp, regcomp)
 538#endif
 539
 540/* Returns a message corresponding to an error code, ERRCODE, returned
 541   from either regcomp or regexec.   We don't use PREG here.  */
 542
 543size_t
 544regerror(int errcode, const regex_t *__restrict preg,
 545         char *__restrict errbuf, size_t errbuf_size)
 546{
 547  const char *msg;
 548  size_t msg_size;
 549
 550  if (BE (errcode < 0
 551          || errcode >= (int) (sizeof (__re_error_msgid_idx)
 552                               / sizeof (__re_error_msgid_idx[0])), 0))
 553    /* Only error codes returned by the rest of the code should be passed
 554       to this routine.  If we are given anything else, or if other regex
 555       code generates an invalid error code, then the program has a bug.
 556       Dump core so we can fix it.  */
 557    abort ();
 558
 559  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 560
 561  msg_size = strlen (msg) + 1; /* Includes the null.  */
 562
 563  if (BE (errbuf_size != 0, 1))
 564    {
 565      if (BE (msg_size > errbuf_size, 0))
 566        {
 567          memcpy (errbuf, msg, errbuf_size - 1);
 568          errbuf[errbuf_size - 1] = 0;
 569        }
 570      else
 571        memcpy (errbuf, msg, msg_size);
 572    }
 573
 574  return msg_size;
 575}
 576#ifdef _LIBC
 577weak_alias (__regerror, regerror)
 578#endif
 579
 580
 581#ifdef RE_ENABLE_I18N
 582/* This static array is used for the map to single-byte characters when
 583   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584   it the same all the time.  UTF-8 is the preferred encoding so this is
 585   a worthwhile optimization.  */
 586#if __GNUC__ >= 3
 587static const bitset_t utf8_sb_map = {
 588  /* Set the first 128 bits.  */
 589  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 590};
 591#else /* ! (__GNUC__ >= 3) */
 592static bitset_t utf8_sb_map;
 593#endif /* __GNUC__ >= 3 */
 594#endif /* RE_ENABLE_I18N */
 595
 596
 597static void
 598free_dfa_content (re_dfa_t *dfa)
 599{
 600  int i, j;
 601
 602  if (dfa->nodes)
 603    for (i = 0; i < dfa->nodes_len; ++i)
 604      free_token (dfa->nodes + i);
 605  re_free (dfa->nexts);
 606  for (i = 0; i < dfa->nodes_len; ++i)
 607    {
 608      if (dfa->eclosures != NULL)
 609        re_node_set_free (dfa->eclosures + i);
 610      if (dfa->inveclosures != NULL)
 611        re_node_set_free (dfa->inveclosures + i);
 612      if (dfa->edests != NULL)
 613        re_node_set_free (dfa->edests + i);
 614    }
 615  re_free (dfa->edests);
 616  re_free (dfa->eclosures);
 617  re_free (dfa->inveclosures);
 618  re_free (dfa->nodes);
 619
 620  if (dfa->state_table)
 621    for (i = 0; i <= dfa->state_hash_mask; ++i)
 622      {
 623        struct re_state_table_entry *entry = dfa->state_table + i;
 624        for (j = 0; j < entry->num; ++j)
 625          {
 626            re_dfastate_t *state = entry->array[j];
 627            free_state (state);
 628          }
 629        re_free (entry->array);
 630      }
 631  re_free (dfa->state_table);
 632#ifdef RE_ENABLE_I18N
 633  if (dfa->sb_char != utf8_sb_map)
 634    re_free (dfa->sb_char);
 635#endif
 636  re_free (dfa->subexp_map);
 637#ifdef DEBUG
 638  re_free (dfa->re_str);
 639#endif
 640
 641  re_free (dfa);
 642}
 643
 644
 645/* Free dynamically allocated space used by PREG.  */
 646
 647void
 648regfree (regex_t *preg)
 649{
 650  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651  if (BE (dfa != NULL, 1))
 652    free_dfa_content (dfa);
 653  preg->buffer = NULL;
 654  preg->allocated = 0;
 655
 656  re_free (preg->fastmap);
 657  preg->fastmap = NULL;
 658
 659  re_free (preg->translate);
 660  preg->translate = NULL;
 661}
 662#ifdef _LIBC
 663weak_alias (__regfree, regfree)
 664#endif
 665\f
 666/* Entry points compatible with 4.2 BSD regex library.  We don't define
 667   them unless specifically requested.  */
 668
 669#if defined _REGEX_RE_COMP || defined _LIBC
 670
 671/* BSD has one and only one pattern buffer.  */
 672static struct re_pattern_buffer re_comp_buf;
 673
 674char *
 675# ifdef _LIBC
 676/* Make these definitions weak in libc, so POSIX programs can redefine
 677   these names if they don't use our functions, and still use
 678   regcomp/regexec above without link errors.  */
 679weak_function
 680# endif
 681re_comp (s)
 682     const char *s;
 683{
 684  reg_errcode_t ret;
 685  char *fastmap;
 686
 687  if (!s)
 688    {
 689      if (!re_comp_buf.buffer)
 690        return gettext ("No previous regular expression");
 691      return 0;
 692    }
 693
 694  if (re_comp_buf.buffer)
 695    {
 696      fastmap = re_comp_buf.fastmap;
 697      re_comp_buf.fastmap = NULL;
 698      __regfree (&re_comp_buf);
 699      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700      re_comp_buf.fastmap = fastmap;
 701    }
 702
 703  if (re_comp_buf.fastmap == NULL)
 704    {
 705      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706      if (re_comp_buf.fastmap == NULL)
 707        return (char *) gettext (__re_error_msgid
 708                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
 709    }
 710
 711  /* Since `re_exec' always passes NULL for the `regs' argument, we
 712     don't need to initialize the pattern buffer fields which affect it.  */
 713
 714  /* Match anchors at newlines.  */
 715  re_comp_buf.newline_anchor = 1;
 716
 717  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719  if (!ret)
 720    return NULL;
 721
 722  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724}
 725
 726#ifdef _LIBC
 727libc_freeres_fn (free_mem)
 728{
 729  __regfree (&re_comp_buf);
 730}
 731#endif
 732
 733#endif /* _REGEX_RE_COMP */
 734\f
 735/* Internal entry point.
 736   Compile the regular expression PATTERN, whose length is LENGTH.
 737   SYNTAX indicate regular expression's syntax.  */
 738
 739static reg_errcode_t
 740re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 741                     reg_syntax_t syntax)
 742{
 743  reg_errcode_t err = REG_NOERROR;
 744  re_dfa_t *dfa;
 745  re_string_t regexp;
 746
 747  /* Initialize the pattern buffer.  */
 748  preg->fastmap_accurate = 0;
 749  preg->syntax = syntax;
 750  preg->not_bol = preg->not_eol = 0;
 751  preg->used = 0;
 752  preg->re_nsub = 0;
 753  preg->can_be_null = 0;
 754  preg->regs_allocated = REGS_UNALLOCATED;
 755
 756  /* Initialize the dfa.  */
 757  dfa = (re_dfa_t *) preg->buffer;
 758  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 759    {
 760      /* If zero allocated, but buffer is non-null, try to realloc
 761         enough space.  This loses if buffer's address is bogus, but
 762         that is the user's responsibility.  If ->buffer is NULL this
 763         is a simple allocation.  */
 764      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 765      if (dfa == NULL)
 766        return REG_ESPACE;
 767      preg->allocated = sizeof (re_dfa_t);
 768      preg->buffer = (unsigned char *) dfa;
 769    }
 770  preg->used = sizeof (re_dfa_t);
 771
 772  err = init_dfa (dfa, length);
 773  if (BE (err != REG_NOERROR, 0))
 774    {
 775      free_dfa_content (dfa);
 776      preg->buffer = NULL;
 777      preg->allocated = 0;
 778      return err;
 779    }
 780#ifdef DEBUG
 781  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 782  dfa->re_str = re_malloc (char, length + 1);
 783  strncpy (dfa->re_str, pattern, length + 1);
 784#endif
 785
 786  __libc_lock_init (dfa->lock);
 787
 788  err = re_string_construct (&regexp, pattern, length, preg->translate,
 789                             syntax & RE_ICASE, dfa);
 790  if (BE (err != REG_NOERROR, 0))
 791    {
 792    re_compile_internal_free_return:
 793      free_workarea_compile (preg);
 794      re_string_destruct (&regexp);
 795      free_dfa_content (dfa);
 796      preg->buffer = NULL;
 797      preg->allocated = 0;
 798      return err;
 799    }
 800
 801  /* Parse the regular expression, and build a structure tree.  */
 802  preg->re_nsub = 0;
 803  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804  if (BE (dfa->str_tree == NULL, 0))
 805    goto re_compile_internal_free_return;
 806
 807  /* Analyze the tree and create the nfa.  */
 808  err = analyze (preg);
 809  if (BE (err != REG_NOERROR, 0))
 810    goto re_compile_internal_free_return;
 811
 812#ifdef RE_ENABLE_I18N
 813  /* If possible, do searching in single byte encoding to speed things up.  */
 814  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 815    optimize_utf8 (dfa);
 816#endif
 817
 818  /* Then create the initial state of the dfa.  */
 819  err = create_initial_state (dfa);
 820
 821  /* Release work areas.  */
 822  free_workarea_compile (preg);
 823  re_string_destruct (&regexp);
 824
 825  if (BE (err != REG_NOERROR, 0))
 826    {
 827      free_dfa_content (dfa);
 828      preg->buffer = NULL;
 829      preg->allocated = 0;
 830    }
 831
 832  return err;
 833}
 834
 835/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 836   as the initial length of some arrays.  */
 837
 838static reg_errcode_t
 839init_dfa (re_dfa_t *dfa, size_t pat_len)
 840{
 841  unsigned int table_size;
 842#ifndef _LIBC
 843  char *codeset_name;
 844#endif
 845
 846  memset (dfa, '\0', sizeof (re_dfa_t));
 847
 848  /* Force allocation of str_tree_storage the first time.  */
 849  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 850
 851  /* Avoid overflows.  */
 852  if (pat_len == SIZE_MAX)
 853    return REG_ESPACE;
 854
 855  dfa->nodes_alloc = pat_len + 1;
 856  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 857
 858  /*  table_size = 2 ^ ceil(log pat_len) */
 859  for (table_size = 1; ; table_size <<= 1)
 860    if (table_size > pat_len)
 861      break;
 862
 863  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 864  dfa->state_hash_mask = table_size - 1;
 865
 866  dfa->mb_cur_max = MB_CUR_MAX;
 867#ifdef _LIBC
 868  if (dfa->mb_cur_max == 6
 869      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 870    dfa->is_utf8 = 1;
 871  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 872                       != 0);
 873#else
 874# ifdef HAVE_LANGINFO_CODESET
 875  codeset_name = nl_langinfo (CODESET);
 876# else
 877  codeset_name = getenv ("LC_ALL");
 878  if (codeset_name == NULL || codeset_name[0] == '\0')
 879    codeset_name = getenv ("LC_CTYPE");
 880  if (codeset_name == NULL || codeset_name[0] == '\0')
 881    codeset_name = getenv ("LANG");
 882  if (codeset_name == NULL)
 883    codeset_name = "";
 884  else if (strchr (codeset_name, '.') !=  NULL)
 885    codeset_name = strchr (codeset_name, '.') + 1;
 886# endif
 887
 888  /* strcasecmp isn't a standard interface. brute force check */
 889#if 0
 890  if (strcasecmp (codeset_name, "UTF-8") == 0
 891      || strcasecmp (codeset_name, "UTF8") == 0)
 892    dfa->is_utf8 = 1;
 893#else
 894  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 895      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 896      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 897      && (codeset_name[3] == '-'
 898          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 899          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 900    dfa->is_utf8 = 1;
 901#endif
 902
 903  /* We check exhaustively in the loop below if this charset is a
 904     superset of ASCII.  */
 905  dfa->map_notascii = 0;
 906#endif
 907
 908#ifdef RE_ENABLE_I18N
 909  if (dfa->mb_cur_max > 1)
 910    {
 911      if (dfa->is_utf8)
 912        {
 913#if !defined(__GNUC__) || __GNUC__ < 3
 914          static short utf8_sb_map_inited = 0;
 915
 916          if (! utf8_sb_map_inited)
 917            {
 918                int i;
 919
 920                utf8_sb_map_inited = 0;
 921                for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 922                  utf8_sb_map[i] = BITSET_WORD_MAX;
 923            }
 924#endif
 925          dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 926        }
 927      else
 928        {
 929          int i, j, ch;
 930
 931          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 932          if (BE (dfa->sb_char == NULL, 0))
 933            return REG_ESPACE;
 934
 935          /* Set the bits corresponding to single byte chars.  */
 936          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 937            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 938              {
 939                wint_t wch = __btowc (ch);
 940                if (wch != WEOF)
 941                  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 942# ifndef _LIBC
 943                if (isascii (ch) && wch != ch)
 944                  dfa->map_notascii = 1;
 945# endif
 946              }
 947        }
 948    }
 949#endif
 950
 951  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 952    return REG_ESPACE;
 953  return REG_NOERROR;
 954}
 955
 956/* Initialize WORD_CHAR table, which indicate which character is
 957   "word".  In this case "word" means that it is the word construction
 958   character used by some operators like "\<", "\>", etc.  */
 959
 960static void
 961internal_function
 962init_word_char (re_dfa_t *dfa)
 963{
 964  int i, j, ch;
 965  dfa->word_ops_used = 1;
 966  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 967    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 968      if (isalnum (ch) || ch == '_')
 969        dfa->word_char[i] |= (bitset_word_t) 1 << j;
 970}
 971
 972/* Free the work area which are only used while compiling.  */
 973
 974static void
 975free_workarea_compile (regex_t *preg)
 976{
 977  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 978  bin_tree_storage_t *storage, *next;
 979  for (storage = dfa->str_tree_storage; storage; storage = next)
 980    {
 981      next = storage->next;
 982      re_free (storage);
 983    }
 984  dfa->str_tree_storage = NULL;
 985  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 986  dfa->str_tree = NULL;
 987  re_free (dfa->org_indices);
 988  dfa->org_indices = NULL;
 989}
 990
 991/* Create initial states for all contexts.  */
 992
 993static reg_errcode_t
 994create_initial_state (re_dfa_t *dfa)
 995{
 996  int first, i;
 997  reg_errcode_t err;
 998  re_node_set init_nodes;
 999
1000  /* Initial states have the epsilon closure of the node which is
1001     the first node of the regular expression.  */
1002  first = dfa->str_tree->first->node_idx;
1003  dfa->init_node = first;
1004  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1005  if (BE (err != REG_NOERROR, 0))
1006    return err;
1007
1008  /* The back-references which are in initial states can epsilon transit,
1009     since in this case all of the subexpressions can be null.
1010     Then we add epsilon closures of the nodes which are the next nodes of
1011     the back-references.  */
1012  if (dfa->nbackref > 0)
1013    for (i = 0; i < init_nodes.nelem; ++i)
1014      {
1015        int node_idx = init_nodes.elems[i];
1016        re_token_type_t type = dfa->nodes[node_idx].type;
1017
1018        int clexp_idx;
1019        if (type != OP_BACK_REF)
1020          continue;
1021        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1022          {
1023            re_token_t *clexp_node;
1024            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1025            if (clexp_node->type == OP_CLOSE_SUBEXP
1026                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1027              break;
1028          }
1029        if (clexp_idx == init_nodes.nelem)
1030          continue;
1031
1032        if (type == OP_BACK_REF)
1033          {
1034            int dest_idx = dfa->edests[node_idx].elems[0];
1035            if (!re_node_set_contains (&init_nodes, dest_idx))
1036              {
1037                reg_errcode_t err = re_node_set_merge (&init_nodes,
1038                                                       dfa->eclosures
1039                                                       + dest_idx);
1040                if (err != REG_NOERROR)
1041                  return err;
1042                i = 0;
1043              }
1044          }
1045      }
1046
1047  /* It must be the first time to invoke acquire_state.  */
1048  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1049  /* We don't check ERR here, since the initial state must not be NULL.  */
1050  if (BE (dfa->init_state == NULL, 0))
1051    return err;
1052  if (dfa->init_state->has_constraint)
1053    {
1054      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1055                                                       CONTEXT_WORD);
1056      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1057                                                     CONTEXT_NEWLINE);
1058      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1059                                                         &init_nodes,
1060                                                         CONTEXT_NEWLINE
1061                                                         | CONTEXT_BEGBUF);
1062      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1063              || dfa->init_state_begbuf == NULL, 0))
1064        return err;
1065    }
1066  else
1067    dfa->init_state_word = dfa->init_state_nl
1068      = dfa->init_state_begbuf = dfa->init_state;
1069
1070  re_node_set_free (&init_nodes);
1071  return REG_NOERROR;
1072}
1073\f
1074#ifdef RE_ENABLE_I18N
1075/* If it is possible to do searching in single byte encoding instead of UTF-8
1076   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1077   DFA nodes where needed.  */
1078
1079static void
1080optimize_utf8 (re_dfa_t *dfa)
1081{
1082  int node, i, mb_chars = 0, has_period = 0;
1083
1084  for (node = 0; node < dfa->nodes_len; ++node)
1085    switch (dfa->nodes[node].type)
1086      {
1087      case CHARACTER:
1088        if (dfa->nodes[node].opr.c >= 0x80)
1089          mb_chars = 1;
1090        break;
1091      case ANCHOR:
1092        switch (dfa->nodes[node].opr.ctx_type)
1093          {
1094          case LINE_FIRST:
1095          case LINE_LAST:
1096          case BUF_FIRST:
1097          case BUF_LAST:
1098            break;
1099          default:
1100            /* Word anchors etc. cannot be handled.  It's okay to test
1101               opr.ctx_type since constraints (for all DFA nodes) are
1102               created by ORing one or more opr.ctx_type values.  */
1103            return;
1104          }
1105        break;
1106      case OP_PERIOD:
1107        has_period = 1;
1108        break;
1109      case OP_BACK_REF:
1110      case OP_ALT:
1111      case END_OF_RE:
1112      case OP_DUP_ASTERISK:
1113      case OP_OPEN_SUBEXP:
1114      case OP_CLOSE_SUBEXP:
1115        break;
1116      case COMPLEX_BRACKET:
1117        return;
1118      case SIMPLE_BRACKET:
1119        /* Just double check.  The non-ASCII range starts at 0x80.  */
1120        assert (0x80 % BITSET_WORD_BITS == 0);
1121        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1122          if (dfa->nodes[node].opr.sbcset[i])
1123            return;
1124        break;
1125      default:
1126        abort ();
1127      }
1128
1129  if (mb_chars || has_period)
1130    for (node = 0; node < dfa->nodes_len; ++node)
1131      {
1132        if (dfa->nodes[node].type == CHARACTER
1133            && dfa->nodes[node].opr.c >= 0x80)
1134          dfa->nodes[node].mb_partial = 0;
1135        else if (dfa->nodes[node].type == OP_PERIOD)
1136          dfa->nodes[node].type = OP_UTF8_PERIOD;
1137      }
1138
1139  /* The search can be in single byte locale.  */
1140  dfa->mb_cur_max = 1;
1141  dfa->is_utf8 = 0;
1142  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1143}
1144#endif
1145\f
1146/* Analyze the structure tree, and calculate "first", "next", "edest",
1147   "eclosure", and "inveclosure".  */
1148
1149static reg_errcode_t
1150analyze (regex_t *preg)
1151{
1152  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1153  reg_errcode_t ret;
1154
1155  /* Allocate arrays.  */
1156  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1157  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1158  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1159  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1160  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1161          || dfa->eclosures == NULL, 0))
1162    return REG_ESPACE;
1163
1164  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1165  if (dfa->subexp_map != NULL)
1166    {
1167      int i;
1168      for (i = 0; i < preg->re_nsub; i++)
1169        dfa->subexp_map[i] = i;
1170      preorder (dfa->str_tree, optimize_subexps, dfa);
1171      for (i = 0; i < preg->re_nsub; i++)
1172        if (dfa->subexp_map[i] != i)
1173          break;
1174      if (i == preg->re_nsub)
1175        {
1176          free (dfa->subexp_map);
1177          dfa->subexp_map = NULL;
1178        }
1179    }
1180
1181  ret = postorder (dfa->str_tree, lower_subexps, preg);
1182  if (BE (ret != REG_NOERROR, 0))
1183    return ret;
1184  ret = postorder (dfa->str_tree, calc_first, dfa);
1185  if (BE (ret != REG_NOERROR, 0))
1186    return ret;
1187  preorder (dfa->str_tree, calc_next, dfa);
1188  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1189  if (BE (ret != REG_NOERROR, 0))
1190    return ret;
1191  ret = calc_eclosure (dfa);
1192  if (BE (ret != REG_NOERROR, 0))
1193    return ret;
1194
1195  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1196     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1197  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1198      || dfa->nbackref)
1199    {
1200      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1201      if (BE (dfa->inveclosures == NULL, 0))
1202        return REG_ESPACE;
1203      ret = calc_inveclosure (dfa);
1204    }
1205
1206  return ret;
1207}
1208
1209/* Our parse trees are very unbalanced, so we cannot use a stack to
1210   implement parse tree visits.  Instead, we use parent pointers and
1211   some hairy code in these two functions.  */
1212static reg_errcode_t
1213postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1214           void *extra)
1215{
1216  bin_tree_t *node, *prev;
1217
1218  for (node = root; ; )
1219    {
1220      /* Descend down the tree, preferably to the left (or to the right
1221         if that's the only child).  */
1222      while (node->left || node->right)
1223        if (node->left)
1224          node = node->left;
1225        else
1226          node = node->right;
1227
1228      do
1229        {
1230          reg_errcode_t err = fn (extra, node);
1231          if (BE (err != REG_NOERROR, 0))
1232            return err;
1233          if (node->parent == NULL)
1234            return REG_NOERROR;
1235          prev = node;
1236          node = node->parent;
1237        }
1238      /* Go up while we have a node that is reached from the right.  */
1239      while (node->right == prev || node->right == NULL);
1240      node = node->right;
1241    }
1242}
1243
1244static reg_errcode_t
1245preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1246          void *extra)
1247{
1248  bin_tree_t *node;
1249
1250  for (node = root; ; )
1251    {
1252      reg_errcode_t err = fn (extra, node);
1253      if (BE (err != REG_NOERROR, 0))
1254        return err;
1255
1256      /* Go to the left node, or up and to the right.  */
1257      if (node->left)
1258        node = node->left;
1259      else
1260        {
1261          bin_tree_t *prev = NULL;
1262          while (node->right == prev || node->right == NULL)
1263            {
1264              prev = node;
1265              node = node->parent;
1266              if (!node)
1267                return REG_NOERROR;
1268            }
1269          node = node->right;
1270        }
1271    }
1272}
1273
1274/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1275   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1276   backreferences as well.  Requires a preorder visit.  */
1277static reg_errcode_t
1278optimize_subexps (void *extra, bin_tree_t *node)
1279{
1280  re_dfa_t *dfa = (re_dfa_t *) extra;
1281
1282  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1283    {
1284      int idx = node->token.opr.idx;
1285      node->token.opr.idx = dfa->subexp_map[idx];
1286      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1287    }
1288
1289  else if (node->token.type == SUBEXP
1290           && node->left && node->left->token.type == SUBEXP)
1291    {
1292      int other_idx = node->left->token.opr.idx;
1293
1294      node->left = node->left->left;
1295      if (node->left)
1296        node->left->parent = node;
1297
1298      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1299      if (other_idx < BITSET_WORD_BITS)
1300          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1301    }
1302
1303  return REG_NOERROR;
1304}
1305
1306/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1307   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1308static reg_errcode_t
1309lower_subexps (void *extra, bin_tree_t *node)
1310{
1311  regex_t *preg = (regex_t *) extra;
1312  reg_errcode_t err = REG_NOERROR;
1313
1314  if (node->left && node->left->token.type == SUBEXP)
1315    {
1316      node->left = lower_subexp (&err, preg, node->left);
1317      if (node->left)
1318        node->left->parent = node;
1319    }
1320  if (node->right && node->right->token.type == SUBEXP)
1321    {
1322      node->right = lower_subexp (&err, preg, node->right);
1323      if (node->right)
1324        node->right->parent = node;
1325    }
1326
1327  return err;
1328}
1329
1330static bin_tree_t *
1331lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1332{
1333  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1334  bin_tree_t *body = node->left;
1335  bin_tree_t *op, *cls, *tree1, *tree;
1336
1337  if (preg->no_sub
1338      /* We do not optimize empty subexpressions, because otherwise we may
1339         have bad CONCAT nodes with NULL children.  This is obviously not
1340         very common, so we do not lose much.  An example that triggers
1341         this case is the sed "script" /\(\)/x.  */
1342      && node->left != NULL
1343      && (node->token.opr.idx >= BITSET_WORD_BITS
1344          || !(dfa->used_bkref_map
1345               & ((bitset_word_t) 1 << node->token.opr.idx))))
1346    return node->left;
1347
1348  /* Convert the SUBEXP node to the concatenation of an
1349     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1350  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1351  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1352  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1353  tree = create_tree (dfa, op, tree1, CONCAT);
1354  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1355    {
1356      *err = REG_ESPACE;
1357      return NULL;
1358    }
1359
1360  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1361  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1362  return tree;
1363}
1364
1365/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1366   nodes.  Requires a postorder visit.  */
1367static reg_errcode_t
1368calc_first (void *extra, bin_tree_t *node)
1369{
1370  re_dfa_t *dfa = (re_dfa_t *) extra;
1371  if (node->token.type == CONCAT)
1372    {
1373      node->first = node->left->first;
1374      node->node_idx = node->left->node_idx;
1375    }
1376  else
1377    {
1378      node->first = node;
1379      node->node_idx = re_dfa_add_node (dfa, node->token);
1380      if (BE (node->node_idx == -1, 0))
1381        return REG_ESPACE;
1382      if (node->token.type == ANCHOR)
1383        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1384    }
1385  return REG_NOERROR;
1386}
1387
1388/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1389static reg_errcode_t
1390calc_next (void *extra, bin_tree_t *node)
1391{
1392  switch (node->token.type)
1393    {
1394    case OP_DUP_ASTERISK:
1395      node->left->next = node;
1396      break;
1397    case CONCAT:
1398      node->left->next = node->right->first;
1399      node->right->next = node->next;
1400      break;
1401    default:
1402      if (node->left)
1403        node->left->next = node->next;
1404      if (node->right)
1405        node->right->next = node->next;
1406      break;
1407    }
1408  return REG_NOERROR;
1409}
1410
1411/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1412static reg_errcode_t
1413link_nfa_nodes (void *extra, bin_tree_t *node)
1414{
1415  re_dfa_t *dfa = (re_dfa_t *) extra;
1416  int idx = node->node_idx;
1417  reg_errcode_t err = REG_NOERROR;
1418
1419  switch (node->token.type)
1420    {
1421    case CONCAT:
1422      break;
1423
1424    case END_OF_RE:
1425      assert (node->next == NULL);
1426      break;
1427
1428    case OP_DUP_ASTERISK:
1429    case OP_ALT:
1430      {
1431        int left, right;
1432        dfa->has_plural_match = 1;
1433        if (node->left != NULL)
1434          left = node->left->first->node_idx;
1435        else
1436          left = node->next->node_idx;
1437        if (node->right != NULL)
1438          right = node->right->first->node_idx;
1439        else
1440          right = node->next->node_idx;
1441        assert (left > -1);
1442        assert (right > -1);
1443        err = re_node_set_init_2 (dfa->edests + idx, left, right);
1444      }
1445      break;
1446
1447    case ANCHOR:
1448    case OP_OPEN_SUBEXP:
1449    case OP_CLOSE_SUBEXP:
1450      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1451      break;
1452
1453    case OP_BACK_REF:
1454      dfa->nexts[idx] = node->next->node_idx;
1455      if (node->token.type == OP_BACK_REF)
1456        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1457      break;
1458
1459    default:
1460      assert (!IS_EPSILON_NODE (node->token.type));
1461      dfa->nexts[idx] = node->next->node_idx;
1462      break;
1463    }
1464
1465  return err;
1466}
1467
1468/* Duplicate the epsilon closure of the node ROOT_NODE.
1469   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1470   to their own constraint.  */
1471
1472static reg_errcode_t
1473internal_function
1474duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1475                        int root_node, unsigned int init_constraint)
1476{
1477  int org_node, clone_node, ret;
1478  unsigned int constraint = init_constraint;
1479  for (org_node = top_org_node, clone_node = top_clone_node;;)
1480    {
1481      int org_dest, clone_dest;
1482      if (dfa->nodes[org_node].type == OP_BACK_REF)
1483        {
1484          /* If the back reference epsilon-transit, its destination must
1485             also have the constraint.  Then duplicate the epsilon closure
1486             of the destination of the back reference, and store it in
1487             edests of the back reference.  */
1488          org_dest = dfa->nexts[org_node];
1489          re_node_set_empty (dfa->edests + clone_node);
1490          clone_dest = duplicate_node (dfa, org_dest, constraint);
1491          if (BE (clone_dest == -1, 0))
1492            return REG_ESPACE;
1493          dfa->nexts[clone_node] = dfa->nexts[org_node];
1494          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495          if (BE (ret < 0, 0))
1496            return REG_ESPACE;
1497        }
1498      else if (dfa->edests[org_node].nelem == 0)
1499        {
1500          /* In case of the node can't epsilon-transit, don't duplicate the
1501             destination and store the original destination as the
1502             destination of the node.  */
1503          dfa->nexts[clone_node] = dfa->nexts[org_node];
1504          break;
1505        }
1506      else if (dfa->edests[org_node].nelem == 1)
1507        {
1508          /* In case of the node can epsilon-transit, and it has only one
1509             destination.  */
1510          org_dest = dfa->edests[org_node].elems[0];
1511          re_node_set_empty (dfa->edests + clone_node);
1512          /* If the node is root_node itself, it means the epsilon clsoure
1513             has a loop.   Then tie it to the destination of the root_node.  */
1514          if (org_node == root_node && clone_node != org_node)
1515            {
1516              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1517              if (BE (ret < 0, 0))
1518                return REG_ESPACE;
1519              break;
1520            }
1521          /* In case of the node has another constraint, add it.  */
1522          constraint |= dfa->nodes[org_node].constraint;
1523          clone_dest = duplicate_node (dfa, org_dest, constraint);
1524          if (BE (clone_dest == -1, 0))
1525            return REG_ESPACE;
1526          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527          if (BE (ret < 0, 0))
1528            return REG_ESPACE;
1529        }
1530      else /* dfa->edests[org_node].nelem == 2 */
1531        {
1532          /* In case of the node can epsilon-transit, and it has two
1533             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1534          org_dest = dfa->edests[org_node].elems[0];
1535          re_node_set_empty (dfa->edests + clone_node);
1536          /* Search for a duplicated node which satisfies the constraint.  */
1537          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1538          if (clone_dest == -1)
1539            {
1540              /* There is no such duplicated node, create a new one.  */
1541              reg_errcode_t err;
1542              clone_dest = duplicate_node (dfa, org_dest, constraint);
1543              if (BE (clone_dest == -1, 0))
1544                return REG_ESPACE;
1545              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546              if (BE (ret < 0, 0))
1547                return REG_ESPACE;
1548              err = duplicate_node_closure (dfa, org_dest, clone_dest,
1549                                            root_node, constraint);
1550              if (BE (err != REG_NOERROR, 0))
1551                return err;
1552            }
1553          else
1554            {
1555              /* There is a duplicated node which satisfies the constraint,
1556                 use it to avoid infinite loop.  */
1557              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558              if (BE (ret < 0, 0))
1559                return REG_ESPACE;
1560            }
1561
1562          org_dest = dfa->edests[org_node].elems[1];
1563          clone_dest = duplicate_node (dfa, org_dest, constraint);
1564          if (BE (clone_dest == -1, 0))
1565            return REG_ESPACE;
1566          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567          if (BE (ret < 0, 0))
1568            return REG_ESPACE;
1569        }
1570      org_node = org_dest;
1571      clone_node = clone_dest;
1572    }
1573  return REG_NOERROR;
1574}
1575
1576/* Search for a node which is duplicated from the node ORG_NODE, and
1577   satisfies the constraint CONSTRAINT.  */
1578
1579static int
1580search_duplicated_node (const re_dfa_t *dfa, int org_node,
1581                        unsigned int constraint)
1582{
1583  int idx;
1584  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585    {
1586      if (org_node == dfa->org_indices[idx]
1587          && constraint == dfa->nodes[idx].constraint)
1588        return idx; /* Found.  */
1589    }
1590  return -1; /* Not found.  */
1591}
1592
1593/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594   Return the index of the new node, or -1 if insufficient storage is
1595   available.  */
1596
1597static int
1598duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1599{
1600  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1601  if (BE (dup_idx != -1, 1))
1602    {
1603      dfa->nodes[dup_idx].constraint = constraint;
1604      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1605      dfa->nodes[dup_idx].duplicated = 1;
1606
1607      /* Store the index of the original node.  */
1608      dfa->org_indices[dup_idx] = org_idx;
1609    }
1610  return dup_idx;
1611}
1612
1613static reg_errcode_t
1614calc_inveclosure (re_dfa_t *dfa)
1615{
1616  int src, idx, ret;
1617  for (idx = 0; idx < dfa->nodes_len; ++idx)
1618    re_node_set_init_empty (dfa->inveclosures + idx);
1619
1620  for (src = 0; src < dfa->nodes_len; ++src)
1621    {
1622      int *elems = dfa->eclosures[src].elems;
1623      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624        {
1625          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626          if (BE (ret == -1, 0))
1627            return REG_ESPACE;
1628        }
1629    }
1630
1631  return REG_NOERROR;
1632}
1633
1634/* Calculate "eclosure" for all the node in DFA.  */
1635
1636static reg_errcode_t
1637calc_eclosure (re_dfa_t *dfa)
1638{
1639  int node_idx, incomplete;
1640#ifdef DEBUG
1641  assert (dfa->nodes_len > 0);
1642#endif
1643  incomplete = 0;
1644  /* For each nodes, calculate epsilon closure.  */
1645  for (node_idx = 0; ; ++node_idx)
1646    {
1647      reg_errcode_t err;
1648      re_node_set eclosure_elem;
1649      if (node_idx == dfa->nodes_len)
1650        {
1651          if (!incomplete)
1652            break;
1653          incomplete = 0;
1654          node_idx = 0;
1655        }
1656
1657#ifdef DEBUG
1658      assert (dfa->eclosures[node_idx].nelem != -1);
1659#endif
1660
1661      /* If we have already calculated, skip it.  */
1662      if (dfa->eclosures[node_idx].nelem != 0)
1663        continue;
1664      /* Calculate epsilon closure of `node_idx'.  */
1665      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1666      if (BE (err != REG_NOERROR, 0))
1667        return err;
1668
1669      if (dfa->eclosures[node_idx].nelem == 0)
1670        {
1671          incomplete = 1;
1672          re_node_set_free (&eclosure_elem);
1673        }
1674    }
1675  return REG_NOERROR;
1676}
1677
1678/* Calculate epsilon closure of NODE.  */
1679
1680static reg_errcode_t
1681calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1682{
1683  reg_errcode_t err;
1684  int i;
1685  re_node_set eclosure;
1686  int ret;
1687  int incomplete = 0;
1688  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689  if (BE (err != REG_NOERROR, 0))
1690    return err;
1691
1692  /* This indicates that we are calculating this node now.
1693     We reference this value to avoid infinite loop.  */
1694  dfa->eclosures[node].nelem = -1;
1695
1696  /* If the current node has constraints, duplicate all nodes
1697     since they must inherit the constraints.  */
1698  if (dfa->nodes[node].constraint
1699      && dfa->edests[node].nelem
1700      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701    {
1702      err = duplicate_node_closure (dfa, node, node, node,
1703                                    dfa->nodes[node].constraint);
1704      if (BE (err != REG_NOERROR, 0))
1705        return err;
1706    }
1707
1708  /* Expand each epsilon destination nodes.  */
1709  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710    for (i = 0; i < dfa->edests[node].nelem; ++i)
1711      {
1712        re_node_set eclosure_elem;
1713        int edest = dfa->edests[node].elems[i];
1714        /* If calculating the epsilon closure of `edest' is in progress,
1715           return intermediate result.  */
1716        if (dfa->eclosures[edest].nelem == -1)
1717          {
1718            incomplete = 1;
1719            continue;
1720          }
1721        /* If we haven't calculated the epsilon closure of `edest' yet,
1722           calculate now. Otherwise use calculated epsilon closure.  */
1723        if (dfa->eclosures[edest].nelem == 0)
1724          {
1725            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1726            if (BE (err != REG_NOERROR, 0))
1727              return err;
1728          }
1729        else
1730          eclosure_elem = dfa->eclosures[edest];
1731        /* Merge the epsilon closure of `edest'.  */
1732        err = re_node_set_merge (&eclosure, &eclosure_elem);
1733        if (BE (err != REG_NOERROR, 0))
1734          return err;
1735        /* If the epsilon closure of `edest' is incomplete,
1736           the epsilon closure of this node is also incomplete.  */
1737        if (dfa->eclosures[edest].nelem == 0)
1738          {
1739            incomplete = 1;
1740            re_node_set_free (&eclosure_elem);
1741          }
1742      }
1743
1744  /* An epsilon closure includes itself.  */
1745  ret = re_node_set_insert (&eclosure, node);
1746  if (BE (ret < 0, 0))
1747    return REG_ESPACE;
1748  if (incomplete && !root)
1749    dfa->eclosures[node].nelem = 0;
1750  else
1751    dfa->eclosures[node] = eclosure;
1752  *new_set = eclosure;
1753  return REG_NOERROR;
1754}
1755\f
1756/* Functions for token which are used in the parser.  */
1757
1758/* Fetch a token from INPUT.
1759   We must not use this function inside bracket expressions.  */
1760
1761static void
1762internal_function
1763fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764{
1765  re_string_skip_bytes (input, peek_token (result, input, syntax));
1766}
1767
1768/* Peek a token from INPUT, and return the length of the token.
1769   We must not use this function inside bracket expressions.  */
1770
1771static int
1772internal_function
1773peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774{
1775  unsigned char c;
1776
1777  if (re_string_eoi (input))
1778    {
1779      token->type = END_OF_RE;
1780      return 0;
1781    }
1782
1783  c = re_string_peek_byte (input, 0);
1784  token->opr.c = c;
1785
1786  token->word_char = 0;
1787#ifdef RE_ENABLE_I18N
1788  token->mb_partial = 0;
1789  if (input->mb_cur_max > 1 &&
1790      !re_string_first_byte (input, re_string_cur_idx (input)))
1791    {
1792      token->type = CHARACTER;
1793      token->mb_partial = 1;
1794      return 1;
1795    }
1796#endif
1797  if (c == '\\')
1798    {
1799      unsigned char c2;
1800      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801        {
1802          token->type = BACK_SLASH;
1803          return 1;
1804        }
1805
1806      c2 = re_string_peek_byte_case (input, 1);
1807      token->opr.c = c2;
1808      token->type = CHARACTER;
1809#ifdef RE_ENABLE_I18N
1810      if (input->mb_cur_max > 1)
1811        {
1812          wint_t wc = re_string_wchar_at (input,
1813                                          re_string_cur_idx (input) + 1);
1814          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815        }
1816      else
1817#endif
1818        token->word_char = IS_WORD_CHAR (c2) != 0;
1819
1820      switch (c2)
1821        {
1822        case '|':
1823          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824            token->type = OP_ALT;
1825          break;
1826        case '1': case '2': case '3': case '4': case '5':
1827        case '6': case '7': case '8': case '9':
1828          if (!(syntax & RE_NO_BK_REFS))
1829            {
1830              token->type = OP_BACK_REF;
1831              token->opr.idx = c2 - '1';
1832            }
1833          break;
1834        case '<':
1835          if (!(syntax & RE_NO_GNU_OPS))
1836            {
1837              token->type = ANCHOR;
1838              token->opr.ctx_type = WORD_FIRST;
1839            }
1840          break;
1841        case '>':
1842          if (!(syntax & RE_NO_GNU_OPS))
1843            {
1844              token->type = ANCHOR;
1845              token->opr.ctx_type = WORD_LAST;
1846            }
1847          break;
1848        case 'b':
1849          if (!(syntax & RE_NO_GNU_OPS))
1850            {
1851              token->type = ANCHOR;
1852              token->opr.ctx_type = WORD_DELIM;
1853            }
1854          break;
1855        case 'B':
1856          if (!(syntax & RE_NO_GNU_OPS))
1857            {
1858              token->type = ANCHOR;
1859              token->opr.ctx_type = NOT_WORD_DELIM;
1860            }
1861          break;
1862        case 'w':
1863          if (!(syntax & RE_NO_GNU_OPS))
1864            token->type = OP_WORD;
1865          break;
1866        case 'W':
1867          if (!(syntax & RE_NO_GNU_OPS))
1868            token->type = OP_NOTWORD;
1869          break;
1870        case 's':
1871          if (!(syntax & RE_NO_GNU_OPS))
1872            token->type = OP_SPACE;
1873          break;
1874        case 'S':
1875          if (!(syntax & RE_NO_GNU_OPS))
1876            token->type = OP_NOTSPACE;
1877          break;
1878        case '`':
1879          if (!(syntax & RE_NO_GNU_OPS))
1880            {
1881              token->type = ANCHOR;
1882              token->opr.ctx_type = BUF_FIRST;
1883            }
1884          break;
1885        case '\'':
1886          if (!(syntax & RE_NO_GNU_OPS))
1887            {
1888              token->type = ANCHOR;
1889              token->opr.ctx_type = BUF_LAST;
1890            }
1891          break;
1892        case '(':
1893          if (!(syntax & RE_NO_BK_PARENS))
1894            token->type = OP_OPEN_SUBEXP;
1895          break;
1896        case ')':
1897          if (!(syntax & RE_NO_BK_PARENS))
1898            token->type = OP_CLOSE_SUBEXP;
1899          break;
1900        case '+':
1901          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902            token->type = OP_DUP_PLUS;
1903          break;
1904        case '?':
1905          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906            token->type = OP_DUP_QUESTION;
1907          break;
1908        case '{':
1909          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910            token->type = OP_OPEN_DUP_NUM;
1911          break;
1912        case '}':
1913          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914            token->type = OP_CLOSE_DUP_NUM;
1915          break;
1916        default:
1917          break;
1918        }
1919      return 2;
1920    }
1921
1922  token->type = CHARACTER;
1923#ifdef RE_ENABLE_I18N
1924  if (input->mb_cur_max > 1)
1925    {
1926      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928    }
1929  else
1930#endif
1931    token->word_char = IS_WORD_CHAR (token->opr.c);
1932
1933  switch (c)
1934    {
1935    case '\n':
1936      if (syntax & RE_NEWLINE_ALT)
1937        token->type = OP_ALT;
1938      break;
1939    case '|':
1940      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941        token->type = OP_ALT;
1942      break;
1943    case '*':
1944      token->type = OP_DUP_ASTERISK;
1945      break;
1946    case '+':
1947      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948        token->type = OP_DUP_PLUS;
1949      break;
1950    case '?':
1951      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952        token->type = OP_DUP_QUESTION;
1953      break;
1954    case '{':
1955      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956        token->type = OP_OPEN_DUP_NUM;
1957      break;
1958    case '}':
1959      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960        token->type = OP_CLOSE_DUP_NUM;
1961      break;
1962    case '(':
1963      if (syntax & RE_NO_BK_PARENS)
1964        token->type = OP_OPEN_SUBEXP;
1965      break;
1966    case ')':
1967      if (syntax & RE_NO_BK_PARENS)
1968        token->type = OP_CLOSE_SUBEXP;
1969      break;
1970    case '[':
1971      token->type = OP_OPEN_BRACKET;
1972      break;
1973    case '.':
1974      token->type = OP_PERIOD;
1975      break;
1976    case '^':
1977      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978          re_string_cur_idx (input) != 0)
1979        {
1980          char prev = re_string_peek_byte (input, -1);
1981          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982            break;
1983        }
1984      token->type = ANCHOR;
1985      token->opr.ctx_type = LINE_FIRST;
1986      break;
1987    case '$':
1988      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989          re_string_cur_idx (input) + 1 != re_string_length (input))
1990        {
1991          re_token_t next;
1992          re_string_skip_bytes (input, 1);
1993          peek_token (&next, input, syntax);
1994          re_string_skip_bytes (input, -1);
1995          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996            break;
1997        }
1998      token->type = ANCHOR;
1999      token->opr.ctx_type = LINE_LAST;
2000      break;
2001    default:
2002      break;
2003    }
2004  return 1;
2005}
2006
2007/* Peek a token from INPUT, and return the length of the token.
2008   We must not use this function out of bracket expressions.  */
2009
2010static int
2011internal_function
2012peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013{
2014  unsigned char c;
2015  if (re_string_eoi (input))
2016    {
2017      token->type = END_OF_RE;
2018      return 0;
2019    }
2020  c = re_string_peek_byte (input, 0);
2021  token->opr.c = c;
2022
2023#ifdef RE_ENABLE_I18N
2024  if (input->mb_cur_max > 1 &&
2025      !re_string_first_byte (input, re_string_cur_idx (input)))
2026    {
2027      token->type = CHARACTER;
2028      return 1;
2029    }
2030#endif /* RE_ENABLE_I18N */
2031
2032  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033      && re_string_cur_idx (input) + 1 < re_string_length (input))
2034    {
2035      /* In this case, '\' escape a character.  */
2036      unsigned char c2;
2037      re_string_skip_bytes (input, 1);
2038      c2 = re_string_peek_byte (input, 0);
2039      token->opr.c = c2;
2040      token->type = CHARACTER;
2041      return 1;
2042    }
2043  if (c == '[') /* '[' is a special char in a bracket exps.  */
2044    {
2045      unsigned char c2;
2046      int token_len;
2047      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048        c2 = re_string_peek_byte (input, 1);
2049      else
2050        c2 = 0;
2051      token->opr.c = c2;
2052      token_len = 2;
2053      switch (c2)
2054        {
2055        case '.':
2056          token->type = OP_OPEN_COLL_ELEM;
2057          break;
2058        case '=':
2059          token->type = OP_OPEN_EQUIV_CLASS;
2060          break;
2061        case ':':
2062          if (syntax & RE_CHAR_CLASSES)
2063            {
2064              token->type = OP_OPEN_CHAR_CLASS;
2065              break;
2066            }
2067          /* else fall through.  */
2068        default:
2069          token->type = CHARACTER;
2070          token->opr.c = c;
2071          token_len = 1;
2072          break;
2073        }
2074      return token_len;
2075    }
2076  switch (c)
2077    {
2078    case '-':
2079      token->type = OP_CHARSET_RANGE;
2080      break;
2081    case ']':
2082      token->type = OP_CLOSE_BRACKET;
2083      break;
2084    case '^':
2085      token->type = OP_NON_MATCH_LIST;
2086      break;
2087    default:
2088      token->type = CHARACTER;
2089    }
2090  return 1;
2091}
2092\f
2093/* Functions for parser.  */
2094
2095/* Entry point of the parser.
2096   Parse the regular expression REGEXP and return the structure tree.
2097   If an error has occurred, ERR is set by error code, and return NULL.
2098   This function build the following tree, from regular expression <reg_exp>:
2099           CAT
2100           / \
2101          /   \
2102   <reg_exp>  EOR
2103
2104   CAT means concatenation.
2105   EOR means end of regular expression.  */
2106
2107static bin_tree_t *
2108parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109       reg_errcode_t *err)
2110{
2111  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112  bin_tree_t *tree, *eor, *root;
2113  re_token_t current_token;
2114  dfa->syntax = syntax;
2115  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2117  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118    return NULL;
2119  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120  if (tree != NULL)
2121    root = create_tree (dfa, tree, eor, CONCAT);
2122  else
2123    root = eor;
2124  if (BE (eor == NULL || root == NULL, 0))
2125    {
2126      *err = REG_ESPACE;
2127      return NULL;
2128    }
2129  return root;
2130}
2131
2132/* This function build the following tree, from regular expression
2133   <branch1>|<branch2>:
2134           ALT
2135           / \
2136          /   \
2137   <branch1> <branch2>
2138
2139   ALT means alternative, which represents the operator `|'.  */
2140
2141static bin_tree_t *
2142parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2144{
2145  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146  bin_tree_t *tree, *branch = NULL;
2147  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149    return NULL;
2150
2151  while (token->type == OP_ALT)
2152    {
2153      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154      if (token->type != OP_ALT && token->type != END_OF_RE
2155          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156        {
2157          branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158          if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159            return NULL;
2160        }
2161      else
2162        branch = NULL;
2163      tree = create_tree (dfa, tree, branch, OP_ALT);
2164      if (BE (tree == NULL, 0))
2165        {
2166          *err = REG_ESPACE;
2167          return NULL;
2168        }
2169    }
2170  return tree;
2171}
2172
2173/* This function build the following tree, from regular expression
2174   <exp1><exp2>:
2175        CAT
2176        / \
2177       /   \
2178   <exp1> <exp2>
2179
2180   CAT means concatenation.  */
2181
2182static bin_tree_t *
2183parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184              reg_syntax_t syntax, int nest, reg_errcode_t *err)
2185{
2186  bin_tree_t *tree, *exp;
2187  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190    return NULL;
2191
2192  while (token->type != OP_ALT && token->type != END_OF_RE
2193         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194    {
2195      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2196      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2197        {
2198          return NULL;
2199        }
2200      if (tree != NULL && exp != NULL)
2201        {
2202          tree = create_tree (dfa, tree, exp, CONCAT);
2203          if (tree == NULL)
2204            {
2205              *err = REG_ESPACE;
2206              return NULL;
2207            }
2208        }
2209      else if (tree == NULL)
2210        tree = exp;
2211      /* Otherwise exp == NULL, we don't need to create new tree.  */
2212    }
2213  return tree;
2214}
2215
2216/* This function build the following tree, from regular expression a*:
2217         *
2218         |
2219         a
2220*/
2221
2222static bin_tree_t *
2223parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224                  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2225{
2226  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227  bin_tree_t *tree;
2228  switch (token->type)
2229    {
2230    case CHARACTER:
2231      tree = create_token_tree (dfa, NULL, NULL, token);
2232      if (BE (tree == NULL, 0))
2233        {
2234          *err = REG_ESPACE;
2235          return NULL;
2236        }
2237#ifdef RE_ENABLE_I18N
2238      if (dfa->mb_cur_max > 1)
2239        {
2240          while (!re_string_eoi (regexp)
2241                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242            {
2243              bin_tree_t *mbc_remain;
2244              fetch_token (token, regexp, syntax);
2245              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247              if (BE (mbc_remain == NULL || tree == NULL, 0))
2248                {
2249                  *err = REG_ESPACE;
2250                  return NULL;
2251                }
2252            }
2253        }
2254#endif
2255      break;
2256    case OP_OPEN_SUBEXP:
2257      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259        return NULL;
2260      break;
2261    case OP_OPEN_BRACKET:
2262      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264        return NULL;
2265      break;
2266    case OP_BACK_REF:
2267      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268        {
2269          *err = REG_ESUBREG;
2270          return NULL;
2271        }
2272      dfa->used_bkref_map |= 1 << token->opr.idx;
2273      tree = create_token_tree (dfa, NULL, NULL, token);
2274      if (BE (tree == NULL, 0))
2275        {
2276          *err = REG_ESPACE;
2277          return NULL;
2278        }
2279      ++dfa->nbackref;
2280      dfa->has_mb_node = 1;
2281      break;
2282    case OP_OPEN_DUP_NUM:
2283      if (syntax & RE_CONTEXT_INVALID_DUP)
2284        {
2285          *err = REG_BADRPT;
2286          return NULL;
2287        }
2288      /* FALLTHROUGH */
2289    case OP_DUP_ASTERISK:
2290    case OP_DUP_PLUS:
2291    case OP_DUP_QUESTION:
2292      if (syntax & RE_CONTEXT_INVALID_OPS)
2293        {
2294          *err = REG_BADRPT;
2295          return NULL;
2296        }
2297      else if (syntax & RE_CONTEXT_INDEP_OPS)
2298        {
2299          fetch_token (token, regexp, syntax);
2300          return parse_expression (regexp, preg, token, syntax, nest, err);
2301        }
2302      /* else fall through  */
2303    case OP_CLOSE_SUBEXP:
2304      if ((token->type == OP_CLOSE_SUBEXP) &&
2305          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306        {
2307          *err = REG_ERPAREN;
2308          return NULL;
2309        }
2310      /* else fall through  */
2311    case OP_CLOSE_DUP_NUM:
2312      /* We treat it as a normal character.  */
2313
2314      /* Then we can these characters as normal characters.  */
2315      token->type = CHARACTER;
2316      /* mb_partial and word_char bits should be initialized already
2317         by peek_token.  */
2318      tree = create_token_tree (dfa, NULL, NULL, token);
2319      if (BE (tree == NULL, 0))
2320        {
2321          *err = REG_ESPACE;
2322          return NULL;
2323        }
2324      break;
2325    case ANCHOR:
2326      if ((token->opr.ctx_type
2327           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328          && dfa->word_ops_used == 0)
2329        init_word_char (dfa);
2330      if (token->opr.ctx_type == WORD_DELIM
2331          || token->opr.ctx_type == NOT_WORD_DELIM)
2332        {
2333          bin_tree_t *tree_first, *tree_last;
2334          if (token->opr.ctx_type == WORD_DELIM)
2335            {
2336              token->opr.ctx_type = WORD_FIRST;
2337              tree_first = create_token_tree (dfa, NULL, NULL, token);
2338              token->opr.ctx_type = WORD_LAST;
2339            }
2340          else
2341            {
2342              token->opr.ctx_type = INSIDE_WORD;
2343              tree_first = create_token_tree (dfa, NULL, NULL, token);
2344              token->opr.ctx_type = INSIDE_NOTWORD;
2345            }
2346          tree_last = create_token_tree (dfa, NULL, NULL, token);
2347          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349            {
2350              *err = REG_ESPACE;
2351              return NULL;
2352            }
2353        }
2354      else
2355        {
2356          tree = create_token_tree (dfa, NULL, NULL, token);
2357          if (BE (tree == NULL, 0))
2358            {
2359              *err = REG_ESPACE;
2360              return NULL;
2361            }
2362        }
2363      /* We must return here, since ANCHORs can't be followed
2364         by repetition operators.
2365         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367      fetch_token (token, regexp, syntax);
2368      return tree;
2369    case OP_PERIOD:
2370      tree = create_token_tree (dfa, NULL, NULL, token);
2371      if (BE (tree == NULL, 0))
2372        {
2373          *err = REG_ESPACE;
2374          return NULL;
2375        }
2376      if (dfa->mb_cur_max > 1)
2377        dfa->has_mb_node = 1;
2378      break;
2379    case OP_WORD:
2380    case OP_NOTWORD:
2381      tree = build_charclass_op (dfa, regexp->trans,
2382                                 "alnum",
2383                                 "_",
2384                                 token->type == OP_NOTWORD, err);
2385      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386        return NULL;
2387      break;
2388    case OP_SPACE:
2389    case OP_NOTSPACE:
2390      tree = build_charclass_op (dfa, regexp->trans,
2391                                 "space",
2392                                 "",
2393                                 token->type == OP_NOTSPACE, err);
2394      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395        return NULL;
2396      break;
2397    case OP_ALT:
2398    case END_OF_RE:
2399      return NULL;
2400    case BACK_SLASH:
2401      *err = REG_EESCAPE;
2402      return NULL;
2403    default:
2404      /* Must not happen?  */
2405#ifdef DEBUG
2406      assert (0);
2407#endif
2408      return NULL;
2409    }
2410  fetch_token (token, regexp, syntax);
2411
2412  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414    {
2415      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417        return NULL;
2418      /* In BRE consecutive duplications are not allowed.  */
2419      if ((syntax & RE_CONTEXT_INVALID_DUP)
2420          && (token->type == OP_DUP_ASTERISK
2421              || token->type == OP_OPEN_DUP_NUM))
2422        {
2423          *err = REG_BADRPT;
2424          return NULL;
2425        }
2426    }
2427
2428  return tree;
2429}
2430
2431/* This function build the following tree, from regular expression
2432   (<reg_exp>):
2433         SUBEXP
2434            |
2435        <reg_exp>
2436*/
2437
2438static bin_tree_t *
2439parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2441{
2442  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443  bin_tree_t *tree;
2444  size_t cur_nsub;
2445  cur_nsub = preg->re_nsub++;
2446
2447  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
2449  /* The subexpression may be a null string.  */
2450  if (token->type == OP_CLOSE_SUBEXP)
2451    tree = NULL;
2452  else
2453    {
2454      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456        *err = REG_EPAREN;
2457      if (BE (*err != REG_NOERROR, 0))
2458        return NULL;
2459    }
2460
2461  if (cur_nsub <= '9' - '1')
2462    dfa->completed_bkref_map |= 1 << cur_nsub;
2463
2464  tree = create_tree (dfa, tree, NULL, SUBEXP);
2465  if (BE (tree == NULL, 0))
2466    {
2467      *err = REG_ESPACE;
2468      return NULL;
2469    }
2470  tree->token.opr.idx = cur_nsub;
2471  return tree;
2472}
2473
2474/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2475
2476static bin_tree_t *
2477parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479{
2480  bin_tree_t *tree = NULL, *old_tree = NULL;
2481  int i, start, end, start_idx = re_string_cur_idx (regexp);
2482#ifndef RE_TOKEN_INIT_BUG
2483  re_token_t start_token = *token;
2484#else
2485  re_token_t start_token;
2486
2487  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2488#endif
2489
2490  if (token->type == OP_OPEN_DUP_NUM)
2491    {
2492      end = 0;
2493      start = fetch_number (regexp, token, syntax);
2494      if (start == -1)
2495        {
2496          if (token->type == CHARACTER && token->opr.c == ',')
2497            start = 0; /* We treat "{,m}" as "{0,m}".  */
2498          else
2499            {
2500              *err = REG_BADBR; /* <re>{} is invalid.  */
2501              return NULL;
2502            }
2503        }
2504      if (BE (start != -2, 1))
2505        {
2506          /* We treat "{n}" as "{n,n}".  */
2507          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2508                 : ((token->type == CHARACTER && token->opr.c == ',')
2509                    ? fetch_number (regexp, token, syntax) : -2));
2510        }
2511      if (BE (start == -2 || end == -2, 0))
2512        {
2513          /* Invalid sequence.  */
2514          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2515            {
2516              if (token->type == END_OF_RE)
2517                *err = REG_EBRACE;
2518              else
2519                *err = REG_BADBR;
2520
2521              return NULL;
2522            }
2523
2524          /* If the syntax bit is set, rollback.  */
2525          re_string_set_index (regexp, start_idx);
2526          *token = start_token;
2527          token->type = CHARACTER;
2528          /* mb_partial and word_char bits should be already initialized by
2529             peek_token.  */
2530          return elem;
2531        }
2532
2533      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2534        {
2535          /* First number greater than second.  */
2536          *err = REG_BADBR;
2537          return NULL;
2538        }
2539    }
2540  else
2541    {
2542      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2543      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2544    }
2545
2546  fetch_token (token, regexp, syntax);
2547
2548  if (BE (elem == NULL, 0))
2549    return NULL;
2550  if (BE (start == 0 && end == 0, 0))
2551    {
2552      postorder (elem, free_tree, NULL);
2553      return NULL;
2554    }
2555
2556  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2557  if (BE (start > 0, 0))
2558    {
2559      tree = elem;
2560      for (i = 2; i <= start; ++i)
2561        {
2562          elem = duplicate_tree (elem, dfa);
2563          tree = create_tree (dfa, tree, elem, CONCAT);
2564          if (BE (elem == NULL || tree == NULL, 0))
2565            goto parse_dup_op_espace;
2566        }
2567
2568      if (start == end)
2569        return tree;
2570
2571      /* Duplicate ELEM before it is marked optional.  */
2572      elem = duplicate_tree (elem, dfa);
2573      old_tree = tree;
2574    }
2575  else
2576    old_tree = NULL;
2577
2578  if (elem->token.type == SUBEXP)
2579    postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2580
2581  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2582  if (BE (tree == NULL, 0))
2583    goto parse_dup_op_espace;
2584
2585  /* This loop is actually executed only when end != -1,
2586     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2587     already created the start+1-th copy.  */
2588  for (i = start + 2; i <= end; ++i)
2589    {
2590      elem = duplicate_tree (elem, dfa);
2591      tree = create_tree (dfa, tree, elem, CONCAT);
2592      if (BE (elem == NULL || tree == NULL, 0))
2593        goto parse_dup_op_espace;
2594
2595      tree = create_tree (dfa, tree, NULL, OP_ALT);
2596      if (BE (tree == NULL, 0))
2597        goto parse_dup_op_espace;
2598    }
2599
2600  if (old_tree)
2601    tree = create_tree (dfa, old_tree, tree, CONCAT);
2602
2603  return tree;
2604
2605 parse_dup_op_espace:
2606  *err = REG_ESPACE;
2607  return NULL;
2608}
2609
2610/* Size of the names for collating symbol/equivalence_class/character_class.
2611   I'm not sure, but maybe enough.  */
2612#define BRACKET_NAME_BUF_SIZE 32
2613
2614#ifndef _LIBC
2615  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616     Build the range expression which starts from START_ELEM, and ends
2617     at END_ELEM.  The result are written to MBCSET and SBCSET.
2618     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619     mbcset->range_ends, is a pointer argument since we may
2620     update it.  */
2621
2622static reg_errcode_t
2623internal_function
2624# ifdef RE_ENABLE_I18N
2625build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2626                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2627# else /* not RE_ENABLE_I18N */
2628build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2629                 bracket_elem_t *end_elem)
2630# endif /* not RE_ENABLE_I18N */
2631{
2632  unsigned int start_ch, end_ch;
2633  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2634  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2635          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2636          0))
2637    return REG_ERANGE;
2638
2639  /* We can handle no multi character collating elements without libc
2640     support.  */
2641  if (BE ((start_elem->type == COLL_SYM
2642           && strlen ((char *) start_elem->opr.name) > 1)
2643          || (end_elem->type == COLL_SYM
2644              && strlen ((char *) end_elem->opr.name) > 1), 0))
2645    return REG_ECOLLATE;
2646
2647# ifdef RE_ENABLE_I18N
2648  {
2649    wchar_t wc;
2650    wint_t start_wc;
2651    wint_t end_wc;
2652    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2653
2654    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2655                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2656                   : 0));
2657    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2658              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2659                 : 0));
2660#ifdef GAWK
2661    /*
2662     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2663     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2664     * unsigned, so we don't have sign extension problems.
2665     */
2666    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667                ? start_ch : start_elem->opr.wch);
2668    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669              ? end_ch : end_elem->opr.wch);
2670#else
2671    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2672                ? __btowc (start_ch) : start_elem->opr.wch);
2673    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2674              ? __btowc (end_ch) : end_elem->opr.wch);
2675#endif
2676    if (start_wc == WEOF || end_wc == WEOF)
2677      return REG_ECOLLATE;
2678    cmp_buf[0] = start_wc;
2679    cmp_buf[4] = end_wc;
2680    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2681      return REG_ERANGE;
2682
2683    /* Got valid collation sequence values, add them as a new entry.
2684       However, for !_LIBC we have no collation elements: if the
2685       character set is single byte, the single byte character set
2686       that we build below suffices.  parse_bracket_exp passes
2687       no MBCSET if dfa->mb_cur_max == 1.  */
2688    if (mbcset)
2689      {
2690        /* Check the space of the arrays.  */
2691        if (BE (*range_alloc == mbcset->nranges, 0))
2692          {
2693            /* There is not enough space, need realloc.  */
2694            wchar_t *new_array_start, *new_array_end;
2695            int new_nranges;
2696
2697            /* +1 in case of mbcset->nranges is 0.  */
2698            new_nranges = 2 * mbcset->nranges + 1;
2699            /* Use realloc since mbcset->range_starts and mbcset->range_ends
2700               are NULL if *range_alloc == 0.  */
2701            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2702                                          new_nranges);
2703            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2704                                        new_nranges);
2705
2706            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2707              return REG_ESPACE;
2708
2709            mbcset->range_starts = new_array_start;
2710            mbcset->range_ends = new_array_end;
2711            *range_alloc = new_nranges;
2712          }
2713
2714        mbcset->range_starts[mbcset->nranges] = start_wc;
2715        mbcset->range_ends[mbcset->nranges++] = end_wc;
2716      }
2717
2718    /* Build the table for single byte characters.  */
2719    for (wc = 0; wc < SBC_MAX; ++wc)
2720      {
2721        cmp_buf[2] = wc;
2722        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2723            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2724          bitset_set (sbcset, wc);
2725      }
2726  }
2727# else /* not RE_ENABLE_I18N */
2728  {
2729    unsigned int ch;
2730    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2731                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2732                   : 0));
2733    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2734              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2735                 : 0));
2736    if (start_ch > end_ch)
2737      return REG_ERANGE;
2738    /* Build the table for single byte characters.  */
2739    for (ch = 0; ch < SBC_MAX; ++ch)
2740      if (start_ch <= ch  && ch <= end_ch)
2741        bitset_set (sbcset, ch);
2742  }
2743# endif /* not RE_ENABLE_I18N */
2744  return REG_NOERROR;
2745}
2746#endif /* not _LIBC */
2747
2748#ifndef _LIBC
2749/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2750   Build the collating element which is represented by NAME.
2751   The result are written to MBCSET and SBCSET.
2752   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2753   pointer argument since we may update it.  */
2754
2755static reg_errcode_t
2756internal_function
2757# ifdef RE_ENABLE_I18N
2758build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2759                        int *coll_sym_alloc, const unsigned char *name)
2760# else /* not RE_ENABLE_I18N */
2761build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2762# endif /* not RE_ENABLE_I18N */
2763{
2764  size_t name_len = strlen ((const char *) name);
2765  if (BE (name_len != 1, 0))
2766    return REG_ECOLLATE;
2767  else
2768    {
2769      bitset_set (sbcset, name[0]);
2770      return REG_NOERROR;
2771    }
2772}
2773#endif /* not _LIBC */
2774
2775/* This function parse bracket expression like "[abc]", "[a-c]",
2776   "[[.a-a.]]" etc.  */
2777
2778static bin_tree_t *
2779parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2780                   reg_syntax_t syntax, reg_errcode_t *err)
2781{
2782#ifdef _LIBC
2783  const unsigned char *collseqmb;
2784  const char *collseqwc;
2785  uint32_t nrules;
2786  int32_t table_size;
2787  const int32_t *symb_table;
2788  const unsigned char *extra;
2789
2790  /* Local function for parse_bracket_exp used in _LIBC environment.
2791     Seek the collating symbol entry correspondings to NAME.
2792     Return the index of the symbol in the SYMB_TABLE.  */
2793
2794  auto inline int32_t
2795  __attribute ((always_inline))
2796  seek_collating_symbol_entry (name, name_len)
2797         const unsigned char *name;
2798         size_t name_len;
2799    {
2800      int32_t hash = elem_hash ((const char *) name, name_len);
2801      int32_t elem = hash % table_size;
2802      if (symb_table[2 * elem] != 0)
2803        {
2804          int32_t second = hash % (table_size - 2) + 1;
2805
2806          do
2807            {
2808              /* First compare the hashing value.  */
2809              if (symb_table[2 * elem] == hash
2810                  /* Compare the length of the name.  */
2811                  && name_len == extra[symb_table[2 * elem + 1]]
2812                  /* Compare the name.  */
2813                  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2814                             name_len) == 0)
2815                {
2816                  /* Yep, this is the entry.  */
2817                  break;
2818                }
2819
2820              /* Next entry.  */
2821              elem += second;
2822            }
2823          while (symb_table[2 * elem] != 0);
2824        }
2825      return elem;
2826    }
2827
2828  /* Local function for parse_bracket_exp used in _LIBC environment.
2829     Look up the collation sequence value of BR_ELEM.
2830     Return the value if succeeded, UINT_MAX otherwise.  */
2831
2832  auto inline unsigned int
2833  __attribute ((always_inline))
2834  lookup_collation_sequence_value (br_elem)
2835         bracket_elem_t *br_elem;
2836    {
2837      if (br_elem->type == SB_CHAR)
2838        {
2839          /*
2840          if (MB_CUR_MAX == 1)
2841          */
2842          if (nrules == 0)
2843            return collseqmb[br_elem->opr.ch];
2844          else
2845            {
2846              wint_t wc = __btowc (br_elem->opr.ch);
2847              return __collseq_table_lookup (collseqwc, wc);
2848            }
2849        }
2850      else if (br_elem->type == MB_CHAR)
2851        {
2852          if (nrules != 0)
2853            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2854        }
2855      else if (br_elem->type == COLL_SYM)
2856        {
2857          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2858          if (nrules != 0)
2859            {
2860              int32_t elem, idx;
2861              elem = seek_collating_symbol_entry (br_elem->opr.name,
2862                                                  sym_name_len);
2863              if (symb_table[2 * elem] != 0)
2864                {
2865                  /* We found the entry.  */
2866                  idx = symb_table[2 * elem + 1];
2867                  /* Skip the name of collating element name.  */
2868                  idx += 1 + extra[idx];
2869                  /* Skip the byte sequence of the collating element.  */
2870                  idx += 1 + extra[idx];
2871                  /* Adjust for the alignment.  */
2872                  idx = (idx + 3) & ~3;
2873                  /* Skip the multibyte collation sequence value.  */
2874                  idx += sizeof (unsigned int);
2875                  /* Skip the wide char sequence of the collating element.  */
2876                  idx += sizeof (unsigned int) *
2877                    (1 + *(unsigned int *) (extra + idx));
2878                  /* Return the collation sequence value.  */
2879                  return *(unsigned int *) (extra + idx);
2880                }
2881              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2882                {
2883                  /* No valid character.  Match it as a single byte
2884                     character.  */
2885                  return collseqmb[br_elem->opr.name[0]];
2886                }
2887            }
2888          else if (sym_name_len == 1)
2889            return collseqmb[br_elem->opr.name[0]];
2890        }
2891      return UINT_MAX;
2892    }
2893
2894  /* Local function for parse_bracket_exp used in _LIBC environment.
2895     Build the range expression which starts from START_ELEM, and ends
2896     at END_ELEM.  The result are written to MBCSET and SBCSET.
2897     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2898     mbcset->range_ends, is a pointer argument since we may
2899     update it.  */
2900
2901  auto inline reg_errcode_t
2902  __attribute ((always_inline))
2903  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2904         re_charset_t *mbcset;
2905         int *range_alloc;
2906         bitset_t sbcset;
2907         bracket_elem_t *start_elem, *end_elem;
2908    {
2909      unsigned int ch;
2910      uint32_t start_collseq;
2911      uint32_t end_collseq;
2912
2913      /* Equivalence Classes and Character Classes can't be a range
2914         start/end.  */
2915      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2916              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2917              0))
2918        return REG_ERANGE;
2919
2920      start_collseq = lookup_collation_sequence_value (start_elem);
2921      end_collseq = lookup_collation_sequence_value (end_elem);
2922      /* Check start/end collation sequence values.  */
2923      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2924        return REG_ECOLLATE;
2925      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2926        return REG_ERANGE;
2927
2928      /* Got valid collation sequence values, add them as a new entry.
2929         However, if we have no collation elements, and the character set
2930         is single byte, the single byte character set that we
2931         build below suffices. */
2932      if (nrules > 0 || dfa->mb_cur_max > 1)
2933        {
2934          /* Check the space of the arrays.  */
2935          if (BE (*range_alloc == mbcset->nranges, 0))
2936            {
2937              /* There is not enough space, need realloc.  */
2938              uint32_t *new_array_start;
2939              uint32_t *new_array_end;
2940              int new_nranges;
2941
2942              /* +1 in case of mbcset->nranges is 0.  */
2943              new_nranges = 2 * mbcset->nranges + 1;
2944              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2945                                            new_nranges);
2946              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2947                                          new_nranges);
2948
2949              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2950                return REG_ESPACE;
2951
2952              mbcset->range_starts = new_array_start;
2953              mbcset->range_ends = new_array_end;
2954              *range_alloc = new_nranges;
2955            }
2956
2957          mbcset->range_starts[mbcset->nranges] = start_collseq;
2958          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2959        }
2960
2961      /* Build the table for single byte characters.  */
2962      for (ch = 0; ch < SBC_MAX; ch++)
2963        {
2964          uint32_t ch_collseq;
2965          /*
2966          if (MB_CUR_MAX == 1)
2967          */
2968          if (nrules == 0)
2969            ch_collseq = collseqmb[ch];
2970          else
2971            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2972          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2973            bitset_set (sbcset, ch);
2974        }
2975      return REG_NOERROR;
2976    }
2977
2978  /* Local function for parse_bracket_exp used in _LIBC environment.
2979     Build the collating element which is represented by NAME.
2980     The result are written to MBCSET and SBCSET.
2981     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2982     pointer argument since we may update it.  */
2983
2984  auto inline reg_errcode_t
2985  __attribute ((always_inline))
2986  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2987         re_charset_t *mbcset;
2988         int *coll_sym_alloc;
2989         bitset_t sbcset;
2990         const unsigned char *name;
2991    {
2992      int32_t elem, idx;
2993      size_t name_len = strlen ((const char *) name);
2994      if (nrules != 0)
2995        {
2996          elem = seek_collating_symbol_entry (name, name_len);
2997          if (symb_table[2 * elem] != 0)
2998            {
2999              /* We found the entry.  */
3000              idx = symb_table[2 * elem + 1];
3001              /* Skip the name of collating element name.  */
3002              idx += 1 + extra[idx];
3003            }
3004          else if (symb_table[2 * elem] == 0 && name_len == 1)
3005            {
3006              /* No valid character, treat it as a normal
3007                 character.  */
3008              bitset_set (sbcset, name[0]);
3009              return REG_NOERROR;
3010            }
3011          else
3012            return REG_ECOLLATE;
3013
3014          /* Got valid collation sequence, add it as a new entry.  */
3015          /* Check the space of the arrays.  */
3016          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3017            {
3018              /* Not enough, realloc it.  */
3019              /* +1 in case of mbcset->ncoll_syms is 0.  */
3020              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3021              /* Use realloc since mbcset->coll_syms is NULL
3022                 if *alloc == 0.  */
3023              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3024                                                   new_coll_sym_alloc);
3025              if (BE (new_coll_syms == NULL, 0))
3026                return REG_ESPACE;
3027              mbcset->coll_syms = new_coll_syms;
3028              *coll_sym_alloc = new_coll_sym_alloc;
3029            }
3030          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3031          return REG_NOERROR;
3032        }
3033      else
3034        {
3035          if (BE (name_len != 1, 0))
3036            return REG_ECOLLATE;
3037          else
3038            {
3039              bitset_set (sbcset, name[0]);
3040              return REG_NOERROR;
3041            }
3042        }
3043    }
3044#endif
3045
3046  re_token_t br_token;
3047  re_bitset_ptr_t sbcset;
3048#ifdef RE_ENABLE_I18N
3049  re_charset_t *mbcset;
3050  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3051  int equiv_class_alloc = 0, char_class_alloc = 0;
3052#endif /* not RE_ENABLE_I18N */
3053  int non_match = 0;
3054  bin_tree_t *work_tree;
3055  int token_len;
3056  int first_round = 1;
3057#ifdef _LIBC
3058  collseqmb = (const unsigned char *)
3059    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3060  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3061  if (nrules)
3062    {
3063      /*
3064      if (MB_CUR_MAX > 1)
3065      */
3066      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3067      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3068      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3069                                                  _NL_COLLATE_SYMB_TABLEMB);
3070      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3071                                                   _NL_COLLATE_SYMB_EXTRAMB);
3072    }
3073#endif
3074  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3075#ifdef RE_ENABLE_I18N
3076  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3077#endif /* RE_ENABLE_I18N */
3078#ifdef RE_ENABLE_I18N
3079  if (BE (sbcset == NULL || mbcset == NULL, 0))
3080#else
3081  if (BE (sbcset == NULL, 0))
3082#endif /* RE_ENABLE_I18N */
3083    {
3084      *err = REG_ESPACE;
3085      return NULL;
3086    }
3087
3088  token_len = peek_token_bracket (token, regexp, syntax);
3089  if (BE (token->type == END_OF_RE, 0))
3090    {
3091      *err = REG_BADPAT;
3092      goto parse_bracket_exp_free_return;
3093    }
3094  if (token->type == OP_NON_MATCH_LIST)
3095    {
3096#ifdef RE_ENABLE_I18N
3097      mbcset->non_match = 1;
3098#endif /* not RE_ENABLE_I18N */
3099      non_match = 1;
3100      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3101        bitset_set (sbcset, '\n');
3102      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3103      token_len = peek_token_bracket (token, regexp, syntax);
3104      if (BE (token->type == END_OF_RE, 0))
3105        {
3106          *err = REG_BADPAT;
3107          goto parse_bracket_exp_free_return;
3108        }
3109    }
3110
3111  /* We treat the first ']' as a normal character.  */
3112  if (token->type == OP_CLOSE_BRACKET)
3113    token->type = CHARACTER;
3114
3115  while (1)
3116    {
3117      bracket_elem_t start_elem, end_elem;
3118      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3119      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3120      reg_errcode_t ret;
3121      int token_len2 = 0, is_range_exp = 0;
3122      re_token_t token2;
3123
3124      start_elem.opr.name = start_name_buf;
3125      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3126                                   syntax, first_round);
3127      if (BE (ret != REG_NOERROR, 0))
3128        {
3129          *err = ret;
3130          goto parse_bracket_exp_free_return;
3131        }
3132      first_round = 0;
3133
3134      /* Get information about the next token.  We need it in any case.  */
3135      token_len = peek_token_bracket (token, regexp, syntax);
3136
3137      /* Do not check for ranges if we know they are not allowed.  */
3138      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3139        {
3140          if (BE (token->type == END_OF_RE, 0))
3141            {
3142              *err = REG_EBRACK;
3143              goto parse_bracket_exp_free_return;
3144            }
3145          if (token->type == OP_CHARSET_RANGE)
3146            {
3147              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3148              token_len2 = peek_token_bracket (&token2, regexp, syntax);
3149              if (BE (token2.type == END_OF_RE, 0))
3150                {
3151                  *err = REG_EBRACK;
3152                  goto parse_bracket_exp_free_return;
3153                }
3154              if (token2.type == OP_CLOSE_BRACKET)
3155                {
3156                  /* We treat the last '-' as a normal character.  */
3157                  re_string_skip_bytes (regexp, -token_len);
3158                  token->type = CHARACTER;
3159                }
3160              else
3161                is_range_exp = 1;
3162            }
3163        }
3164
3165      if (is_range_exp == 1)
3166        {
3167          end_elem.opr.name = end_name_buf;
3168          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3169                                       dfa, syntax, 1);
3170          if (BE (ret != REG_NOERROR, 0))
3171            {
3172              *err = ret;
3173              goto parse_bracket_exp_free_return;
3174            }
3175
3176          token_len = peek_token_bracket (token, regexp, syntax);
3177
3178#ifdef _LIBC
3179          *err = build_range_exp (sbcset, mbcset, &range_alloc,
3180                                  &start_elem, &end_elem);
3181#else
3182# ifdef RE_ENABLE_I18N
3183          *err = build_range_exp (sbcset,
3184                                  dfa->mb_cur_max > 1 ? mbcset : NULL,
3185                                  &range_alloc, &start_elem, &end_elem);
3186# else
3187          *err = build_range_exp (sbcset, &start_elem, &end_elem);
3188# endif
3189#endif /* RE_ENABLE_I18N */
3190          if (BE (*err != REG_NOERROR, 0))
3191            goto parse_bracket_exp_free_return;
3192        }
3193      else
3194        {
3195          switch (start_elem.type)
3196            {
3197            case SB_CHAR:
3198              bitset_set (sbcset, start_elem.opr.ch);
3199              break;
3200#ifdef RE_ENABLE_I18N
3201            case MB_CHAR:
3202              /* Check whether the array has enough space.  */
3203              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3204                {
3205                  wchar_t *new_mbchars;
3206                  /* Not enough, realloc it.  */
3207                  /* +1 in case of mbcset->nmbchars is 0.  */
3208                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3209                  /* Use realloc since array is NULL if *alloc == 0.  */
3210                  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3211                                            mbchar_alloc);
3212                  if (BE (new_mbchars == NULL, 0))
3213                    goto parse_bracket_exp_espace;
3214                  mbcset->mbchars = new_mbchars;
3215                }
3216              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3217              break;
3218#endif /* RE_ENABLE_I18N */
3219            case EQUIV_CLASS:
3220              *err = build_equiv_class (sbcset,
3221#ifdef RE_ENABLE_I18N
3222                                        mbcset, &equiv_class_alloc,
3223#endif /* RE_ENABLE_I18N */
3224                                        start_elem.opr.name);
3225              if (BE (*err != REG_NOERROR, 0))
3226                goto parse_bracket_exp_free_return;
3227              break;
3228            case COLL_SYM:
3229              *err = build_collating_symbol (sbcset,
3230#ifdef RE_ENABLE_I18N
3231                                             mbcset, &coll_sym_alloc,
3232#endif /* RE_ENABLE_I18N */
3233                                             start_elem.opr.name);
3234              if (BE (*err != REG_NOERROR, 0))
3235                goto parse_bracket_exp_free_return;
3236              break;
3237            case CHAR_CLASS:
3238              *err = build_charclass (regexp->trans, sbcset,
3239#ifdef RE_ENABLE_I18N
3240                                      mbcset, &char_class_alloc,
3241#endif /* RE_ENABLE_I18N */
3242                                      (const char *) start_elem.opr.name, syntax);
3243              if (BE (*err != REG_NOERROR, 0))
3244               goto parse_bracket_exp_free_return;
3245              break;
3246            default:
3247              assert (0);
3248              break;
3249            }
3250        }
3251      if (BE (token->type == END_OF_RE, 0))
3252        {
3253          *err = REG_EBRACK;
3254          goto parse_bracket_exp_free_return;
3255        }
3256      if (token->type == OP_CLOSE_BRACKET)
3257        break;
3258    }
3259
3260  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3261
3262  /* If it is non-matching list.  */
3263  if (non_match)
3264    bitset_not (sbcset);
3265
3266#ifdef RE_ENABLE_I18N
3267  /* Ensure only single byte characters are set.  */
3268  if (dfa->mb_cur_max > 1)
3269    bitset_mask (sbcset, dfa->sb_char);
3270
3271  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3272      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3273                                                     || mbcset->non_match)))
3274    {
3275      bin_tree_t *mbc_tree;
3276      int sbc_idx;
3277      /* Build a tree for complex bracket.  */
3278      dfa->has_mb_node = 1;
3279      br_token.type = COMPLEX_BRACKET;
3280      br_token.opr.mbcset = mbcset;
3281      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282      if (BE (mbc_tree == NULL, 0))
3283        goto parse_bracket_exp_espace;
3284      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3285        if (sbcset[sbc_idx])
3286          break;
3287      /* If there are no bits set in sbcset, there is no point
3288         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3289      if (sbc_idx < BITSET_WORDS)
3290        {
3291          /* Build a tree for simple bracket.  */
3292          br_token.type = SIMPLE_BRACKET;
3293          br_token.opr.sbcset = sbcset;
3294          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3295          if (BE (work_tree == NULL, 0))
3296            goto parse_bracket_exp_espace;
3297
3298          /* Then join them by ALT node.  */
3299          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3300          if (BE (work_tree == NULL, 0))
3301            goto parse_bracket_exp_espace;
3302        }
3303      else
3304        {
3305          re_free (sbcset);
3306          work_tree = mbc_tree;
3307        }
3308    }
3309  else
3310#endif /* not RE_ENABLE_I18N */
3311    {
3312#ifdef RE_ENABLE_I18N
3313      free_charset (mbcset);
3314#endif
3315      /* Build a tree for simple bracket.  */
3316      br_token.type = SIMPLE_BRACKET;
3317      br_token.opr.sbcset = sbcset;
3318      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3319      if (BE (work_tree == NULL, 0))
3320        goto parse_bracket_exp_espace;
3321    }
3322  return work_tree;
3323
3324 parse_bracket_exp_espace:
3325  *err = REG_ESPACE;
3326 parse_bracket_exp_free_return:
3327  re_free (sbcset);
3328#ifdef RE_ENABLE_I18N
3329  free_charset (mbcset);
3330#endif /* RE_ENABLE_I18N */
3331  return NULL;
3332}
3333
3334/* Parse an element in the bracket expression.  */
3335
3336static reg_errcode_t
3337parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3338                       re_token_t *token, int token_len, re_dfa_t *dfa,
3339                       reg_syntax_t syntax, int accept_hyphen)
3340{
3341#ifdef RE_ENABLE_I18N
3342  int cur_char_size;
3343  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3344  if (cur_char_size > 1)
3345    {
3346      elem->type = MB_CHAR;
3347      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3348      re_string_skip_bytes (regexp, cur_char_size);
3349      return REG_NOERROR;
3350    }
3351#endif /* RE_ENABLE_I18N */
3352  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3353  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3354      || token->type == OP_OPEN_EQUIV_CLASS)
3355    return parse_bracket_symbol (elem, regexp, token);
3356  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3357    {
3358      /* A '-' must only appear as anything but a range indicator before
3359         the closing bracket.  Everything else is an error.  */
3360      re_token_t token2;
3361      (void) peek_token_bracket (&token2, regexp, syntax);
3362      if (token2.type != OP_CLOSE_BRACKET)
3363        /* The actual error value is not standardized since this whole
3364           case is undefined.  But ERANGE makes good sense.  */
3365        return REG_ERANGE;
3366    }
3367  elem->type = SB_CHAR;
3368  elem->opr.ch = token->opr.c;
3369  return REG_NOERROR;
3370}
3371
3372/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3373   such as [:<character_class>:], [.<collating_element>.], and
3374   [=<equivalent_class>=].  */
3375
3376static reg_errcode_t
3377parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3378                      re_token_t *token)
3379{
3380  unsigned char ch, delim = token->opr.c;
3381  int i = 0;
3382  if (re_string_eoi(regexp))
3383    return REG_EBRACK;
3384  for (;; ++i)
3385    {
3386      if (i >= BRACKET_NAME_BUF_SIZE)
3387        return REG_EBRACK;
3388      if (token->type == OP_OPEN_CHAR_CLASS)
3389        ch = re_string_fetch_byte_case (regexp);
3390      else
3391        ch = re_string_fetch_byte (regexp);
3392      if (re_string_eoi(regexp))
3393        return REG_EBRACK;
3394      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3395        break;
3396      elem->opr.name[i] = ch;
3397    }
3398  re_string_skip_bytes (regexp, 1);
3399  elem->opr.name[i] = '\0';
3400  switch (token->type)
3401    {
3402    case OP_OPEN_COLL_ELEM:
3403      elem->type = COLL_SYM;
3404      break;
3405    case OP_OPEN_EQUIV_CLASS:
3406      elem->type = EQUIV_CLASS;
3407      break;
3408    case OP_OPEN_CHAR_CLASS:
3409      elem->type = CHAR_CLASS;
3410      break;
3411    default:
3412      break;
3413    }
3414  return REG_NOERROR;
3415}
3416
3417  /* Helper function for parse_bracket_exp.
3418     Build the equivalence class which is represented by NAME.
3419     The result are written to MBCSET and SBCSET.
3420     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3421     is a pointer argument since we may update it.  */
3422
3423static reg_errcode_t
3424#ifdef RE_ENABLE_I18N
3425build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3426                   int *equiv_class_alloc, const unsigned char *name)
3427#else /* not RE_ENABLE_I18N */
3428build_equiv_class (bitset_t sbcset, const unsigned char *name)
3429#endif /* not RE_ENABLE_I18N */
3430{
3431#ifdef _LIBC
3432  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3433  if (nrules != 0)
3434    {
3435      const int32_t *table, *indirect;
3436      const unsigned char *weights, *extra, *cp;
3437      unsigned char char_buf[2];
3438      int32_t idx1, idx2;
3439      unsigned int ch;
3440      size_t len;
3441      /* This #include defines a local function!  */
3442# include <locale/weight.h>
3443      /* Calculate the index for equivalence class.  */
3444      cp = name;
3445      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3446      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3447                                               _NL_COLLATE_WEIGHTMB);
3448      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3449                                                   _NL_COLLATE_EXTRAMB);
3450      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3451                                                _NL_COLLATE_INDIRECTMB);
3452      idx1 = findidx (&cp);
3453      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3454        /* This isn't a valid character.  */
3455        return REG_ECOLLATE;
3456
3457      /* Build single byte matcing table for this equivalence class.  */
3458      char_buf[1] = (unsigned char) '\0';
3459      len = weights[idx1 & 0xffffff];
3460      for (ch = 0; ch < SBC_MAX; ++ch)
3461        {
3462          char_buf[0] = ch;
3463          cp = char_buf;
3464          idx2 = findidx (&cp);
3465/*
3466          idx2 = table[ch];
3467*/
3468          if (idx2 == 0)
3469            /* This isn't a valid character.  */
3470            continue;
3471          /* Compare only if the length matches and the collation rule
3472             index is the same.  */
3473          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3474            {
3475              int cnt = 0;
3476
3477              while (cnt <= len &&
3478                     weights[(idx1 & 0xffffff) + 1 + cnt]
3479                     == weights[(idx2 & 0xffffff) + 1 + cnt])
3480                ++cnt;
3481
3482              if (cnt > len)
3483                bitset_set (sbcset, ch);
3484            }
3485        }
3486      /* Check whether the array has enough space.  */
3487      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3488        {
3489          /* Not enough, realloc it.  */
3490          /* +1 in case of mbcset->nequiv_classes is 0.  */
3491          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3492          /* Use realloc since the array is NULL if *alloc == 0.  */
3493          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3494                                                   int32_t,
3495                                                   new_equiv_class_alloc);
3496          if (BE (new_equiv_classes == NULL, 0))
3497            return REG_ESPACE;
3498          mbcset->equiv_classes = new_equiv_classes;
3499          *equiv_class_alloc = new_equiv_class_alloc;
3500        }
3501      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3502    }
3503  else
3504#endif /* _LIBC */
3505    {
3506      if (BE (strlen ((const char *) name) != 1, 0))
3507        return REG_ECOLLATE;
3508      bitset_set (sbcset, *name);
3509    }
3510  return REG_NOERROR;
3511}
3512
3513  /* Helper function for parse_bracket_exp.
3514     Build the character class which is represented by NAME.
3515     The result are written to MBCSET and SBCSET.
3516     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3517     is a pointer argument since we may update it.  */
3518
3519static reg_errcode_t
3520#ifdef RE_ENABLE_I18N
3521build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522                 re_charset_t *mbcset, int *char_class_alloc,
3523                 const char *class_name, reg_syntax_t syntax)
3524#else /* not RE_ENABLE_I18N */
3525build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3526                 const char *class_name, reg_syntax_t syntax)
3527#endif /* not RE_ENABLE_I18N */
3528{
3529  int i;
3530
3531  /* In case of REG_ICASE "upper" and "lower" match the both of
3532     upper and lower cases.  */
3533  if ((syntax & RE_ICASE)
3534      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3535    class_name = "alpha";
3536
3537#ifdef RE_ENABLE_I18N
3538  /* Check the space of the arrays.  */
3539  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3540    {
3541      /* Not enough, realloc it.  */
3542      /* +1 in case of mbcset->nchar_classes is 0.  */
3543      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3544      /* Use realloc since array is NULL if *alloc == 0.  */
3545      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3546                                               new_char_class_alloc);
3547      if (BE (new_char_classes == NULL, 0))
3548        return REG_ESPACE;
3549      mbcset->char_classes = new_char_classes;
3550      *char_class_alloc = new_char_class_alloc;
3551    }
3552  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3553#endif /* RE_ENABLE_I18N */
3554
3555#define BUILD_CHARCLASS_LOOP(ctype_func)        \
3556  do {                                          \
3557    if (BE (trans != NULL, 0))                  \
3558      {                                         \
3559        for (i = 0; i < SBC_MAX; ++i)           \
3560          if (ctype_func (i))                   \
3561            bitset_set (sbcset, trans[i]);      \
3562      }                                         \
3563    else                                        \
3564      {                                         \
3565        for (i = 0; i < SBC_MAX; ++i)           \
3566          if (ctype_func (i))                   \
3567            bitset_set (sbcset, i);             \
3568      }                                         \
3569  } while (0)
3570
3571  if (strcmp (class_name, "alnum") == 0)
3572    BUILD_CHARCLASS_LOOP (isalnum);
3573  else if (strcmp (class_name, "cntrl") == 0)
3574    BUILD_CHARCLASS_LOOP (iscntrl);
3575  else if (strcmp (class_name, "lower") == 0)
3576    BUILD_CHARCLASS_LOOP (islower);
3577  else if (strcmp (class_name, "space") == 0)
3578    BUILD_CHARCLASS_LOOP (isspace);
3579  else if (strcmp (class_name, "alpha") == 0)
3580    BUILD_CHARCLASS_LOOP (isalpha);
3581  else if (strcmp (class_name, "digit") == 0)
3582    BUILD_CHARCLASS_LOOP (isdigit);
3583  else if (strcmp (class_name, "print") == 0)
3584    BUILD_CHARCLASS_LOOP (isprint);
3585  else if (strcmp (class_name, "upper") == 0)
3586    BUILD_CHARCLASS_LOOP (isupper);
3587  else if (strcmp (class_name, "blank") == 0)
3588#ifndef GAWK
3589    BUILD_CHARCLASS_LOOP (isblank);
3590#else
3591    /* see comments above */
3592    BUILD_CHARCLASS_LOOP (is_blank);
3593#endif
3594  else if (strcmp (class_name, "graph") == 0)
3595    BUILD_CHARCLASS_LOOP (isgraph);
3596  else if (strcmp (class_name, "punct") == 0)
3597    BUILD_CHARCLASS_LOOP (ispunct);
3598  else if (strcmp (class_name, "xdigit") == 0)
3599    BUILD_CHARCLASS_LOOP (isxdigit);
3600  else
3601    return REG_ECTYPE;
3602
3603  return REG_NOERROR;
3604}
3605
3606static bin_tree_t *
3607build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3608                    const char *class_name,
3609                    const char *extra, int non_match,
3610                    reg_errcode_t *err)
3611{
3612  re_bitset_ptr_t sbcset;
3613#ifdef RE_ENABLE_I18N
3614  re_charset_t *mbcset;
3615  int alloc = 0;
3616#endif /* not RE_ENABLE_I18N */
3617  reg_errcode_t ret;
3618  re_token_t br_token;
3619  bin_tree_t *tree;
3620
3621  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3622#ifdef RE_ENABLE_I18N
3623  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3624#endif /* RE_ENABLE_I18N */
3625
3626#ifdef RE_ENABLE_I18N
3627  if (BE (sbcset == NULL || mbcset == NULL, 0))
3628#else /* not RE_ENABLE_I18N */
3629  if (BE (sbcset == NULL, 0))
3630#endif /* not RE_ENABLE_I18N */
3631    {
3632      *err = REG_ESPACE;
3633      return NULL;
3634    }
3635
3636  if (non_match)
3637    {
3638#ifdef RE_ENABLE_I18N
3639      mbcset->non_match = 1;
3640#endif /* not RE_ENABLE_I18N */
3641    }
3642
3643  /* We don't care the syntax in this case.  */
3644  ret = build_charclass (trans, sbcset,
3645#ifdef RE_ENABLE_I18N
3646                         mbcset, &alloc,
3647#endif /* RE_ENABLE_I18N */
3648                         class_name, 0);
3649
3650  if (BE (ret != REG_NOERROR, 0))
3651    {
3652      re_free (sbcset);
3653#ifdef RE_ENABLE_I18N
3654      free_charset (mbcset);
3655#endif /* RE_ENABLE_I18N */
3656      *err = ret;
3657      return NULL;
3658    }
3659  /* \w match '_' also.  */
3660  for (; *extra; extra++)
3661    bitset_set (sbcset, *extra);
3662
3663  /* If it is non-matching list.  */
3664  if (non_match)
3665    bitset_not (sbcset);
3666
3667#ifdef RE_ENABLE_I18N
3668  /* Ensure only single byte characters are set.  */
3669  if (dfa->mb_cur_max > 1)
3670    bitset_mask (sbcset, dfa->sb_char);
3671#endif
3672
3673  /* Build a tree for simple bracket.  */
3674  br_token.type = SIMPLE_BRACKET;
3675  br_token.opr.sbcset = sbcset;
3676  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3677  if (BE (tree == NULL, 0))
3678    goto build_word_op_espace;
3679
3680#ifdef RE_ENABLE_I18N
3681  if (dfa->mb_cur_max > 1)
3682    {
3683      bin_tree_t *mbc_tree;
3684      /* Build a tree for complex bracket.  */
3685      br_token.type = COMPLEX_BRACKET;
3686      br_token.opr.mbcset = mbcset;
3687      dfa->has_mb_node = 1;
3688      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3689      if (BE (mbc_tree == NULL, 0))
3690        goto build_word_op_espace;
3691      /* Then join them by ALT node.  */
3692      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3693      if (BE (mbc_tree != NULL, 1))
3694        return tree;
3695    }
3696  else
3697    {
3698      free_charset (mbcset);
3699      return tree;
3700    }
3701#else /* not RE_ENABLE_I18N */
3702  return tree;
3703#endif /* not RE_ENABLE_I18N */
3704
3705 build_word_op_espace:
3706  re_free (sbcset);
3707#ifdef RE_ENABLE_I18N
3708  free_charset (mbcset);
3709#endif /* RE_ENABLE_I18N */
3710  *err = REG_ESPACE;
3711  return NULL;
3712}
3713
3714/* This is intended for the expressions like "a{1,3}".
3715   Fetch a number from `input', and return the number.
3716   Return -1, if the number field is empty like "{,1}".
3717   Return -2, if an error has occurred.  */
3718
3719static int
3720fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3721{
3722  int num = -1;
3723  unsigned char c;
3724  while (1)
3725    {
3726      fetch_token (token, input, syntax);
3727      c = token->opr.c;
3728      if (BE (token->type == END_OF_RE, 0))
3729        return -2;
3730      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3731        break;
3732      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3733             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3734      num = (num > RE_DUP_MAX) ? -2 : num;
3735    }
3736  return num;
3737}
3738\f
3739#ifdef RE_ENABLE_I18N
3740static void
3741free_charset (re_charset_t *cset)
3742{
3743  re_free (cset->mbchars);
3744# ifdef _LIBC
3745  re_free (cset->coll_syms);
3746  re_free (cset->equiv_classes);
3747  re_free (cset->range_starts);
3748  re_free (cset->range_ends);
3749# endif
3750  re_free (cset->char_classes);
3751  re_free (cset);
3752}
3753#endif /* RE_ENABLE_I18N */
3754\f
3755/* Functions for binary tree operation.  */
3756
3757/* Create a tree node.  */
3758
3759static bin_tree_t *
3760create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3761             re_token_type_t type)
3762{
3763  re_token_t t;
3764  t.type = type;
3765  return create_token_tree (dfa, left, right, &t);
3766}
3767
3768static bin_tree_t *
3769create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3770                   const re_token_t *token)
3771{
3772  bin_tree_t *tree;
3773  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3774    {
3775      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3776
3777      if (storage == NULL)
3778        return NULL;
3779      storage->next = dfa->str_tree_storage;
3780      dfa->str_tree_storage = storage;
3781      dfa->str_tree_storage_idx = 0;
3782    }
3783  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3784
3785  tree->parent = NULL;
3786  tree->left = left;
3787  tree->right = right;
3788  tree->token = *token;
3789  tree->token.duplicated = 0;
3790  tree->token.opt_subexp = 0;
3791  tree->first = NULL;
3792  tree->next = NULL;
3793  tree->node_idx = -1;
3794
3795  if (left != NULL)
3796    left->parent = tree;
3797  if (right != NULL)
3798    right->parent = tree;
3799  return tree;
3800}
3801
3802/* Mark the tree SRC as an optional subexpression.
3803   To be called from preorder or postorder.  */
3804
3805static reg_errcode_t
3806mark_opt_subexp (void *extra, bin_tree_t *node)
3807{
3808  int idx = (int) (intptr_t) extra;
3809  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3810    node->token.opt_subexp = 1;
3811
3812  return REG_NOERROR;
3813}
3814
3815/* Free the allocated memory inside NODE. */
3816
3817static void
3818free_token (re_token_t *node)
3819{
3820#ifdef RE_ENABLE_I18N
3821  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3822    free_charset (node->opr.mbcset);
3823  else
3824#endif /* RE_ENABLE_I18N */
3825    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3826      re_free (node->opr.sbcset);
3827}
3828
3829/* Worker function for tree walking.  Free the allocated memory inside NODE
3830   and its children. */
3831
3832static reg_errcode_t
3833free_tree (void *extra, bin_tree_t *node)
3834{
3835  free_token (&node->token);
3836  return REG_NOERROR;
3837}
3838
3839
3840/* Duplicate the node SRC, and return new node.  This is a preorder
3841   visit similar to the one implemented by the generic visitor, but
3842   we need more infrastructure to maintain two parallel trees --- so,
3843   it's easier to duplicate.  */
3844
3845static bin_tree_t *
3846duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3847{
3848  const bin_tree_t *node;
3849  bin_tree_t *dup_root;
3850  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3851
3852  for (node = root; ; )
3853    {
3854      /* Create a new tree and link it back to the current parent.  */
3855      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3856      if (*p_new == NULL)
3857        return NULL;
3858      (*p_new)->parent = dup_node;
3859      (*p_new)->token.duplicated = 1;
3860      dup_node = *p_new;
3861
3862      /* Go to the left node, or up and to the right.  */
3863      if (node->left)
3864        {
3865          node = node->left;
3866          p_new = &dup_node->left;
3867        }
3868      else
3869        {
3870          const bin_tree_t *prev = NULL;
3871          while (node->right == prev || node->right == NULL)
3872            {
3873              prev = node;
3874              node = node->parent;
3875              dup_node = dup_node->parent;
3876              if (!node)
3877                return dup_root;
3878            }
3879          node = node->right;
3880          p_new = &dup_node->right;
3881        }
3882    }
3883}