compat / regex / regcomp.con commit Merge branch 'jk/virtual-objects-do-exist' (83b13e2)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, see
  18   <http://www.gnu.org/licenses/>.  */
  19
  20#if defined __TANDEM
  21 /* This is currently duplicated from git-compat-utils.h */
  22# ifdef NO_INTPTR_T
  23 typedef long intptr_t;
  24 typedef unsigned long uintptr_t;
  25# endif
  26#endif
  27
  28static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  29                                          size_t length, reg_syntax_t syntax);
  30static void re_compile_fastmap_iter (regex_t *bufp,
  31                                     const re_dfastate_t *init_state,
  32                                     char *fastmap);
  33static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  34#ifdef RE_ENABLE_I18N
  35static void free_charset (re_charset_t *cset);
  36#endif /* RE_ENABLE_I18N */
  37static void free_workarea_compile (regex_t *preg);
  38static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  39#ifdef RE_ENABLE_I18N
  40static void optimize_utf8 (re_dfa_t *dfa);
  41#endif
  42static reg_errcode_t analyze (regex_t *preg);
  43static reg_errcode_t preorder (bin_tree_t *root,
  44                               reg_errcode_t (fn (void *, bin_tree_t *)),
  45                               void *extra);
  46static reg_errcode_t postorder (bin_tree_t *root,
  47                                reg_errcode_t (fn (void *, bin_tree_t *)),
  48                                void *extra);
  49static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  50static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  51static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  52                                 bin_tree_t *node);
  53static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  54static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  55static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  56static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  57static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  58                                   unsigned int constraint);
  59static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  60static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  61                                         int node, int root);
  62static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  63static int fetch_number (re_string_t *input, re_token_t *token,
  64                         reg_syntax_t syntax);
  65static int peek_token (re_token_t *token, re_string_t *input,
  66                        reg_syntax_t syntax) internal_function;
  67static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  68                          reg_syntax_t syntax, reg_errcode_t *err);
  69static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  70                                  re_token_t *token, reg_syntax_t syntax,
  71                                  int nest, reg_errcode_t *err);
  72static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  73                                 re_token_t *token, reg_syntax_t syntax,
  74                                 int nest, reg_errcode_t *err);
  75static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  76                                     re_token_t *token, reg_syntax_t syntax,
  77                                     int nest, reg_errcode_t *err);
  78static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  79                                  re_token_t *token, reg_syntax_t syntax,
  80                                  int nest, reg_errcode_t *err);
  81static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  82                                 re_dfa_t *dfa, re_token_t *token,
  83                                 reg_syntax_t syntax, reg_errcode_t *err);
  84static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  85                                      re_token_t *token, reg_syntax_t syntax,
  86                                      reg_errcode_t *err);
  87static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  88                                            re_string_t *regexp,
  89                                            re_token_t *token, int token_len,
  90                                            re_dfa_t *dfa,
  91                                            reg_syntax_t syntax,
  92                                            int accept_hyphen);
  93static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  94                                          re_string_t *regexp,
  95                                          re_token_t *token);
  96#ifdef RE_ENABLE_I18N
  97static reg_errcode_t build_equiv_class (bitset_t sbcset,
  98                                        re_charset_t *mbcset,
  99                                        int *equiv_class_alloc,
 100                                        const unsigned char *name);
 101static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 102                                      bitset_t sbcset,
 103                                      re_charset_t *mbcset,
 104                                      int *char_class_alloc,
 105                                      const char *class_name,
 106                                      reg_syntax_t syntax);
 107#else  /* not RE_ENABLE_I18N */
 108static reg_errcode_t build_equiv_class (bitset_t sbcset,
 109                                        const unsigned char *name);
 110static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 111                                      bitset_t sbcset,
 112                                      const char *class_name,
 113                                      reg_syntax_t syntax);
 114#endif /* not RE_ENABLE_I18N */
 115static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 116                                       RE_TRANSLATE_TYPE trans,
 117                                       const char *class_name,
 118                                       const char *extra,
 119                                       int non_match, reg_errcode_t *err);
 120static bin_tree_t *create_tree (re_dfa_t *dfa,
 121                                bin_tree_t *left, bin_tree_t *right,
 122                                re_token_type_t type);
 123static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 124                                      bin_tree_t *left, bin_tree_t *right,
 125                                      const re_token_t *token);
 126static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 127static void free_token (re_token_t *node);
 128static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 129static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 130\f
 131/* This table gives an error message for each of the error codes listed
 132   in regex.h.  Obviously the order here has to be same as there.
 133   POSIX doesn't require that we do anything for REG_NOERROR,
 134   but why not be nice?  */
 135
 136const char __re_error_msgid[] attribute_hidden =
 137  {
 138#define REG_NOERROR_IDX 0
 139    gettext_noop ("Success")    /* REG_NOERROR */
 140    "\0"
 141#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 142    gettext_noop ("No match")   /* REG_NOMATCH */
 143    "\0"
 144#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 145    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 146    "\0"
 147#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 148    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 149    "\0"
 150#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 151    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 152    "\0"
 153#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 154    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 155    "\0"
 156#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 157    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 158    "\0"
 159#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 160    gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 161    "\0"
 162#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 163    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 164    "\0"
 165#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 166    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 167    "\0"
 168#define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 169    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 170    "\0"
 171#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 172    gettext_noop ("Invalid range end")  /* REG_ERANGE */
 173    "\0"
 174#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 175    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 176    "\0"
 177#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 178    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 179    "\0"
 180#define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 181    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 182    "\0"
 183#define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 184    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 185    "\0"
 186#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 187    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 188  };
 189
 190const size_t __re_error_msgid_idx[] attribute_hidden =
 191  {
 192    REG_NOERROR_IDX,
 193    REG_NOMATCH_IDX,
 194    REG_BADPAT_IDX,
 195    REG_ECOLLATE_IDX,
 196    REG_ECTYPE_IDX,
 197    REG_EESCAPE_IDX,
 198    REG_ESUBREG_IDX,
 199    REG_EBRACK_IDX,
 200    REG_EPAREN_IDX,
 201    REG_EBRACE_IDX,
 202    REG_BADBR_IDX,
 203    REG_ERANGE_IDX,
 204    REG_ESPACE_IDX,
 205    REG_BADRPT_IDX,
 206    REG_EEND_IDX,
 207    REG_ESIZE_IDX,
 208    REG_ERPAREN_IDX
 209  };
 210\f
 211/* Entry points for GNU code.  */
 212
 213
 214#ifdef ZOS_USS
 215
 216/* For ZOS USS we must define btowc */
 217
 218wchar_t 
 219btowc (int c)
 220{
 221   wchar_t wtmp[2];
 222   char tmp[2];
 223
 224   tmp[0] = c;
 225   tmp[1] = 0;
 226
 227   mbtowc (wtmp, tmp, 1);
 228   return wtmp[0];
 229}
 230#endif
 231
 232/* re_compile_pattern is the GNU regular expression compiler: it
 233   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234   Returns 0 if the pattern was valid, otherwise an error string.
 235
 236   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237   are set in BUFP on entry.  */
 238
 239const char *
 240re_compile_pattern (const char *pattern,
 241                    size_t length,
 242                    struct re_pattern_buffer *bufp)
 243{
 244  reg_errcode_t ret;
 245
 246  /* And GNU code determines whether or not to get register information
 247     by passing null for the REGS argument to re_match, etc., not by
 248     setting no_sub, unless RE_NO_SUB is set.  */
 249  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 250
 251  /* Match anchors at newline.  */
 252  bufp->newline_anchor = 1;
 253
 254  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 255
 256  if (!ret)
 257    return NULL;
 258  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 259}
 260#ifdef _LIBC
 261weak_alias (__re_compile_pattern, re_compile_pattern)
 262#endif
 263
 264/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 265   also be assigned to arbitrarily: each pattern buffer stores its own
 266   syntax, so it can be changed between regex compilations.  */
 267/* This has no initializer because initialized variables in Emacs
 268   become read-only after dumping.  */
 269reg_syntax_t re_syntax_options;
 270
 271
 272/* Specify the precise syntax of regexps for compilation.  This provides
 273   for compatibility for various utilities which historically have
 274   different, incompatible syntaxes.
 275
 276   The argument SYNTAX is a bit mask comprised of the various bits
 277   defined in regex.h.  We return the old syntax.  */
 278
 279reg_syntax_t
 280re_set_syntax (reg_syntax_t syntax)
 281{
 282  reg_syntax_t ret = re_syntax_options;
 283
 284  re_syntax_options = syntax;
 285  return ret;
 286}
 287#ifdef _LIBC
 288weak_alias (__re_set_syntax, re_set_syntax)
 289#endif
 290
 291int
 292re_compile_fastmap (struct re_pattern_buffer *bufp)
 293{
 294  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 295  char *fastmap = bufp->fastmap;
 296
 297  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 298  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 299  if (dfa->init_state != dfa->init_state_word)
 300    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 301  if (dfa->init_state != dfa->init_state_nl)
 302    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 303  if (dfa->init_state != dfa->init_state_begbuf)
 304    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 305  bufp->fastmap_accurate = 1;
 306  return 0;
 307}
 308#ifdef _LIBC
 309weak_alias (__re_compile_fastmap, re_compile_fastmap)
 310#endif
 311
 312static inline void
 313__attribute ((always_inline))
 314re_set_fastmap (char *fastmap, int icase, int ch)
 315{
 316  fastmap[ch] = 1;
 317  if (icase)
 318    fastmap[tolower (ch)] = 1;
 319}
 320
 321/* Helper function for re_compile_fastmap.
 322   Compile fastmap for the initial_state INIT_STATE.  */
 323
 324static void
 325re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 326                         char *fastmap)
 327{
 328  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 329  int node_cnt;
 330  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 331  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 332    {
 333      int node = init_state->nodes.elems[node_cnt];
 334      re_token_type_t type = dfa->nodes[node].type;
 335
 336      if (type == CHARACTER)
 337        {
 338          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 339#ifdef RE_ENABLE_I18N
 340          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 341            {
 342              unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 343              wchar_t wc;
 344              mbstate_t state;
 345
 346              p = buf;
 347              *p++ = dfa->nodes[node].opr.c;
 348              while (++node < dfa->nodes_len
 349                     && dfa->nodes[node].type == CHARACTER
 350                     && dfa->nodes[node].mb_partial)
 351                *p++ = dfa->nodes[node].opr.c;
 352              memset (&state, '\0', sizeof (state));
 353              if (__mbrtowc (&wc, (const char *) buf, p - buf,
 354                             &state) == p - buf
 355                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 356                      != (size_t) -1))
 357                re_set_fastmap (fastmap, 0, buf[0]);
 358              re_free (buf);
 359            }
 360#endif
 361        }
 362      else if (type == SIMPLE_BRACKET)
 363        {
 364          int i, ch;
 365          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 366            {
 367              int j;
 368              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 369              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 370                if (w & ((bitset_word_t) 1 << j))
 371                  re_set_fastmap (fastmap, icase, ch);
 372            }
 373        }
 374#ifdef RE_ENABLE_I18N
 375      else if (type == COMPLEX_BRACKET)
 376        {
 377          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 378          int i;
 379
 380# ifdef _LIBC
 381          /* See if we have to try all bytes which start multiple collation
 382             elements.
 383             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 384                  collation element, and don't catch 'b' since 'b' is
 385                  the only collation element which starts from 'b' (and
 386                  it is caught by SIMPLE_BRACKET).  */
 387              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 388                  && (cset->ncoll_syms || cset->nranges))
 389                {
 390                  const int32_t *table = (const int32_t *)
 391                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 392                  for (i = 0; i < SBC_MAX; ++i)
 393                    if (table[i] < 0)
 394                      re_set_fastmap (fastmap, icase, i);
 395                }
 396# endif /* _LIBC */
 397
 398          /* See if we have to start the match at all multibyte characters,
 399             i.e. where we would not find an invalid sequence.  This only
 400             applies to multibyte character sets; for single byte character
 401             sets, the SIMPLE_BRACKET again suffices.  */
 402          if (dfa->mb_cur_max > 1
 403              && (cset->nchar_classes || cset->non_match || cset->nranges
 404# ifdef _LIBC
 405                  || cset->nequiv_classes
 406# endif /* _LIBC */
 407                 ))
 408            {
 409              unsigned char c = 0;
 410              do
 411                {
 412                  mbstate_t mbs;
 413                  memset (&mbs, 0, sizeof (mbs));
 414                  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 415                    re_set_fastmap (fastmap, false, (int) c);
 416                }
 417              while (++c != 0);
 418            }
 419
 420          else
 421            {
 422              /* ... Else catch all bytes which can start the mbchars.  */
 423              for (i = 0; i < cset->nmbchars; ++i)
 424                {
 425                  char buf[256];
 426                  mbstate_t state;
 427                  memset (&state, '\0', sizeof (state));
 428                  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 429                    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 430                  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 431                    {
 432                      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 433                          != (size_t) -1)
 434                        re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 435                    }
 436                }
 437            }
 438        }
 439#endif /* RE_ENABLE_I18N */
 440      else if (type == OP_PERIOD
 441#ifdef RE_ENABLE_I18N
 442               || type == OP_UTF8_PERIOD
 443#endif /* RE_ENABLE_I18N */
 444               || type == END_OF_RE)
 445        {
 446          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 447          if (type == END_OF_RE)
 448            bufp->can_be_null = 1;
 449          return;
 450        }
 451    }
 452}
 453\f
 454/* Entry point for POSIX code.  */
 455/* regcomp takes a regular expression as a string and compiles it.
 456
 457   PREG is a regex_t *.  We do not expect any fields to be initialized,
 458   since POSIX says we shouldn't.  Thus, we set
 459
 460     `buffer' to the compiled pattern;
 461     `used' to the length of the compiled pattern;
 462     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 463       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 464       RE_SYNTAX_POSIX_BASIC;
 465     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 466     `fastmap' to an allocated space for the fastmap;
 467     `fastmap_accurate' to zero;
 468     `re_nsub' to the number of subexpressions in PATTERN.
 469
 470   PATTERN is the address of the pattern string.
 471
 472   CFLAGS is a series of bits which affect compilation.
 473
 474     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 475     use POSIX basic syntax.
 476
 477     If REG_NEWLINE is set, then . and [^...] don't match newline.
 478     Also, regexec will try a match beginning after every newline.
 479
 480     If REG_ICASE is set, then we considers upper- and lowercase
 481     versions of letters to be equivalent when matching.
 482
 483     If REG_NOSUB is set, then when PREG is passed to regexec, that
 484     routine will report only success or failure, and nothing about the
 485     registers.
 486
 487   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 488   the return codes and their meanings.)  */
 489
 490int
 491regcomp (regex_t *__restrict preg,
 492         const char *__restrict pattern,
 493         int cflags)
 494{
 495  reg_errcode_t ret;
 496  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 497                         : RE_SYNTAX_POSIX_BASIC);
 498
 499  preg->buffer = NULL;
 500  preg->allocated = 0;
 501  preg->used = 0;
 502
 503  /* Try to allocate space for the fastmap.  */
 504  preg->fastmap = re_malloc (char, SBC_MAX);
 505  if (BE (preg->fastmap == NULL, 0))
 506    return REG_ESPACE;
 507
 508  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 509
 510  /* If REG_NEWLINE is set, newlines are treated differently.  */
 511  if (cflags & REG_NEWLINE)
 512    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 513      syntax &= ~RE_DOT_NEWLINE;
 514      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 515      /* It also changes the matching behavior.  */
 516      preg->newline_anchor = 1;
 517    }
 518  else
 519    preg->newline_anchor = 0;
 520  preg->no_sub = !!(cflags & REG_NOSUB);
 521  preg->translate = NULL;
 522
 523  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 524
 525  /* POSIX doesn't distinguish between an unmatched open-group and an
 526     unmatched close-group: both are REG_EPAREN.  */
 527  if (ret == REG_ERPAREN)
 528    ret = REG_EPAREN;
 529
 530  /* We have already checked preg->fastmap != NULL.  */
 531  if (BE (ret == REG_NOERROR, 1))
 532    /* Compute the fastmap now, since regexec cannot modify the pattern
 533       buffer.  This function never fails in this implementation.  */
 534    (void) re_compile_fastmap (preg);
 535  else
 536    {
 537      /* Some error occurred while compiling the expression.  */
 538      re_free (preg->fastmap);
 539      preg->fastmap = NULL;
 540    }
 541
 542  return (int) ret;
 543}
 544#ifdef _LIBC
 545weak_alias (__regcomp, regcomp)
 546#endif
 547
 548/* Returns a message corresponding to an error code, ERRCODE, returned
 549   from either regcomp or regexec.   We don't use PREG here.  */
 550
 551size_t
 552regerror(int errcode, const regex_t *__restrict preg,
 553         char *__restrict errbuf, size_t errbuf_size)
 554{
 555  const char *msg;
 556  size_t msg_size;
 557
 558  if (BE (errcode < 0
 559          || errcode >= (int) (sizeof (__re_error_msgid_idx)
 560                               / sizeof (__re_error_msgid_idx[0])), 0))
 561    /* Only error codes returned by the rest of the code should be passed
 562       to this routine.  If we are given anything else, or if other regex
 563       code generates an invalid error code, then the program has a bug.
 564       Dump core so we can fix it.  */
 565    abort ();
 566
 567  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 568
 569  msg_size = strlen (msg) + 1; /* Includes the null.  */
 570
 571  if (BE (errbuf_size != 0, 1))
 572    {
 573      if (BE (msg_size > errbuf_size, 0))
 574        {
 575          memcpy (errbuf, msg, errbuf_size - 1);
 576          errbuf[errbuf_size - 1] = 0;
 577        }
 578      else
 579        memcpy (errbuf, msg, msg_size);
 580    }
 581
 582  return msg_size;
 583}
 584#ifdef _LIBC
 585weak_alias (__regerror, regerror)
 586#endif
 587
 588
 589#ifdef RE_ENABLE_I18N
 590/* This static array is used for the map to single-byte characters when
 591   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 592   it the same all the time.  UTF-8 is the preferred encoding so this is
 593   a worthwhile optimization.  */
 594#if __GNUC__ >= 3
 595static const bitset_t utf8_sb_map = {
 596  /* Set the first 128 bits.  */
 597  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 598};
 599#else /* ! (__GNUC__ >= 3) */
 600static bitset_t utf8_sb_map;
 601#endif /* __GNUC__ >= 3 */
 602#endif /* RE_ENABLE_I18N */
 603
 604
 605static void
 606free_dfa_content (re_dfa_t *dfa)
 607{
 608  int i, j;
 609
 610  if (dfa->nodes)
 611    for (i = 0; i < dfa->nodes_len; ++i)
 612      free_token (dfa->nodes + i);
 613  re_free (dfa->nexts);
 614  for (i = 0; i < dfa->nodes_len; ++i)
 615    {
 616      if (dfa->eclosures != NULL)
 617        re_node_set_free (dfa->eclosures + i);
 618      if (dfa->inveclosures != NULL)
 619        re_node_set_free (dfa->inveclosures + i);
 620      if (dfa->edests != NULL)
 621        re_node_set_free (dfa->edests + i);
 622    }
 623  re_free (dfa->edests);
 624  re_free (dfa->eclosures);
 625  re_free (dfa->inveclosures);
 626  re_free (dfa->nodes);
 627
 628  if (dfa->state_table)
 629    for (i = 0; i <= dfa->state_hash_mask; ++i)
 630      {
 631        struct re_state_table_entry *entry = dfa->state_table + i;
 632        for (j = 0; j < entry->num; ++j)
 633          {
 634            re_dfastate_t *state = entry->array[j];
 635            free_state (state);
 636          }
 637        re_free (entry->array);
 638      }
 639  re_free (dfa->state_table);
 640#ifdef RE_ENABLE_I18N
 641  if (dfa->sb_char != utf8_sb_map)
 642    re_free (dfa->sb_char);
 643#endif
 644  re_free (dfa->subexp_map);
 645#ifdef DEBUG
 646  re_free (dfa->re_str);
 647#endif
 648
 649  re_free (dfa);
 650}
 651
 652
 653/* Free dynamically allocated space used by PREG.  */
 654
 655void
 656regfree (regex_t *preg)
 657{
 658  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 659  if (BE (dfa != NULL, 1))
 660    free_dfa_content (dfa);
 661  preg->buffer = NULL;
 662  preg->allocated = 0;
 663
 664  re_free (preg->fastmap);
 665  preg->fastmap = NULL;
 666
 667  re_free (preg->translate);
 668  preg->translate = NULL;
 669}
 670#ifdef _LIBC
 671weak_alias (__regfree, regfree)
 672#endif
 673\f
 674/* Entry points compatible with 4.2 BSD regex library.  We don't define
 675   them unless specifically requested.  */
 676
 677#if defined _REGEX_RE_COMP || defined _LIBC
 678
 679/* BSD has one and only one pattern buffer.  */
 680static struct re_pattern_buffer re_comp_buf;
 681
 682char *
 683# ifdef _LIBC
 684/* Make these definitions weak in libc, so POSIX programs can redefine
 685   these names if they don't use our functions, and still use
 686   regcomp/regexec above without link errors.  */
 687weak_function
 688# endif
 689re_comp (s)
 690     const char *s;
 691{
 692  reg_errcode_t ret;
 693  char *fastmap;
 694
 695  if (!s)
 696    {
 697      if (!re_comp_buf.buffer)
 698        return gettext ("No previous regular expression");
 699      return 0;
 700    }
 701
 702  if (re_comp_buf.buffer)
 703    {
 704      fastmap = re_comp_buf.fastmap;
 705      re_comp_buf.fastmap = NULL;
 706      __regfree (&re_comp_buf);
 707      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 708      re_comp_buf.fastmap = fastmap;
 709    }
 710
 711  if (re_comp_buf.fastmap == NULL)
 712    {
 713      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 714      if (re_comp_buf.fastmap == NULL)
 715        return (char *) gettext (__re_error_msgid
 716                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
 717    }
 718
 719  /* Since `re_exec' always passes NULL for the `regs' argument, we
 720     don't need to initialize the pattern buffer fields which affect it.  */
 721
 722  /* Match anchors at newlines.  */
 723  re_comp_buf.newline_anchor = 1;
 724
 725  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 726
 727  if (!ret)
 728    return NULL;
 729
 730  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 731  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 732}
 733
 734#ifdef _LIBC
 735libc_freeres_fn (free_mem)
 736{
 737  __regfree (&re_comp_buf);
 738}
 739#endif
 740
 741#endif /* _REGEX_RE_COMP */
 742\f
 743/* Internal entry point.
 744   Compile the regular expression PATTERN, whose length is LENGTH.
 745   SYNTAX indicate regular expression's syntax.  */
 746
 747static reg_errcode_t
 748re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 749                     reg_syntax_t syntax)
 750{
 751  reg_errcode_t err = REG_NOERROR;
 752  re_dfa_t *dfa;
 753  re_string_t regexp;
 754
 755  /* Initialize the pattern buffer.  */
 756  preg->fastmap_accurate = 0;
 757  preg->syntax = syntax;
 758  preg->not_bol = preg->not_eol = 0;
 759  preg->used = 0;
 760  preg->re_nsub = 0;
 761  preg->can_be_null = 0;
 762  preg->regs_allocated = REGS_UNALLOCATED;
 763
 764  /* Initialize the dfa.  */
 765  dfa = (re_dfa_t *) preg->buffer;
 766  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 767    {
 768      /* If zero allocated, but buffer is non-null, try to realloc
 769         enough space.  This loses if buffer's address is bogus, but
 770         that is the user's responsibility.  If ->buffer is NULL this
 771         is a simple allocation.  */
 772      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 773      if (dfa == NULL)
 774        return REG_ESPACE;
 775      preg->allocated = sizeof (re_dfa_t);
 776      preg->buffer = (unsigned char *) dfa;
 777    }
 778  preg->used = sizeof (re_dfa_t);
 779
 780  err = init_dfa (dfa, length);
 781  if (BE (err != REG_NOERROR, 0))
 782    {
 783      free_dfa_content (dfa);
 784      preg->buffer = NULL;
 785      preg->allocated = 0;
 786      return err;
 787    }
 788#ifdef DEBUG
 789  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 790  dfa->re_str = re_malloc (char, length + 1);
 791  strncpy (dfa->re_str, pattern, length + 1);
 792#endif
 793
 794  __libc_lock_init (dfa->lock);
 795
 796  err = re_string_construct (&regexp, pattern, length, preg->translate,
 797                             syntax & RE_ICASE, dfa);
 798  if (BE (err != REG_NOERROR, 0))
 799    {
 800    re_compile_internal_free_return:
 801      free_workarea_compile (preg);
 802      re_string_destruct (&regexp);
 803      free_dfa_content (dfa);
 804      preg->buffer = NULL;
 805      preg->allocated = 0;
 806      return err;
 807    }
 808
 809  /* Parse the regular expression, and build a structure tree.  */
 810  preg->re_nsub = 0;
 811  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 812  if (BE (dfa->str_tree == NULL, 0))
 813    goto re_compile_internal_free_return;
 814
 815  /* Analyze the tree and create the nfa.  */
 816  err = analyze (preg);
 817  if (BE (err != REG_NOERROR, 0))
 818    goto re_compile_internal_free_return;
 819
 820#ifdef RE_ENABLE_I18N
 821  /* If possible, do searching in single byte encoding to speed things up.  */
 822  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 823    optimize_utf8 (dfa);
 824#endif
 825
 826  /* Then create the initial state of the dfa.  */
 827  err = create_initial_state (dfa);
 828
 829  /* Release work areas.  */
 830  free_workarea_compile (preg);
 831  re_string_destruct (&regexp);
 832
 833  if (BE (err != REG_NOERROR, 0))
 834    {
 835      free_dfa_content (dfa);
 836      preg->buffer = NULL;
 837      preg->allocated = 0;
 838    }
 839
 840  return err;
 841}
 842
 843/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 844   as the initial length of some arrays.  */
 845
 846static reg_errcode_t
 847init_dfa (re_dfa_t *dfa, size_t pat_len)
 848{
 849  unsigned int table_size;
 850#ifndef _LIBC
 851  char *codeset_name;
 852#endif
 853
 854  memset (dfa, '\0', sizeof (re_dfa_t));
 855
 856  /* Force allocation of str_tree_storage the first time.  */
 857  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 858
 859  /* Avoid overflows.  */
 860  if (pat_len == SIZE_MAX)
 861    return REG_ESPACE;
 862
 863  dfa->nodes_alloc = pat_len + 1;
 864  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 865
 866  /*  table_size = 2 ^ ceil(log pat_len) */
 867  for (table_size = 1; ; table_size <<= 1)
 868    if (table_size > pat_len)
 869      break;
 870
 871  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 872  dfa->state_hash_mask = table_size - 1;
 873
 874  dfa->mb_cur_max = MB_CUR_MAX;
 875#ifdef _LIBC
 876  if (dfa->mb_cur_max == 6
 877      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 878    dfa->is_utf8 = 1;
 879  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 880                       != 0);
 881#else
 882# ifdef HAVE_LANGINFO_CODESET
 883  codeset_name = nl_langinfo (CODESET);
 884# else
 885  codeset_name = getenv ("LC_ALL");
 886  if (codeset_name == NULL || codeset_name[0] == '\0')
 887    codeset_name = getenv ("LC_CTYPE");
 888  if (codeset_name == NULL || codeset_name[0] == '\0')
 889    codeset_name = getenv ("LANG");
 890  if (codeset_name == NULL)
 891    codeset_name = "";
 892  else if (strchr (codeset_name, '.') !=  NULL)
 893    codeset_name = strchr (codeset_name, '.') + 1;
 894# endif
 895
 896  /* strcasecmp isn't a standard interface. brute force check */
 897#if 0
 898  if (strcasecmp (codeset_name, "UTF-8") == 0
 899      || strcasecmp (codeset_name, "UTF8") == 0)
 900    dfa->is_utf8 = 1;
 901#else
 902  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 903      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 904      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 905      && (codeset_name[3] == '-'
 906          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 907          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 908    dfa->is_utf8 = 1;
 909#endif
 910
 911  /* We check exhaustively in the loop below if this charset is a
 912     superset of ASCII.  */
 913  dfa->map_notascii = 0;
 914#endif
 915
 916#ifdef RE_ENABLE_I18N
 917  if (dfa->mb_cur_max > 1)
 918    {
 919      if (dfa->is_utf8)
 920        {
 921#if !defined(__GNUC__) || __GNUC__ < 3
 922          static short utf8_sb_map_inited = 0;
 923
 924          if (! utf8_sb_map_inited)
 925            {
 926                int i;
 927
 928                utf8_sb_map_inited = 0;
 929                for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 930                  utf8_sb_map[i] = BITSET_WORD_MAX;
 931            }
 932#endif
 933          dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 934        }
 935      else
 936        {
 937          int i, j, ch;
 938
 939          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 940          if (BE (dfa->sb_char == NULL, 0))
 941            return REG_ESPACE;
 942
 943          /* Set the bits corresponding to single byte chars.  */
 944          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 945            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 946              {
 947                wint_t wch = __btowc (ch);
 948                if (wch != WEOF)
 949                  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 950# ifndef _LIBC
 951                if (isascii (ch) && wch != ch)
 952                  dfa->map_notascii = 1;
 953# endif
 954              }
 955        }
 956    }
 957#endif
 958
 959  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 960    return REG_ESPACE;
 961  return REG_NOERROR;
 962}
 963
 964/* Initialize WORD_CHAR table, which indicate which character is
 965   "word".  In this case "word" means that it is the word construction
 966   character used by some operators like "\<", "\>", etc.  */
 967
 968static void
 969internal_function
 970init_word_char (re_dfa_t *dfa)
 971{
 972  int i, j, ch;
 973  dfa->word_ops_used = 1;
 974  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 975    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 976      if (isalnum (ch) || ch == '_')
 977        dfa->word_char[i] |= (bitset_word_t) 1 << j;
 978}
 979
 980/* Free the work area which are only used while compiling.  */
 981
 982static void
 983free_workarea_compile (regex_t *preg)
 984{
 985  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 986  bin_tree_storage_t *storage, *next;
 987  for (storage = dfa->str_tree_storage; storage; storage = next)
 988    {
 989      next = storage->next;
 990      re_free (storage);
 991    }
 992  dfa->str_tree_storage = NULL;
 993  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 994  dfa->str_tree = NULL;
 995  re_free (dfa->org_indices);
 996  dfa->org_indices = NULL;
 997}
 998
 999/* Create initial states for all contexts.  */
1000
1001static reg_errcode_t
1002create_initial_state (re_dfa_t *dfa)
1003{
1004  int first, i;
1005  reg_errcode_t err;
1006  re_node_set init_nodes;
1007
1008  /* Initial states have the epsilon closure of the node which is
1009     the first node of the regular expression.  */
1010  first = dfa->str_tree->first->node_idx;
1011  dfa->init_node = first;
1012  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013  if (BE (err != REG_NOERROR, 0))
1014    return err;
1015
1016  /* The back-references which are in initial states can epsilon transit,
1017     since in this case all of the subexpressions can be null.
1018     Then we add epsilon closures of the nodes which are the next nodes of
1019     the back-references.  */
1020  if (dfa->nbackref > 0)
1021    for (i = 0; i < init_nodes.nelem; ++i)
1022      {
1023        int node_idx = init_nodes.elems[i];
1024        re_token_type_t type = dfa->nodes[node_idx].type;
1025
1026        int clexp_idx;
1027        if (type != OP_BACK_REF)
1028          continue;
1029        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1030          {
1031            re_token_t *clexp_node;
1032            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033            if (clexp_node->type == OP_CLOSE_SUBEXP
1034                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035              break;
1036          }
1037        if (clexp_idx == init_nodes.nelem)
1038          continue;
1039
1040        if (type == OP_BACK_REF)
1041          {
1042            int dest_idx = dfa->edests[node_idx].elems[0];
1043            if (!re_node_set_contains (&init_nodes, dest_idx))
1044              {
1045                reg_errcode_t err = re_node_set_merge (&init_nodes,
1046                                                       dfa->eclosures
1047                                                       + dest_idx);
1048                if (err != REG_NOERROR)
1049                  return err;
1050                i = 0;
1051              }
1052          }
1053      }
1054
1055  /* It must be the first time to invoke acquire_state.  */
1056  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1057  /* We don't check ERR here, since the initial state must not be NULL.  */
1058  if (BE (dfa->init_state == NULL, 0))
1059    return err;
1060  if (dfa->init_state->has_constraint)
1061    {
1062      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1063                                                       CONTEXT_WORD);
1064      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1065                                                     CONTEXT_NEWLINE);
1066      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1067                                                         &init_nodes,
1068                                                         CONTEXT_NEWLINE
1069                                                         | CONTEXT_BEGBUF);
1070      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1071              || dfa->init_state_begbuf == NULL, 0))
1072        return err;
1073    }
1074  else
1075    dfa->init_state_word = dfa->init_state_nl
1076      = dfa->init_state_begbuf = dfa->init_state;
1077
1078  re_node_set_free (&init_nodes);
1079  return REG_NOERROR;
1080}
1081\f
1082#ifdef RE_ENABLE_I18N
1083/* If it is possible to do searching in single byte encoding instead of UTF-8
1084   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085   DFA nodes where needed.  */
1086
1087static void
1088optimize_utf8 (re_dfa_t *dfa)
1089{
1090  int node, i, mb_chars = 0, has_period = 0;
1091
1092  for (node = 0; node < dfa->nodes_len; ++node)
1093    switch (dfa->nodes[node].type)
1094      {
1095      case CHARACTER:
1096        if (dfa->nodes[node].opr.c >= 0x80)
1097          mb_chars = 1;
1098        break;
1099      case ANCHOR:
1100        switch (dfa->nodes[node].opr.ctx_type)
1101          {
1102          case LINE_FIRST:
1103          case LINE_LAST:
1104          case BUF_FIRST:
1105          case BUF_LAST:
1106            break;
1107          default:
1108            /* Word anchors etc. cannot be handled.  It's okay to test
1109               opr.ctx_type since constraints (for all DFA nodes) are
1110               created by ORing one or more opr.ctx_type values.  */
1111            return;
1112          }
1113        break;
1114      case OP_PERIOD:
1115        has_period = 1;
1116        break;
1117      case OP_BACK_REF:
1118      case OP_ALT:
1119      case END_OF_RE:
1120      case OP_DUP_ASTERISK:
1121      case OP_OPEN_SUBEXP:
1122      case OP_CLOSE_SUBEXP:
1123        break;
1124      case COMPLEX_BRACKET:
1125        return;
1126      case SIMPLE_BRACKET:
1127        /* Just double check.  The non-ASCII range starts at 0x80.  */
1128        assert (0x80 % BITSET_WORD_BITS == 0);
1129        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1130          if (dfa->nodes[node].opr.sbcset[i])
1131            return;
1132        break;
1133      default:
1134        abort ();
1135      }
1136
1137  if (mb_chars || has_period)
1138    for (node = 0; node < dfa->nodes_len; ++node)
1139      {
1140        if (dfa->nodes[node].type == CHARACTER
1141            && dfa->nodes[node].opr.c >= 0x80)
1142          dfa->nodes[node].mb_partial = 0;
1143        else if (dfa->nodes[node].type == OP_PERIOD)
1144          dfa->nodes[node].type = OP_UTF8_PERIOD;
1145      }
1146
1147  /* The search can be in single byte locale.  */
1148  dfa->mb_cur_max = 1;
1149  dfa->is_utf8 = 0;
1150  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1151}
1152#endif
1153\f
1154/* Analyze the structure tree, and calculate "first", "next", "edest",
1155   "eclosure", and "inveclosure".  */
1156
1157static reg_errcode_t
1158analyze (regex_t *preg)
1159{
1160  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1161  reg_errcode_t ret;
1162
1163  /* Allocate arrays.  */
1164  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1165  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1166  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1167  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1168  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1169          || dfa->eclosures == NULL, 0))
1170    return REG_ESPACE;
1171
1172  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1173  if (dfa->subexp_map != NULL)
1174    {
1175      int i;
1176      for (i = 0; i < preg->re_nsub; i++)
1177        dfa->subexp_map[i] = i;
1178      preorder (dfa->str_tree, optimize_subexps, dfa);
1179      for (i = 0; i < preg->re_nsub; i++)
1180        if (dfa->subexp_map[i] != i)
1181          break;
1182      if (i == preg->re_nsub)
1183        {
1184          free (dfa->subexp_map);
1185          dfa->subexp_map = NULL;
1186        }
1187    }
1188
1189  ret = postorder (dfa->str_tree, lower_subexps, preg);
1190  if (BE (ret != REG_NOERROR, 0))
1191    return ret;
1192  ret = postorder (dfa->str_tree, calc_first, dfa);
1193  if (BE (ret != REG_NOERROR, 0))
1194    return ret;
1195  preorder (dfa->str_tree, calc_next, dfa);
1196  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1197  if (BE (ret != REG_NOERROR, 0))
1198    return ret;
1199  ret = calc_eclosure (dfa);
1200  if (BE (ret != REG_NOERROR, 0))
1201    return ret;
1202
1203  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1204     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1205  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1206      || dfa->nbackref)
1207    {
1208      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1209      if (BE (dfa->inveclosures == NULL, 0))
1210        return REG_ESPACE;
1211      ret = calc_inveclosure (dfa);
1212    }
1213
1214  return ret;
1215}
1216
1217/* Our parse trees are very unbalanced, so we cannot use a stack to
1218   implement parse tree visits.  Instead, we use parent pointers and
1219   some hairy code in these two functions.  */
1220static reg_errcode_t
1221postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1222           void *extra)
1223{
1224  bin_tree_t *node, *prev;
1225
1226  for (node = root; ; )
1227    {
1228      /* Descend down the tree, preferably to the left (or to the right
1229         if that's the only child).  */
1230      while (node->left || node->right)
1231        if (node->left)
1232          node = node->left;
1233        else
1234          node = node->right;
1235
1236      do
1237        {
1238          reg_errcode_t err = fn (extra, node);
1239          if (BE (err != REG_NOERROR, 0))
1240            return err;
1241          if (node->parent == NULL)
1242            return REG_NOERROR;
1243          prev = node;
1244          node = node->parent;
1245        }
1246      /* Go up while we have a node that is reached from the right.  */
1247      while (node->right == prev || node->right == NULL);
1248      node = node->right;
1249    }
1250}
1251
1252static reg_errcode_t
1253preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254          void *extra)
1255{
1256  bin_tree_t *node;
1257
1258  for (node = root; ; )
1259    {
1260      reg_errcode_t err = fn (extra, node);
1261      if (BE (err != REG_NOERROR, 0))
1262        return err;
1263
1264      /* Go to the left node, or up and to the right.  */
1265      if (node->left)
1266        node = node->left;
1267      else
1268        {
1269          bin_tree_t *prev = NULL;
1270          while (node->right == prev || node->right == NULL)
1271            {
1272              prev = node;
1273              node = node->parent;
1274              if (!node)
1275                return REG_NOERROR;
1276            }
1277          node = node->right;
1278        }
1279    }
1280}
1281
1282/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1283   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1284   backreferences as well.  Requires a preorder visit.  */
1285static reg_errcode_t
1286optimize_subexps (void *extra, bin_tree_t *node)
1287{
1288  re_dfa_t *dfa = (re_dfa_t *) extra;
1289
1290  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1291    {
1292      int idx = node->token.opr.idx;
1293      node->token.opr.idx = dfa->subexp_map[idx];
1294      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1295    }
1296
1297  else if (node->token.type == SUBEXP
1298           && node->left && node->left->token.type == SUBEXP)
1299    {
1300      int other_idx = node->left->token.opr.idx;
1301
1302      node->left = node->left->left;
1303      if (node->left)
1304        node->left->parent = node;
1305
1306      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1307      if (other_idx < BITSET_WORD_BITS)
1308          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1309    }
1310
1311  return REG_NOERROR;
1312}
1313
1314/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1315   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1316static reg_errcode_t
1317lower_subexps (void *extra, bin_tree_t *node)
1318{
1319  regex_t *preg = (regex_t *) extra;
1320  reg_errcode_t err = REG_NOERROR;
1321
1322  if (node->left && node->left->token.type == SUBEXP)
1323    {
1324      node->left = lower_subexp (&err, preg, node->left);
1325      if (node->left)
1326        node->left->parent = node;
1327    }
1328  if (node->right && node->right->token.type == SUBEXP)
1329    {
1330      node->right = lower_subexp (&err, preg, node->right);
1331      if (node->right)
1332        node->right->parent = node;
1333    }
1334
1335  return err;
1336}
1337
1338static bin_tree_t *
1339lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1340{
1341  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1342  bin_tree_t *body = node->left;
1343  bin_tree_t *op, *cls, *tree1, *tree;
1344
1345  if (preg->no_sub
1346      /* We do not optimize empty subexpressions, because otherwise we may
1347         have bad CONCAT nodes with NULL children.  This is obviously not
1348         very common, so we do not lose much.  An example that triggers
1349         this case is the sed "script" /\(\)/x.  */
1350      && node->left != NULL
1351      && (node->token.opr.idx >= BITSET_WORD_BITS
1352          || !(dfa->used_bkref_map
1353               & ((bitset_word_t) 1 << node->token.opr.idx))))
1354    return node->left;
1355
1356  /* Convert the SUBEXP node to the concatenation of an
1357     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1358  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1359  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1360  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1361  tree = create_tree (dfa, op, tree1, CONCAT);
1362  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1363    {
1364      *err = REG_ESPACE;
1365      return NULL;
1366    }
1367
1368  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1369  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1370  return tree;
1371}
1372
1373/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1374   nodes.  Requires a postorder visit.  */
1375static reg_errcode_t
1376calc_first (void *extra, bin_tree_t *node)
1377{
1378  re_dfa_t *dfa = (re_dfa_t *) extra;
1379  if (node->token.type == CONCAT)
1380    {
1381      node->first = node->left->first;
1382      node->node_idx = node->left->node_idx;
1383    }
1384  else
1385    {
1386      node->first = node;
1387      node->node_idx = re_dfa_add_node (dfa, node->token);
1388      if (BE (node->node_idx == -1, 0))
1389        return REG_ESPACE;
1390      if (node->token.type == ANCHOR)
1391        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1392    }
1393  return REG_NOERROR;
1394}
1395
1396/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1397static reg_errcode_t
1398calc_next (void *extra, bin_tree_t *node)
1399{
1400  switch (node->token.type)
1401    {
1402    case OP_DUP_ASTERISK:
1403      node->left->next = node;
1404      break;
1405    case CONCAT:
1406      node->left->next = node->right->first;
1407      node->right->next = node->next;
1408      break;
1409    default:
1410      if (node->left)
1411        node->left->next = node->next;
1412      if (node->right)
1413        node->right->next = node->next;
1414      break;
1415    }
1416  return REG_NOERROR;
1417}
1418
1419/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1420static reg_errcode_t
1421link_nfa_nodes (void *extra, bin_tree_t *node)
1422{
1423  re_dfa_t *dfa = (re_dfa_t *) extra;
1424  int idx = node->node_idx;
1425  reg_errcode_t err = REG_NOERROR;
1426
1427  switch (node->token.type)
1428    {
1429    case CONCAT:
1430      break;
1431
1432    case END_OF_RE:
1433      assert (node->next == NULL);
1434      break;
1435
1436    case OP_DUP_ASTERISK:
1437    case OP_ALT:
1438      {
1439        int left, right;
1440        dfa->has_plural_match = 1;
1441        if (node->left != NULL)
1442          left = node->left->first->node_idx;
1443        else
1444          left = node->next->node_idx;
1445        if (node->right != NULL)
1446          right = node->right->first->node_idx;
1447        else
1448          right = node->next->node_idx;
1449        assert (left > -1);
1450        assert (right > -1);
1451        err = re_node_set_init_2 (dfa->edests + idx, left, right);
1452      }
1453      break;
1454
1455    case ANCHOR:
1456    case OP_OPEN_SUBEXP:
1457    case OP_CLOSE_SUBEXP:
1458      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1459      break;
1460
1461    case OP_BACK_REF:
1462      dfa->nexts[idx] = node->next->node_idx;
1463      if (node->token.type == OP_BACK_REF)
1464        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1465      break;
1466
1467    default:
1468      assert (!IS_EPSILON_NODE (node->token.type));
1469      dfa->nexts[idx] = node->next->node_idx;
1470      break;
1471    }
1472
1473  return err;
1474}
1475
1476/* Duplicate the epsilon closure of the node ROOT_NODE.
1477   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1478   to their own constraint.  */
1479
1480static reg_errcode_t
1481internal_function
1482duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1483                        int root_node, unsigned int init_constraint)
1484{
1485  int org_node, clone_node, ret;
1486  unsigned int constraint = init_constraint;
1487  for (org_node = top_org_node, clone_node = top_clone_node;;)
1488    {
1489      int org_dest, clone_dest;
1490      if (dfa->nodes[org_node].type == OP_BACK_REF)
1491        {
1492          /* If the back reference epsilon-transit, its destination must
1493             also have the constraint.  Then duplicate the epsilon closure
1494             of the destination of the back reference, and store it in
1495             edests of the back reference.  */
1496          org_dest = dfa->nexts[org_node];
1497          re_node_set_empty (dfa->edests + clone_node);
1498          clone_dest = duplicate_node (dfa, org_dest, constraint);
1499          if (BE (clone_dest == -1, 0))
1500            return REG_ESPACE;
1501          dfa->nexts[clone_node] = dfa->nexts[org_node];
1502          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503          if (BE (ret < 0, 0))
1504            return REG_ESPACE;
1505        }
1506      else if (dfa->edests[org_node].nelem == 0)
1507        {
1508          /* In case of the node can't epsilon-transit, don't duplicate the
1509             destination and store the original destination as the
1510             destination of the node.  */
1511          dfa->nexts[clone_node] = dfa->nexts[org_node];
1512          break;
1513        }
1514      else if (dfa->edests[org_node].nelem == 1)
1515        {
1516          /* In case of the node can epsilon-transit, and it has only one
1517             destination.  */
1518          org_dest = dfa->edests[org_node].elems[0];
1519          re_node_set_empty (dfa->edests + clone_node);
1520          /* If the node is root_node itself, it means the epsilon clsoure
1521             has a loop.   Then tie it to the destination of the root_node.  */
1522          if (org_node == root_node && clone_node != org_node)
1523            {
1524              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1525              if (BE (ret < 0, 0))
1526                return REG_ESPACE;
1527              break;
1528            }
1529          /* In case of the node has another constraint, add it.  */
1530          constraint |= dfa->nodes[org_node].constraint;
1531          clone_dest = duplicate_node (dfa, org_dest, constraint);
1532          if (BE (clone_dest == -1, 0))
1533            return REG_ESPACE;
1534          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535          if (BE (ret < 0, 0))
1536            return REG_ESPACE;
1537        }
1538      else /* dfa->edests[org_node].nelem == 2 */
1539        {
1540          /* In case of the node can epsilon-transit, and it has two
1541             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1542          org_dest = dfa->edests[org_node].elems[0];
1543          re_node_set_empty (dfa->edests + clone_node);
1544          /* Search for a duplicated node which satisfies the constraint.  */
1545          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546          if (clone_dest == -1)
1547            {
1548              /* There is no such duplicated node, create a new one.  */
1549              reg_errcode_t err;
1550              clone_dest = duplicate_node (dfa, org_dest, constraint);
1551              if (BE (clone_dest == -1, 0))
1552                return REG_ESPACE;
1553              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554              if (BE (ret < 0, 0))
1555                return REG_ESPACE;
1556              err = duplicate_node_closure (dfa, org_dest, clone_dest,
1557                                            root_node, constraint);
1558              if (BE (err != REG_NOERROR, 0))
1559                return err;
1560            }
1561          else
1562            {
1563              /* There is a duplicated node which satisfies the constraint,
1564                 use it to avoid infinite loop.  */
1565              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566              if (BE (ret < 0, 0))
1567                return REG_ESPACE;
1568            }
1569
1570          org_dest = dfa->edests[org_node].elems[1];
1571          clone_dest = duplicate_node (dfa, org_dest, constraint);
1572          if (BE (clone_dest == -1, 0))
1573            return REG_ESPACE;
1574          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575          if (BE (ret < 0, 0))
1576            return REG_ESPACE;
1577        }
1578      org_node = org_dest;
1579      clone_node = clone_dest;
1580    }
1581  return REG_NOERROR;
1582}
1583
1584/* Search for a node which is duplicated from the node ORG_NODE, and
1585   satisfies the constraint CONSTRAINT.  */
1586
1587static int
1588search_duplicated_node (const re_dfa_t *dfa, int org_node,
1589                        unsigned int constraint)
1590{
1591  int idx;
1592  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1593    {
1594      if (org_node == dfa->org_indices[idx]
1595          && constraint == dfa->nodes[idx].constraint)
1596        return idx; /* Found.  */
1597    }
1598  return -1; /* Not found.  */
1599}
1600
1601/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602   Return the index of the new node, or -1 if insufficient storage is
1603   available.  */
1604
1605static int
1606duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1607{
1608  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1609  if (BE (dup_idx != -1, 1))
1610    {
1611      dfa->nodes[dup_idx].constraint = constraint;
1612      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1613      dfa->nodes[dup_idx].duplicated = 1;
1614
1615      /* Store the index of the original node.  */
1616      dfa->org_indices[dup_idx] = org_idx;
1617    }
1618  return dup_idx;
1619}
1620
1621static reg_errcode_t
1622calc_inveclosure (re_dfa_t *dfa)
1623{
1624  int src, idx, ret;
1625  for (idx = 0; idx < dfa->nodes_len; ++idx)
1626    re_node_set_init_empty (dfa->inveclosures + idx);
1627
1628  for (src = 0; src < dfa->nodes_len; ++src)
1629    {
1630      int *elems = dfa->eclosures[src].elems;
1631      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1632        {
1633          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634          if (BE (ret == -1, 0))
1635            return REG_ESPACE;
1636        }
1637    }
1638
1639  return REG_NOERROR;
1640}
1641
1642/* Calculate "eclosure" for all the node in DFA.  */
1643
1644static reg_errcode_t
1645calc_eclosure (re_dfa_t *dfa)
1646{
1647  int node_idx, incomplete;
1648#ifdef DEBUG
1649  assert (dfa->nodes_len > 0);
1650#endif
1651  incomplete = 0;
1652  /* For each nodes, calculate epsilon closure.  */
1653  for (node_idx = 0; ; ++node_idx)
1654    {
1655      reg_errcode_t err;
1656      re_node_set eclosure_elem;
1657      if (node_idx == dfa->nodes_len)
1658        {
1659          if (!incomplete)
1660            break;
1661          incomplete = 0;
1662          node_idx = 0;
1663        }
1664
1665#ifdef DEBUG
1666      assert (dfa->eclosures[node_idx].nelem != -1);
1667#endif
1668
1669      /* If we have already calculated, skip it.  */
1670      if (dfa->eclosures[node_idx].nelem != 0)
1671        continue;
1672      /* Calculate epsilon closure of `node_idx'.  */
1673      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1674      if (BE (err != REG_NOERROR, 0))
1675        return err;
1676
1677      if (dfa->eclosures[node_idx].nelem == 0)
1678        {
1679          incomplete = 1;
1680          re_node_set_free (&eclosure_elem);
1681        }
1682    }
1683  return REG_NOERROR;
1684}
1685
1686/* Calculate epsilon closure of NODE.  */
1687
1688static reg_errcode_t
1689calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1690{
1691  reg_errcode_t err;
1692  int i;
1693  re_node_set eclosure;
1694  int ret;
1695  int incomplete = 0;
1696  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1697  if (BE (err != REG_NOERROR, 0))
1698    return err;
1699
1700  /* This indicates that we are calculating this node now.
1701     We reference this value to avoid infinite loop.  */
1702  dfa->eclosures[node].nelem = -1;
1703
1704  /* If the current node has constraints, duplicate all nodes
1705     since they must inherit the constraints.  */
1706  if (dfa->nodes[node].constraint
1707      && dfa->edests[node].nelem
1708      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1709    {
1710      err = duplicate_node_closure (dfa, node, node, node,
1711                                    dfa->nodes[node].constraint);
1712      if (BE (err != REG_NOERROR, 0))
1713        return err;
1714    }
1715
1716  /* Expand each epsilon destination nodes.  */
1717  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1718    for (i = 0; i < dfa->edests[node].nelem; ++i)
1719      {
1720        re_node_set eclosure_elem;
1721        int edest = dfa->edests[node].elems[i];
1722        /* If calculating the epsilon closure of `edest' is in progress,
1723           return intermediate result.  */
1724        if (dfa->eclosures[edest].nelem == -1)
1725          {
1726            incomplete = 1;
1727            continue;
1728          }
1729        /* If we haven't calculated the epsilon closure of `edest' yet,
1730           calculate now. Otherwise use calculated epsilon closure.  */
1731        if (dfa->eclosures[edest].nelem == 0)
1732          {
1733            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1734            if (BE (err != REG_NOERROR, 0))
1735              return err;
1736          }
1737        else
1738          eclosure_elem = dfa->eclosures[edest];
1739        /* Merge the epsilon closure of `edest'.  */
1740        err = re_node_set_merge (&eclosure, &eclosure_elem);
1741        if (BE (err != REG_NOERROR, 0))
1742          return err;
1743        /* If the epsilon closure of `edest' is incomplete,
1744           the epsilon closure of this node is also incomplete.  */
1745        if (dfa->eclosures[edest].nelem == 0)
1746          {
1747            incomplete = 1;
1748            re_node_set_free (&eclosure_elem);
1749          }
1750      }
1751
1752  /* An epsilon closure includes itself.  */
1753  ret = re_node_set_insert (&eclosure, node);
1754  if (BE (ret < 0, 0))
1755    return REG_ESPACE;
1756  if (incomplete && !root)
1757    dfa->eclosures[node].nelem = 0;
1758  else
1759    dfa->eclosures[node] = eclosure;
1760  *new_set = eclosure;
1761  return REG_NOERROR;
1762}
1763\f
1764/* Functions for token which are used in the parser.  */
1765
1766/* Fetch a token from INPUT.
1767   We must not use this function inside bracket expressions.  */
1768
1769static void
1770internal_function
1771fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1772{
1773  re_string_skip_bytes (input, peek_token (result, input, syntax));
1774}
1775
1776/* Peek a token from INPUT, and return the length of the token.
1777   We must not use this function inside bracket expressions.  */
1778
1779static int
1780internal_function
1781peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1782{
1783  unsigned char c;
1784
1785  if (re_string_eoi (input))
1786    {
1787      token->type = END_OF_RE;
1788      return 0;
1789    }
1790
1791  c = re_string_peek_byte (input, 0);
1792  token->opr.c = c;
1793
1794  token->word_char = 0;
1795#ifdef RE_ENABLE_I18N
1796  token->mb_partial = 0;
1797  if (input->mb_cur_max > 1 &&
1798      !re_string_first_byte (input, re_string_cur_idx (input)))
1799    {
1800      token->type = CHARACTER;
1801      token->mb_partial = 1;
1802      return 1;
1803    }
1804#endif
1805  if (c == '\\')
1806    {
1807      unsigned char c2;
1808      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1809        {
1810          token->type = BACK_SLASH;
1811          return 1;
1812        }
1813
1814      c2 = re_string_peek_byte_case (input, 1);
1815      token->opr.c = c2;
1816      token->type = CHARACTER;
1817#ifdef RE_ENABLE_I18N
1818      if (input->mb_cur_max > 1)
1819        {
1820          wint_t wc = re_string_wchar_at (input,
1821                                          re_string_cur_idx (input) + 1);
1822          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1823        }
1824      else
1825#endif
1826        token->word_char = IS_WORD_CHAR (c2) != 0;
1827
1828      switch (c2)
1829        {
1830        case '|':
1831          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832            token->type = OP_ALT;
1833          break;
1834        case '1': case '2': case '3': case '4': case '5':
1835        case '6': case '7': case '8': case '9':
1836          if (!(syntax & RE_NO_BK_REFS))
1837            {
1838              token->type = OP_BACK_REF;
1839              token->opr.idx = c2 - '1';
1840            }
1841          break;
1842        case '<':
1843          if (!(syntax & RE_NO_GNU_OPS))
1844            {
1845              token->type = ANCHOR;
1846              token->opr.ctx_type = WORD_FIRST;
1847            }
1848          break;
1849        case '>':
1850          if (!(syntax & RE_NO_GNU_OPS))
1851            {
1852              token->type = ANCHOR;
1853              token->opr.ctx_type = WORD_LAST;
1854            }
1855          break;
1856        case 'b':
1857          if (!(syntax & RE_NO_GNU_OPS))
1858            {
1859              token->type = ANCHOR;
1860              token->opr.ctx_type = WORD_DELIM;
1861            }
1862          break;
1863        case 'B':
1864          if (!(syntax & RE_NO_GNU_OPS))
1865            {
1866              token->type = ANCHOR;
1867              token->opr.ctx_type = NOT_WORD_DELIM;
1868            }
1869          break;
1870        case 'w':
1871          if (!(syntax & RE_NO_GNU_OPS))
1872            token->type = OP_WORD;
1873          break;
1874        case 'W':
1875          if (!(syntax & RE_NO_GNU_OPS))
1876            token->type = OP_NOTWORD;
1877          break;
1878        case 's':
1879          if (!(syntax & RE_NO_GNU_OPS))
1880            token->type = OP_SPACE;
1881          break;
1882        case 'S':
1883          if (!(syntax & RE_NO_GNU_OPS))
1884            token->type = OP_NOTSPACE;
1885          break;
1886        case '`':
1887          if (!(syntax & RE_NO_GNU_OPS))
1888            {
1889              token->type = ANCHOR;
1890              token->opr.ctx_type = BUF_FIRST;
1891            }
1892          break;
1893        case '\'':
1894          if (!(syntax & RE_NO_GNU_OPS))
1895            {
1896              token->type = ANCHOR;
1897              token->opr.ctx_type = BUF_LAST;
1898            }
1899          break;
1900        case '(':
1901          if (!(syntax & RE_NO_BK_PARENS))
1902            token->type = OP_OPEN_SUBEXP;
1903          break;
1904        case ')':
1905          if (!(syntax & RE_NO_BK_PARENS))
1906            token->type = OP_CLOSE_SUBEXP;
1907          break;
1908        case '+':
1909          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910            token->type = OP_DUP_PLUS;
1911          break;
1912        case '?':
1913          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914            token->type = OP_DUP_QUESTION;
1915          break;
1916        case '{':
1917          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918            token->type = OP_OPEN_DUP_NUM;
1919          break;
1920        case '}':
1921          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922            token->type = OP_CLOSE_DUP_NUM;
1923          break;
1924        default:
1925          break;
1926        }
1927      return 2;
1928    }
1929
1930  token->type = CHARACTER;
1931#ifdef RE_ENABLE_I18N
1932  if (input->mb_cur_max > 1)
1933    {
1934      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1936    }
1937  else
1938#endif
1939    token->word_char = IS_WORD_CHAR (token->opr.c);
1940
1941  switch (c)
1942    {
1943    case '\n':
1944      if (syntax & RE_NEWLINE_ALT)
1945        token->type = OP_ALT;
1946      break;
1947    case '|':
1948      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949        token->type = OP_ALT;
1950      break;
1951    case '*':
1952      token->type = OP_DUP_ASTERISK;
1953      break;
1954    case '+':
1955      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956        token->type = OP_DUP_PLUS;
1957      break;
1958    case '?':
1959      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960        token->type = OP_DUP_QUESTION;
1961      break;
1962    case '{':
1963      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964        token->type = OP_OPEN_DUP_NUM;
1965      break;
1966    case '}':
1967      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968        token->type = OP_CLOSE_DUP_NUM;
1969      break;
1970    case '(':
1971      if (syntax & RE_NO_BK_PARENS)
1972        token->type = OP_OPEN_SUBEXP;
1973      break;
1974    case ')':
1975      if (syntax & RE_NO_BK_PARENS)
1976        token->type = OP_CLOSE_SUBEXP;
1977      break;
1978    case '[':
1979      token->type = OP_OPEN_BRACKET;
1980      break;
1981    case '.':
1982      token->type = OP_PERIOD;
1983      break;
1984    case '^':
1985      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1986          re_string_cur_idx (input) != 0)
1987        {
1988          char prev = re_string_peek_byte (input, -1);
1989          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990            break;
1991        }
1992      token->type = ANCHOR;
1993      token->opr.ctx_type = LINE_FIRST;
1994      break;
1995    case '$':
1996      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1997          re_string_cur_idx (input) + 1 != re_string_length (input))
1998        {
1999          re_token_t next;
2000          re_string_skip_bytes (input, 1);
2001          peek_token (&next, input, syntax);
2002          re_string_skip_bytes (input, -1);
2003          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004            break;
2005        }
2006      token->type = ANCHOR;
2007      token->opr.ctx_type = LINE_LAST;
2008      break;
2009    default:
2010      break;
2011    }
2012  return 1;
2013}
2014
2015/* Peek a token from INPUT, and return the length of the token.
2016   We must not use this function out of bracket expressions.  */
2017
2018static int
2019internal_function
2020peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2021{
2022  unsigned char c;
2023  if (re_string_eoi (input))
2024    {
2025      token->type = END_OF_RE;
2026      return 0;
2027    }
2028  c = re_string_peek_byte (input, 0);
2029  token->opr.c = c;
2030
2031#ifdef RE_ENABLE_I18N
2032  if (input->mb_cur_max > 1 &&
2033      !re_string_first_byte (input, re_string_cur_idx (input)))
2034    {
2035      token->type = CHARACTER;
2036      return 1;
2037    }
2038#endif /* RE_ENABLE_I18N */
2039
2040  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041      && re_string_cur_idx (input) + 1 < re_string_length (input))
2042    {
2043      /* In this case, '\' escape a character.  */
2044      unsigned char c2;
2045      re_string_skip_bytes (input, 1);
2046      c2 = re_string_peek_byte (input, 0);
2047      token->opr.c = c2;
2048      token->type = CHARACTER;
2049      return 1;
2050    }
2051  if (c == '[') /* '[' is a special char in a bracket exps.  */
2052    {
2053      unsigned char c2;
2054      int token_len;
2055      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056        c2 = re_string_peek_byte (input, 1);
2057      else
2058        c2 = 0;
2059      token->opr.c = c2;
2060      token_len = 2;
2061      switch (c2)
2062        {
2063        case '.':
2064          token->type = OP_OPEN_COLL_ELEM;
2065          break;
2066        case '=':
2067          token->type = OP_OPEN_EQUIV_CLASS;
2068          break;
2069        case ':':
2070          if (syntax & RE_CHAR_CLASSES)
2071            {
2072              token->type = OP_OPEN_CHAR_CLASS;
2073              break;
2074            }
2075          /* else fall through.  */
2076        default:
2077          token->type = CHARACTER;
2078          token->opr.c = c;
2079          token_len = 1;
2080          break;
2081        }
2082      return token_len;
2083    }
2084  switch (c)
2085    {
2086    case '-':
2087      token->type = OP_CHARSET_RANGE;
2088      break;
2089    case ']':
2090      token->type = OP_CLOSE_BRACKET;
2091      break;
2092    case '^':
2093      token->type = OP_NON_MATCH_LIST;
2094      break;
2095    default:
2096      token->type = CHARACTER;
2097    }
2098  return 1;
2099}
2100\f
2101/* Functions for parser.  */
2102
2103/* Entry point of the parser.
2104   Parse the regular expression REGEXP and return the structure tree.
2105   If an error has occurred, ERR is set by error code, and return NULL.
2106   This function build the following tree, from regular expression <reg_exp>:
2107           CAT
2108           / \
2109          /   \
2110   <reg_exp>  EOR
2111
2112   CAT means concatenation.
2113   EOR means end of regular expression.  */
2114
2115static bin_tree_t *
2116parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2117       reg_errcode_t *err)
2118{
2119  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120  bin_tree_t *tree, *eor, *root;
2121  re_token_t current_token;
2122  dfa->syntax = syntax;
2123  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2125  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126    return NULL;
2127  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2128  if (tree != NULL)
2129    root = create_tree (dfa, tree, eor, CONCAT);
2130  else
2131    root = eor;
2132  if (BE (eor == NULL || root == NULL, 0))
2133    {
2134      *err = REG_ESPACE;
2135      return NULL;
2136    }
2137  return root;
2138}
2139
2140/* This function build the following tree, from regular expression
2141   <branch1>|<branch2>:
2142           ALT
2143           / \
2144          /   \
2145   <branch1> <branch2>
2146
2147   ALT means alternative, which represents the operator `|'.  */
2148
2149static bin_tree_t *
2150parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2151               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2152{
2153  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2154  bin_tree_t *tree, *branch = NULL;
2155  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2156  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157    return NULL;
2158
2159  while (token->type == OP_ALT)
2160    {
2161      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2162      if (token->type != OP_ALT && token->type != END_OF_RE
2163          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2164        {
2165          branch = parse_branch (regexp, preg, token, syntax, nest, err);
2166          if (BE (*err != REG_NOERROR && branch == NULL, 0))
2167            return NULL;
2168        }
2169      else
2170        branch = NULL;
2171      tree = create_tree (dfa, tree, branch, OP_ALT);
2172      if (BE (tree == NULL, 0))
2173        {
2174          *err = REG_ESPACE;
2175          return NULL;
2176        }
2177    }
2178  return tree;
2179}
2180
2181/* This function build the following tree, from regular expression
2182   <exp1><exp2>:
2183        CAT
2184        / \
2185       /   \
2186   <exp1> <exp2>
2187
2188   CAT means concatenation.  */
2189
2190static bin_tree_t *
2191parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192              reg_syntax_t syntax, int nest, reg_errcode_t *err)
2193{
2194  bin_tree_t *tree, *exp;
2195  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2196  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2197  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2198    return NULL;
2199
2200  while (token->type != OP_ALT && token->type != END_OF_RE
2201         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2202    {
2203      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2204      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2205        {
2206          return NULL;
2207        }
2208      if (tree != NULL && exp != NULL)
2209        {
2210          tree = create_tree (dfa, tree, exp, CONCAT);
2211          if (tree == NULL)
2212            {
2213              *err = REG_ESPACE;
2214              return NULL;
2215            }
2216        }
2217      else if (tree == NULL)
2218        tree = exp;
2219      /* Otherwise exp == NULL, we don't need to create new tree.  */
2220    }
2221  return tree;
2222}
2223
2224/* This function build the following tree, from regular expression a*:
2225         *
2226         |
2227         a
2228*/
2229
2230static bin_tree_t *
2231parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2232                  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2233{
2234  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2235  bin_tree_t *tree;
2236  switch (token->type)
2237    {
2238    case CHARACTER:
2239      tree = create_token_tree (dfa, NULL, NULL, token);
2240      if (BE (tree == NULL, 0))
2241        {
2242          *err = REG_ESPACE;
2243          return NULL;
2244        }
2245#ifdef RE_ENABLE_I18N
2246      if (dfa->mb_cur_max > 1)
2247        {
2248          while (!re_string_eoi (regexp)
2249                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2250            {
2251              bin_tree_t *mbc_remain;
2252              fetch_token (token, regexp, syntax);
2253              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255              if (BE (mbc_remain == NULL || tree == NULL, 0))
2256                {
2257                  *err = REG_ESPACE;
2258                  return NULL;
2259                }
2260            }
2261        }
2262#endif
2263      break;
2264    case OP_OPEN_SUBEXP:
2265      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267        return NULL;
2268      break;
2269    case OP_OPEN_BRACKET:
2270      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2272        return NULL;
2273      break;
2274    case OP_BACK_REF:
2275      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2276        {
2277          *err = REG_ESUBREG;
2278          return NULL;
2279        }
2280      dfa->used_bkref_map |= 1 << token->opr.idx;
2281      tree = create_token_tree (dfa, NULL, NULL, token);
2282      if (BE (tree == NULL, 0))
2283        {
2284          *err = REG_ESPACE;
2285          return NULL;
2286        }
2287      ++dfa->nbackref;
2288      dfa->has_mb_node = 1;
2289      break;
2290    case OP_OPEN_DUP_NUM:
2291      if (syntax & RE_CONTEXT_INVALID_DUP)
2292        {
2293          *err = REG_BADRPT;
2294          return NULL;
2295        }
2296      /* FALLTHROUGH */
2297    case OP_DUP_ASTERISK:
2298    case OP_DUP_PLUS:
2299    case OP_DUP_QUESTION:
2300      if (syntax & RE_CONTEXT_INVALID_OPS)
2301        {
2302          *err = REG_BADRPT;
2303          return NULL;
2304        }
2305      else if (syntax & RE_CONTEXT_INDEP_OPS)
2306        {
2307          fetch_token (token, regexp, syntax);
2308          return parse_expression (regexp, preg, token, syntax, nest, err);
2309        }
2310      /* else fall through  */
2311    case OP_CLOSE_SUBEXP:
2312      if ((token->type == OP_CLOSE_SUBEXP) &&
2313          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2314        {
2315          *err = REG_ERPAREN;
2316          return NULL;
2317        }
2318      /* else fall through  */
2319    case OP_CLOSE_DUP_NUM:
2320      /* We treat it as a normal character.  */
2321
2322      /* Then we can these characters as normal characters.  */
2323      token->type = CHARACTER;
2324      /* mb_partial and word_char bits should be initialized already
2325         by peek_token.  */
2326      tree = create_token_tree (dfa, NULL, NULL, token);
2327      if (BE (tree == NULL, 0))
2328        {
2329          *err = REG_ESPACE;
2330          return NULL;
2331        }
2332      break;
2333    case ANCHOR:
2334      if ((token->opr.ctx_type
2335           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336          && dfa->word_ops_used == 0)
2337        init_word_char (dfa);
2338      if (token->opr.ctx_type == WORD_DELIM
2339          || token->opr.ctx_type == NOT_WORD_DELIM)
2340        {
2341          bin_tree_t *tree_first, *tree_last;
2342          if (token->opr.ctx_type == WORD_DELIM)
2343            {
2344              token->opr.ctx_type = WORD_FIRST;
2345              tree_first = create_token_tree (dfa, NULL, NULL, token);
2346              token->opr.ctx_type = WORD_LAST;
2347            }
2348          else
2349            {
2350              token->opr.ctx_type = INSIDE_WORD;
2351              tree_first = create_token_tree (dfa, NULL, NULL, token);
2352              token->opr.ctx_type = INSIDE_NOTWORD;
2353            }
2354          tree_last = create_token_tree (dfa, NULL, NULL, token);
2355          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357            {
2358              *err = REG_ESPACE;
2359              return NULL;
2360            }
2361        }
2362      else
2363        {
2364          tree = create_token_tree (dfa, NULL, NULL, token);
2365          if (BE (tree == NULL, 0))
2366            {
2367              *err = REG_ESPACE;
2368              return NULL;
2369            }
2370        }
2371      /* We must return here, since ANCHORs can't be followed
2372         by repetition operators.
2373         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2375      fetch_token (token, regexp, syntax);
2376      return tree;
2377    case OP_PERIOD:
2378      tree = create_token_tree (dfa, NULL, NULL, token);
2379      if (BE (tree == NULL, 0))
2380        {
2381          *err = REG_ESPACE;
2382          return NULL;
2383        }
2384      if (dfa->mb_cur_max > 1)
2385        dfa->has_mb_node = 1;
2386      break;
2387    case OP_WORD:
2388    case OP_NOTWORD:
2389      tree = build_charclass_op (dfa, regexp->trans,
2390                                 "alnum",
2391                                 "_",
2392                                 token->type == OP_NOTWORD, err);
2393      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394        return NULL;
2395      break;
2396    case OP_SPACE:
2397    case OP_NOTSPACE:
2398      tree = build_charclass_op (dfa, regexp->trans,
2399                                 "space",
2400                                 "",
2401                                 token->type == OP_NOTSPACE, err);
2402      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403        return NULL;
2404      break;
2405    case OP_ALT:
2406    case END_OF_RE:
2407      return NULL;
2408    case BACK_SLASH:
2409      *err = REG_EESCAPE;
2410      return NULL;
2411    default:
2412      /* Must not happen?  */
2413#ifdef DEBUG
2414      assert (0);
2415#endif
2416      return NULL;
2417    }
2418  fetch_token (token, regexp, syntax);
2419
2420  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2422    {
2423      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425        return NULL;
2426      /* In BRE consecutive duplications are not allowed.  */
2427      if ((syntax & RE_CONTEXT_INVALID_DUP)
2428          && (token->type == OP_DUP_ASTERISK
2429              || token->type == OP_OPEN_DUP_NUM))
2430        {
2431          *err = REG_BADRPT;
2432          return NULL;
2433        }
2434    }
2435
2436  return tree;
2437}
2438
2439/* This function build the following tree, from regular expression
2440   (<reg_exp>):
2441         SUBEXP
2442            |
2443        <reg_exp>
2444*/
2445
2446static bin_tree_t *
2447parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2448               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2449{
2450  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2451  bin_tree_t *tree;
2452  size_t cur_nsub;
2453  cur_nsub = preg->re_nsub++;
2454
2455  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2456
2457  /* The subexpression may be a null string.  */
2458  if (token->type == OP_CLOSE_SUBEXP)
2459    tree = NULL;
2460  else
2461    {
2462      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2463      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2464        *err = REG_EPAREN;
2465      if (BE (*err != REG_NOERROR, 0))
2466        return NULL;
2467    }
2468
2469  if (cur_nsub <= '9' - '1')
2470    dfa->completed_bkref_map |= 1 << cur_nsub;
2471
2472  tree = create_tree (dfa, tree, NULL, SUBEXP);
2473  if (BE (tree == NULL, 0))
2474    {
2475      *err = REG_ESPACE;
2476      return NULL;
2477    }
2478  tree->token.opr.idx = cur_nsub;
2479  return tree;
2480}
2481
2482/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2483
2484static bin_tree_t *
2485parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2486              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2487{
2488  bin_tree_t *tree = NULL, *old_tree = NULL;
2489  int i, start, end, start_idx = re_string_cur_idx (regexp);
2490#ifndef RE_TOKEN_INIT_BUG
2491  re_token_t start_token = *token;
2492#else
2493  re_token_t start_token;
2494
2495  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2496#endif
2497
2498  if (token->type == OP_OPEN_DUP_NUM)
2499    {
2500      end = 0;
2501      start = fetch_number (regexp, token, syntax);
2502      if (start == -1)
2503        {
2504          if (token->type == CHARACTER && token->opr.c == ',')
2505            start = 0; /* We treat "{,m}" as "{0,m}".  */
2506          else
2507            {
2508              *err = REG_BADBR; /* <re>{} is invalid.  */
2509              return NULL;
2510            }
2511        }
2512      if (BE (start != -2, 1))
2513        {
2514          /* We treat "{n}" as "{n,n}".  */
2515          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516                 : ((token->type == CHARACTER && token->opr.c == ',')
2517                    ? fetch_number (regexp, token, syntax) : -2));
2518        }
2519      if (BE (start == -2 || end == -2, 0))
2520        {
2521          /* Invalid sequence.  */
2522          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2523            {
2524              if (token->type == END_OF_RE)
2525                *err = REG_EBRACE;
2526              else
2527                *err = REG_BADBR;
2528
2529              return NULL;
2530            }
2531
2532          /* If the syntax bit is set, rollback.  */
2533          re_string_set_index (regexp, start_idx);
2534          *token = start_token;
2535          token->type = CHARACTER;
2536          /* mb_partial and word_char bits should be already initialized by
2537             peek_token.  */
2538          return elem;
2539        }
2540
2541      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2542        {
2543          /* First number greater than second.  */
2544          *err = REG_BADBR;
2545          return NULL;
2546        }
2547    }
2548  else
2549    {
2550      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552    }
2553
2554  fetch_token (token, regexp, syntax);
2555
2556  if (BE (elem == NULL, 0))
2557    return NULL;
2558  if (BE (start == 0 && end == 0, 0))
2559    {
2560      postorder (elem, free_tree, NULL);
2561      return NULL;
2562    }
2563
2564  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2565  if (BE (start > 0, 0))
2566    {
2567      tree = elem;
2568      for (i = 2; i <= start; ++i)
2569        {
2570          elem = duplicate_tree (elem, dfa);
2571          tree = create_tree (dfa, tree, elem, CONCAT);
2572          if (BE (elem == NULL || tree == NULL, 0))
2573            goto parse_dup_op_espace;
2574        }
2575
2576      if (start == end)
2577        return tree;
2578
2579      /* Duplicate ELEM before it is marked optional.  */
2580      elem = duplicate_tree (elem, dfa);
2581      old_tree = tree;
2582    }
2583  else
2584    old_tree = NULL;
2585
2586  if (elem->token.type == SUBEXP)
2587    postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2588
2589  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2590  if (BE (tree == NULL, 0))
2591    goto parse_dup_op_espace;
2592
2593  /* This loop is actually executed only when end != -1,
2594     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2595     already created the start+1-th copy.  */
2596  for (i = start + 2; i <= end; ++i)
2597    {
2598      elem = duplicate_tree (elem, dfa);
2599      tree = create_tree (dfa, tree, elem, CONCAT);
2600      if (BE (elem == NULL || tree == NULL, 0))
2601        goto parse_dup_op_espace;
2602
2603      tree = create_tree (dfa, tree, NULL, OP_ALT);
2604      if (BE (tree == NULL, 0))
2605        goto parse_dup_op_espace;
2606    }
2607
2608  if (old_tree)
2609    tree = create_tree (dfa, old_tree, tree, CONCAT);
2610
2611  return tree;
2612
2613 parse_dup_op_espace:
2614  *err = REG_ESPACE;
2615  return NULL;
2616}
2617
2618/* Size of the names for collating symbol/equivalence_class/character_class.
2619   I'm not sure, but maybe enough.  */
2620#define BRACKET_NAME_BUF_SIZE 32
2621
2622#ifndef _LIBC
2623  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2624     Build the range expression which starts from START_ELEM, and ends
2625     at END_ELEM.  The result are written to MBCSET and SBCSET.
2626     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2627     mbcset->range_ends, is a pointer argument since we may
2628     update it.  */
2629
2630static reg_errcode_t
2631internal_function
2632# ifdef RE_ENABLE_I18N
2633build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2634                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2635# else /* not RE_ENABLE_I18N */
2636build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2637                 bracket_elem_t *end_elem)
2638# endif /* not RE_ENABLE_I18N */
2639{
2640  unsigned int start_ch, end_ch;
2641  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2642  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644          0))
2645    return REG_ERANGE;
2646
2647  /* We can handle no multi character collating elements without libc
2648     support.  */
2649  if (BE ((start_elem->type == COLL_SYM
2650           && strlen ((char *) start_elem->opr.name) > 1)
2651          || (end_elem->type == COLL_SYM
2652              && strlen ((char *) end_elem->opr.name) > 1), 0))
2653    return REG_ECOLLATE;
2654
2655# ifdef RE_ENABLE_I18N
2656  {
2657    wchar_t wc;
2658    wint_t start_wc;
2659    wint_t end_wc;
2660    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661
2662    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664                   : 0));
2665    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667                 : 0));
2668#ifdef GAWK
2669    /*
2670     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2671     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2672     * unsigned, so we don't have sign extension problems.
2673     */
2674    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675                ? start_ch : start_elem->opr.wch);
2676    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677              ? end_ch : end_elem->opr.wch);
2678#else
2679    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2680                ? __btowc (start_ch) : start_elem->opr.wch);
2681    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2682              ? __btowc (end_ch) : end_elem->opr.wch);
2683#endif
2684    if (start_wc == WEOF || end_wc == WEOF)
2685      return REG_ECOLLATE;
2686    cmp_buf[0] = start_wc;
2687    cmp_buf[4] = end_wc;
2688    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2689      return REG_ERANGE;
2690
2691    /* Got valid collation sequence values, add them as a new entry.
2692       However, for !_LIBC we have no collation elements: if the
2693       character set is single byte, the single byte character set
2694       that we build below suffices.  parse_bracket_exp passes
2695       no MBCSET if dfa->mb_cur_max == 1.  */
2696    if (mbcset)
2697      {
2698        /* Check the space of the arrays.  */
2699        if (BE (*range_alloc == mbcset->nranges, 0))
2700          {
2701            /* There is not enough space, need realloc.  */
2702            wchar_t *new_array_start, *new_array_end;
2703            int new_nranges;
2704
2705            /* +1 in case of mbcset->nranges is 0.  */
2706            new_nranges = 2 * mbcset->nranges + 1;
2707            /* Use realloc since mbcset->range_starts and mbcset->range_ends
2708               are NULL if *range_alloc == 0.  */
2709            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2710                                          new_nranges);
2711            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2712                                        new_nranges);
2713
2714            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2715              return REG_ESPACE;
2716
2717            mbcset->range_starts = new_array_start;
2718            mbcset->range_ends = new_array_end;
2719            *range_alloc = new_nranges;
2720          }
2721
2722        mbcset->range_starts[mbcset->nranges] = start_wc;
2723        mbcset->range_ends[mbcset->nranges++] = end_wc;
2724      }
2725
2726    /* Build the table for single byte characters.  */
2727    for (wc = 0; wc < SBC_MAX; ++wc)
2728      {
2729        cmp_buf[2] = wc;
2730        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2731            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2732          bitset_set (sbcset, wc);
2733      }
2734  }
2735# else /* not RE_ENABLE_I18N */
2736  {
2737    unsigned int ch;
2738    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2739                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2740                   : 0));
2741    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2742              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2743                 : 0));
2744    if (start_ch > end_ch)
2745      return REG_ERANGE;
2746    /* Build the table for single byte characters.  */
2747    for (ch = 0; ch < SBC_MAX; ++ch)
2748      if (start_ch <= ch  && ch <= end_ch)
2749        bitset_set (sbcset, ch);
2750  }
2751# endif /* not RE_ENABLE_I18N */
2752  return REG_NOERROR;
2753}
2754#endif /* not _LIBC */
2755
2756#ifndef _LIBC
2757/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2758   Build the collating element which is represented by NAME.
2759   The result are written to MBCSET and SBCSET.
2760   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2761   pointer argument since we may update it.  */
2762
2763static reg_errcode_t
2764internal_function
2765# ifdef RE_ENABLE_I18N
2766build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2767                        int *coll_sym_alloc, const unsigned char *name)
2768# else /* not RE_ENABLE_I18N */
2769build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2770# endif /* not RE_ENABLE_I18N */
2771{
2772  size_t name_len = strlen ((const char *) name);
2773  if (BE (name_len != 1, 0))
2774    return REG_ECOLLATE;
2775  else
2776    {
2777      bitset_set (sbcset, name[0]);
2778      return REG_NOERROR;
2779    }
2780}
2781#endif /* not _LIBC */
2782
2783/* This function parse bracket expression like "[abc]", "[a-c]",
2784   "[[.a-a.]]" etc.  */
2785
2786static bin_tree_t *
2787parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2788                   reg_syntax_t syntax, reg_errcode_t *err)
2789{
2790#ifdef _LIBC
2791  const unsigned char *collseqmb;
2792  const char *collseqwc;
2793  uint32_t nrules;
2794  int32_t table_size;
2795  const int32_t *symb_table;
2796  const unsigned char *extra;
2797
2798  /* Local function for parse_bracket_exp used in _LIBC environment.
2799     Seek the collating symbol entry correspondings to NAME.
2800     Return the index of the symbol in the SYMB_TABLE.  */
2801
2802  auto inline int32_t
2803  __attribute ((always_inline))
2804  seek_collating_symbol_entry (name, name_len)
2805         const unsigned char *name;
2806         size_t name_len;
2807    {
2808      int32_t hash = elem_hash ((const char *) name, name_len);
2809      int32_t elem = hash % table_size;
2810      if (symb_table[2 * elem] != 0)
2811        {
2812          int32_t second = hash % (table_size - 2) + 1;
2813
2814          do
2815            {
2816              /* First compare the hashing value.  */
2817              if (symb_table[2 * elem] == hash
2818                  /* Compare the length of the name.  */
2819                  && name_len == extra[symb_table[2 * elem + 1]]
2820                  /* Compare the name.  */
2821                  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2822                             name_len) == 0)
2823                {
2824                  /* Yep, this is the entry.  */
2825                  break;
2826                }
2827
2828              /* Next entry.  */
2829              elem += second;
2830            }
2831          while (symb_table[2 * elem] != 0);
2832        }
2833      return elem;
2834    }
2835
2836  /* Local function for parse_bracket_exp used in _LIBC environment.
2837     Look up the collation sequence value of BR_ELEM.
2838     Return the value if succeeded, UINT_MAX otherwise.  */
2839
2840  auto inline unsigned int
2841  __attribute ((always_inline))
2842  lookup_collation_sequence_value (br_elem)
2843         bracket_elem_t *br_elem;
2844    {
2845      if (br_elem->type == SB_CHAR)
2846        {
2847          /*
2848          if (MB_CUR_MAX == 1)
2849          */
2850          if (nrules == 0)
2851            return collseqmb[br_elem->opr.ch];
2852          else
2853            {
2854              wint_t wc = __btowc (br_elem->opr.ch);
2855              return __collseq_table_lookup (collseqwc, wc);
2856            }
2857        }
2858      else if (br_elem->type == MB_CHAR)
2859        {
2860          if (nrules != 0)
2861            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2862        }
2863      else if (br_elem->type == COLL_SYM)
2864        {
2865          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2866          if (nrules != 0)
2867            {
2868              int32_t elem, idx;
2869              elem = seek_collating_symbol_entry (br_elem->opr.name,
2870                                                  sym_name_len);
2871              if (symb_table[2 * elem] != 0)
2872                {
2873                  /* We found the entry.  */
2874                  idx = symb_table[2 * elem + 1];
2875                  /* Skip the name of collating element name.  */
2876                  idx += 1 + extra[idx];
2877                  /* Skip the byte sequence of the collating element.  */
2878                  idx += 1 + extra[idx];
2879                  /* Adjust for the alignment.  */
2880                  idx = (idx + 3) & ~3;
2881                  /* Skip the multibyte collation sequence value.  */
2882                  idx += sizeof (unsigned int);
2883                  /* Skip the wide char sequence of the collating element.  */
2884                  idx += sizeof (unsigned int) *
2885                    (1 + *(unsigned int *) (extra + idx));
2886                  /* Return the collation sequence value.  */
2887                  return *(unsigned int *) (extra + idx);
2888                }
2889              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2890                {
2891                  /* No valid character.  Match it as a single byte
2892                     character.  */
2893                  return collseqmb[br_elem->opr.name[0]];
2894                }
2895            }
2896          else if (sym_name_len == 1)
2897            return collseqmb[br_elem->opr.name[0]];
2898        }
2899      return UINT_MAX;
2900    }
2901
2902  /* Local function for parse_bracket_exp used in _LIBC environment.
2903     Build the range expression which starts from START_ELEM, and ends
2904     at END_ELEM.  The result are written to MBCSET and SBCSET.
2905     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2906     mbcset->range_ends, is a pointer argument since we may
2907     update it.  */
2908
2909  auto inline reg_errcode_t
2910  __attribute ((always_inline))
2911  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2912         re_charset_t *mbcset;
2913         int *range_alloc;
2914         bitset_t sbcset;
2915         bracket_elem_t *start_elem, *end_elem;
2916    {
2917      unsigned int ch;
2918      uint32_t start_collseq;
2919      uint32_t end_collseq;
2920
2921      /* Equivalence Classes and Character Classes can't be a range
2922         start/end.  */
2923      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2924              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2925              0))
2926        return REG_ERANGE;
2927
2928      start_collseq = lookup_collation_sequence_value (start_elem);
2929      end_collseq = lookup_collation_sequence_value (end_elem);
2930      /* Check start/end collation sequence values.  */
2931      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2932        return REG_ECOLLATE;
2933      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2934        return REG_ERANGE;
2935
2936      /* Got valid collation sequence values, add them as a new entry.
2937         However, if we have no collation elements, and the character set
2938         is single byte, the single byte character set that we
2939         build below suffices. */
2940      if (nrules > 0 || dfa->mb_cur_max > 1)
2941        {
2942          /* Check the space of the arrays.  */
2943          if (BE (*range_alloc == mbcset->nranges, 0))
2944            {
2945              /* There is not enough space, need realloc.  */
2946              uint32_t *new_array_start;
2947              uint32_t *new_array_end;
2948              int new_nranges;
2949
2950              /* +1 in case of mbcset->nranges is 0.  */
2951              new_nranges = 2 * mbcset->nranges + 1;
2952              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2953                                            new_nranges);
2954              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2955                                          new_nranges);
2956
2957              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2958                return REG_ESPACE;
2959
2960              mbcset->range_starts = new_array_start;
2961              mbcset->range_ends = new_array_end;
2962              *range_alloc = new_nranges;
2963            }
2964
2965          mbcset->range_starts[mbcset->nranges] = start_collseq;
2966          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2967        }
2968
2969      /* Build the table for single byte characters.  */
2970      for (ch = 0; ch < SBC_MAX; ch++)
2971        {
2972          uint32_t ch_collseq;
2973          /*
2974          if (MB_CUR_MAX == 1)
2975          */
2976          if (nrules == 0)
2977            ch_collseq = collseqmb[ch];
2978          else
2979            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2980          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2981            bitset_set (sbcset, ch);
2982        }
2983      return REG_NOERROR;
2984    }
2985
2986  /* Local function for parse_bracket_exp used in _LIBC environment.
2987     Build the collating element which is represented by NAME.
2988     The result are written to MBCSET and SBCSET.
2989     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2990     pointer argument since we may update it.  */
2991
2992  auto inline reg_errcode_t
2993  __attribute ((always_inline))
2994  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2995         re_charset_t *mbcset;
2996         int *coll_sym_alloc;
2997         bitset_t sbcset;
2998         const unsigned char *name;
2999    {
3000      int32_t elem, idx;
3001      size_t name_len = strlen ((const char *) name);
3002      if (nrules != 0)
3003        {
3004          elem = seek_collating_symbol_entry (name, name_len);
3005          if (symb_table[2 * elem] != 0)
3006            {
3007              /* We found the entry.  */
3008              idx = symb_table[2 * elem + 1];
3009              /* Skip the name of collating element name.  */
3010              idx += 1 + extra[idx];
3011            }
3012          else if (symb_table[2 * elem] == 0 && name_len == 1)
3013            {
3014              /* No valid character, treat it as a normal
3015                 character.  */
3016              bitset_set (sbcset, name[0]);
3017              return REG_NOERROR;
3018            }
3019          else
3020            return REG_ECOLLATE;
3021
3022          /* Got valid collation sequence, add it as a new entry.  */
3023          /* Check the space of the arrays.  */
3024          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3025            {
3026              /* Not enough, realloc it.  */
3027              /* +1 in case of mbcset->ncoll_syms is 0.  */
3028              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3029              /* Use realloc since mbcset->coll_syms is NULL
3030                 if *alloc == 0.  */
3031              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3032                                                   new_coll_sym_alloc);
3033              if (BE (new_coll_syms == NULL, 0))
3034                return REG_ESPACE;
3035              mbcset->coll_syms = new_coll_syms;
3036              *coll_sym_alloc = new_coll_sym_alloc;
3037            }
3038          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3039          return REG_NOERROR;
3040        }
3041      else
3042        {
3043          if (BE (name_len != 1, 0))
3044            return REG_ECOLLATE;
3045          else
3046            {
3047              bitset_set (sbcset, name[0]);
3048              return REG_NOERROR;
3049            }
3050        }
3051    }
3052#endif
3053
3054  re_token_t br_token;
3055  re_bitset_ptr_t sbcset;
3056#ifdef RE_ENABLE_I18N
3057  re_charset_t *mbcset;
3058  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3059  int equiv_class_alloc = 0, char_class_alloc = 0;
3060#endif /* not RE_ENABLE_I18N */
3061  int non_match = 0;
3062  bin_tree_t *work_tree;
3063  int token_len;
3064  int first_round = 1;
3065#ifdef _LIBC
3066  collseqmb = (const unsigned char *)
3067    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3068  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3069  if (nrules)
3070    {
3071      /*
3072      if (MB_CUR_MAX > 1)
3073      */
3074      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3075      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3076      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3077                                                  _NL_COLLATE_SYMB_TABLEMB);
3078      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3079                                                   _NL_COLLATE_SYMB_EXTRAMB);
3080    }
3081#endif
3082  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3083#ifdef RE_ENABLE_I18N
3084  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3085#endif /* RE_ENABLE_I18N */
3086#ifdef RE_ENABLE_I18N
3087  if (BE (sbcset == NULL || mbcset == NULL, 0))
3088#else
3089  if (BE (sbcset == NULL, 0))
3090#endif /* RE_ENABLE_I18N */
3091    {
3092      *err = REG_ESPACE;
3093      return NULL;
3094    }
3095
3096  token_len = peek_token_bracket (token, regexp, syntax);
3097  if (BE (token->type == END_OF_RE, 0))
3098    {
3099      *err = REG_BADPAT;
3100      goto parse_bracket_exp_free_return;
3101    }
3102  if (token->type == OP_NON_MATCH_LIST)
3103    {
3104#ifdef RE_ENABLE_I18N
3105      mbcset->non_match = 1;
3106#endif /* not RE_ENABLE_I18N */
3107      non_match = 1;
3108      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3109        bitset_set (sbcset, '\n');
3110      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3111      token_len = peek_token_bracket (token, regexp, syntax);
3112      if (BE (token->type == END_OF_RE, 0))
3113        {
3114          *err = REG_BADPAT;
3115          goto parse_bracket_exp_free_return;
3116        }
3117    }
3118
3119  /* We treat the first ']' as a normal character.  */
3120  if (token->type == OP_CLOSE_BRACKET)
3121    token->type = CHARACTER;
3122
3123  while (1)
3124    {
3125      bracket_elem_t start_elem, end_elem;
3126      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3127      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3128      reg_errcode_t ret;
3129      int token_len2 = 0, is_range_exp = 0;
3130      re_token_t token2;
3131
3132      start_elem.opr.name = start_name_buf;
3133      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3134                                   syntax, first_round);
3135      if (BE (ret != REG_NOERROR, 0))
3136        {
3137          *err = ret;
3138          goto parse_bracket_exp_free_return;
3139        }
3140      first_round = 0;
3141
3142      /* Get information about the next token.  We need it in any case.  */
3143      token_len = peek_token_bracket (token, regexp, syntax);
3144
3145      /* Do not check for ranges if we know they are not allowed.  */
3146      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3147        {
3148          if (BE (token->type == END_OF_RE, 0))
3149            {
3150              *err = REG_EBRACK;
3151              goto parse_bracket_exp_free_return;
3152            }
3153          if (token->type == OP_CHARSET_RANGE)
3154            {
3155              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3156              token_len2 = peek_token_bracket (&token2, regexp, syntax);
3157              if (BE (token2.type == END_OF_RE, 0))
3158                {
3159                  *err = REG_EBRACK;
3160                  goto parse_bracket_exp_free_return;
3161                }
3162              if (token2.type == OP_CLOSE_BRACKET)
3163                {
3164                  /* We treat the last '-' as a normal character.  */
3165                  re_string_skip_bytes (regexp, -token_len);
3166                  token->type = CHARACTER;
3167                }
3168              else
3169                is_range_exp = 1;
3170            }
3171        }
3172
3173      if (is_range_exp == 1)
3174        {
3175          end_elem.opr.name = end_name_buf;
3176          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3177                                       dfa, syntax, 1);
3178          if (BE (ret != REG_NOERROR, 0))
3179            {
3180              *err = ret;
3181              goto parse_bracket_exp_free_return;
3182            }
3183
3184          token_len = peek_token_bracket (token, regexp, syntax);
3185
3186#ifdef _LIBC
3187          *err = build_range_exp (sbcset, mbcset, &range_alloc,
3188                                  &start_elem, &end_elem);
3189#else
3190# ifdef RE_ENABLE_I18N
3191          *err = build_range_exp (sbcset,
3192                                  dfa->mb_cur_max > 1 ? mbcset : NULL,
3193                                  &range_alloc, &start_elem, &end_elem);
3194# else
3195          *err = build_range_exp (sbcset, &start_elem, &end_elem);
3196# endif
3197#endif /* RE_ENABLE_I18N */
3198          if (BE (*err != REG_NOERROR, 0))
3199            goto parse_bracket_exp_free_return;
3200        }
3201      else
3202        {
3203          switch (start_elem.type)
3204            {
3205            case SB_CHAR:
3206              bitset_set (sbcset, start_elem.opr.ch);
3207              break;
3208#ifdef RE_ENABLE_I18N
3209            case MB_CHAR:
3210              /* Check whether the array has enough space.  */
3211              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3212                {
3213                  wchar_t *new_mbchars;
3214                  /* Not enough, realloc it.  */
3215                  /* +1 in case of mbcset->nmbchars is 0.  */
3216                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3217                  /* Use realloc since array is NULL if *alloc == 0.  */
3218                  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3219                                            mbchar_alloc);
3220                  if (BE (new_mbchars == NULL, 0))
3221                    goto parse_bracket_exp_espace;
3222                  mbcset->mbchars = new_mbchars;
3223                }
3224              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3225              break;
3226#endif /* RE_ENABLE_I18N */
3227            case EQUIV_CLASS:
3228              *err = build_equiv_class (sbcset,
3229#ifdef RE_ENABLE_I18N
3230                                        mbcset, &equiv_class_alloc,
3231#endif /* RE_ENABLE_I18N */
3232                                        start_elem.opr.name);
3233              if (BE (*err != REG_NOERROR, 0))
3234                goto parse_bracket_exp_free_return;
3235              break;
3236            case COLL_SYM:
3237              *err = build_collating_symbol (sbcset,
3238#ifdef RE_ENABLE_I18N
3239                                             mbcset, &coll_sym_alloc,
3240#endif /* RE_ENABLE_I18N */
3241                                             start_elem.opr.name);
3242              if (BE (*err != REG_NOERROR, 0))
3243                goto parse_bracket_exp_free_return;
3244              break;
3245            case CHAR_CLASS:
3246              *err = build_charclass (regexp->trans, sbcset,
3247#ifdef RE_ENABLE_I18N
3248                                      mbcset, &char_class_alloc,
3249#endif /* RE_ENABLE_I18N */
3250                                      (const char *) start_elem.opr.name, syntax);
3251              if (BE (*err != REG_NOERROR, 0))
3252               goto parse_bracket_exp_free_return;
3253              break;
3254            default:
3255              assert (0);
3256              break;
3257            }
3258        }
3259      if (BE (token->type == END_OF_RE, 0))
3260        {
3261          *err = REG_EBRACK;
3262          goto parse_bracket_exp_free_return;
3263        }
3264      if (token->type == OP_CLOSE_BRACKET)
3265        break;
3266    }
3267
3268  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3269
3270  /* If it is non-matching list.  */
3271  if (non_match)
3272    bitset_not (sbcset);
3273
3274#ifdef RE_ENABLE_I18N
3275  /* Ensure only single byte characters are set.  */
3276  if (dfa->mb_cur_max > 1)
3277    bitset_mask (sbcset, dfa->sb_char);
3278
3279  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3280      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3281                                                     || mbcset->non_match)))
3282    {
3283      bin_tree_t *mbc_tree;
3284      int sbc_idx;
3285      /* Build a tree for complex bracket.  */
3286      dfa->has_mb_node = 1;
3287      br_token.type = COMPLEX_BRACKET;
3288      br_token.opr.mbcset = mbcset;
3289      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290      if (BE (mbc_tree == NULL, 0))
3291        goto parse_bracket_exp_espace;
3292      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3293        if (sbcset[sbc_idx])
3294          break;
3295      /* If there are no bits set in sbcset, there is no point
3296         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3297      if (sbc_idx < BITSET_WORDS)
3298        {
3299          /* Build a tree for simple bracket.  */
3300          br_token.type = SIMPLE_BRACKET;
3301          br_token.opr.sbcset = sbcset;
3302          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303          if (BE (work_tree == NULL, 0))
3304            goto parse_bracket_exp_espace;
3305
3306          /* Then join them by ALT node.  */
3307          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3308          if (BE (work_tree == NULL, 0))
3309            goto parse_bracket_exp_espace;
3310        }
3311      else
3312        {
3313          re_free (sbcset);
3314          work_tree = mbc_tree;
3315        }
3316    }
3317  else
3318#endif /* not RE_ENABLE_I18N */
3319    {
3320#ifdef RE_ENABLE_I18N
3321      free_charset (mbcset);
3322#endif
3323      /* Build a tree for simple bracket.  */
3324      br_token.type = SIMPLE_BRACKET;
3325      br_token.opr.sbcset = sbcset;
3326      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327      if (BE (work_tree == NULL, 0))
3328        goto parse_bracket_exp_espace;
3329    }
3330  return work_tree;
3331
3332 parse_bracket_exp_espace:
3333  *err = REG_ESPACE;
3334 parse_bracket_exp_free_return:
3335  re_free (sbcset);
3336#ifdef RE_ENABLE_I18N
3337  free_charset (mbcset);
3338#endif /* RE_ENABLE_I18N */
3339  return NULL;
3340}
3341
3342/* Parse an element in the bracket expression.  */
3343
3344static reg_errcode_t
3345parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3346                       re_token_t *token, int token_len, re_dfa_t *dfa,
3347                       reg_syntax_t syntax, int accept_hyphen)
3348{
3349#ifdef RE_ENABLE_I18N
3350  int cur_char_size;
3351  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3352  if (cur_char_size > 1)
3353    {
3354      elem->type = MB_CHAR;
3355      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3356      re_string_skip_bytes (regexp, cur_char_size);
3357      return REG_NOERROR;
3358    }
3359#endif /* RE_ENABLE_I18N */
3360  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3361  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3362      || token->type == OP_OPEN_EQUIV_CLASS)
3363    return parse_bracket_symbol (elem, regexp, token);
3364  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3365    {
3366      /* A '-' must only appear as anything but a range indicator before
3367         the closing bracket.  Everything else is an error.  */
3368      re_token_t token2;
3369      (void) peek_token_bracket (&token2, regexp, syntax);
3370      if (token2.type != OP_CLOSE_BRACKET)
3371        /* The actual error value is not standardized since this whole
3372           case is undefined.  But ERANGE makes good sense.  */
3373        return REG_ERANGE;
3374    }
3375  elem->type = SB_CHAR;
3376  elem->opr.ch = token->opr.c;
3377  return REG_NOERROR;
3378}
3379
3380/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3381   such as [:<character_class>:], [.<collating_element>.], and
3382   [=<equivalent_class>=].  */
3383
3384static reg_errcode_t
3385parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3386                      re_token_t *token)
3387{
3388  unsigned char ch, delim = token->opr.c;
3389  int i = 0;
3390  if (re_string_eoi(regexp))
3391    return REG_EBRACK;
3392  for (;; ++i)
3393    {
3394      if (i >= BRACKET_NAME_BUF_SIZE)
3395        return REG_EBRACK;
3396      if (token->type == OP_OPEN_CHAR_CLASS)
3397        ch = re_string_fetch_byte_case (regexp);
3398      else
3399        ch = re_string_fetch_byte (regexp);
3400      if (re_string_eoi(regexp))
3401        return REG_EBRACK;
3402      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3403        break;
3404      elem->opr.name[i] = ch;
3405    }
3406  re_string_skip_bytes (regexp, 1);
3407  elem->opr.name[i] = '\0';
3408  switch (token->type)
3409    {
3410    case OP_OPEN_COLL_ELEM:
3411      elem->type = COLL_SYM;
3412      break;
3413    case OP_OPEN_EQUIV_CLASS:
3414      elem->type = EQUIV_CLASS;
3415      break;
3416    case OP_OPEN_CHAR_CLASS:
3417      elem->type = CHAR_CLASS;
3418      break;
3419    default:
3420      break;
3421    }
3422  return REG_NOERROR;
3423}
3424
3425  /* Helper function for parse_bracket_exp.
3426     Build the equivalence class which is represented by NAME.
3427     The result are written to MBCSET and SBCSET.
3428     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3429     is a pointer argument since we may update it.  */
3430
3431static reg_errcode_t
3432#ifdef RE_ENABLE_I18N
3433build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3434                   int *equiv_class_alloc, const unsigned char *name)
3435#else /* not RE_ENABLE_I18N */
3436build_equiv_class (bitset_t sbcset, const unsigned char *name)
3437#endif /* not RE_ENABLE_I18N */
3438{
3439#ifdef _LIBC
3440  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3441  if (nrules != 0)
3442    {
3443      const int32_t *table, *indirect;
3444      const unsigned char *weights, *extra, *cp;
3445      unsigned char char_buf[2];
3446      int32_t idx1, idx2;
3447      unsigned int ch;
3448      size_t len;
3449      /* This #include defines a local function!  */
3450# include <locale/weight.h>
3451      /* Calculate the index for equivalence class.  */
3452      cp = name;
3453      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3454      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455                                               _NL_COLLATE_WEIGHTMB);
3456      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3457                                                   _NL_COLLATE_EXTRAMB);
3458      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3459                                                _NL_COLLATE_INDIRECTMB);
3460      idx1 = findidx (&cp);
3461      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3462        /* This isn't a valid character.  */
3463        return REG_ECOLLATE;
3464
3465      /* Build single byte matcing table for this equivalence class.  */
3466      char_buf[1] = (unsigned char) '\0';
3467      len = weights[idx1 & 0xffffff];
3468      for (ch = 0; ch < SBC_MAX; ++ch)
3469        {
3470          char_buf[0] = ch;
3471          cp = char_buf;
3472          idx2 = findidx (&cp);
3473/*
3474          idx2 = table[ch];
3475*/
3476          if (idx2 == 0)
3477            /* This isn't a valid character.  */
3478            continue;
3479          /* Compare only if the length matches and the collation rule
3480             index is the same.  */
3481          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3482            {
3483              int cnt = 0;
3484
3485              while (cnt <= len &&
3486                     weights[(idx1 & 0xffffff) + 1 + cnt]
3487                     == weights[(idx2 & 0xffffff) + 1 + cnt])
3488                ++cnt;
3489
3490              if (cnt > len)
3491                bitset_set (sbcset, ch);
3492            }
3493        }
3494      /* Check whether the array has enough space.  */
3495      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3496        {
3497          /* Not enough, realloc it.  */
3498          /* +1 in case of mbcset->nequiv_classes is 0.  */
3499          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3500          /* Use realloc since the array is NULL if *alloc == 0.  */
3501          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3502                                                   int32_t,
3503                                                   new_equiv_class_alloc);
3504          if (BE (new_equiv_classes == NULL, 0))
3505            return REG_ESPACE;
3506          mbcset->equiv_classes = new_equiv_classes;
3507          *equiv_class_alloc = new_equiv_class_alloc;
3508        }
3509      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3510    }
3511  else
3512#endif /* _LIBC */
3513    {
3514      if (BE (strlen ((const char *) name) != 1, 0))
3515        return REG_ECOLLATE;
3516      bitset_set (sbcset, *name);
3517    }
3518  return REG_NOERROR;
3519}
3520
3521  /* Helper function for parse_bracket_exp.
3522     Build the character class which is represented by NAME.
3523     The result are written to MBCSET and SBCSET.
3524     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3525     is a pointer argument since we may update it.  */
3526
3527static reg_errcode_t
3528#ifdef RE_ENABLE_I18N
3529build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3530                 re_charset_t *mbcset, int *char_class_alloc,
3531                 const char *class_name, reg_syntax_t syntax)
3532#else /* not RE_ENABLE_I18N */
3533build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3534                 const char *class_name, reg_syntax_t syntax)
3535#endif /* not RE_ENABLE_I18N */
3536{
3537  int i;
3538
3539  /* In case of REG_ICASE "upper" and "lower" match the both of
3540     upper and lower cases.  */
3541  if ((syntax & RE_ICASE)
3542      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3543    class_name = "alpha";
3544
3545#ifdef RE_ENABLE_I18N
3546  /* Check the space of the arrays.  */
3547  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3548    {
3549      /* Not enough, realloc it.  */
3550      /* +1 in case of mbcset->nchar_classes is 0.  */
3551      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3552      /* Use realloc since array is NULL if *alloc == 0.  */
3553      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3554                                               new_char_class_alloc);
3555      if (BE (new_char_classes == NULL, 0))
3556        return REG_ESPACE;
3557      mbcset->char_classes = new_char_classes;
3558      *char_class_alloc = new_char_class_alloc;
3559    }
3560  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3561#endif /* RE_ENABLE_I18N */
3562
3563#define BUILD_CHARCLASS_LOOP(ctype_func)        \
3564  do {                                          \
3565    if (BE (trans != NULL, 0))                  \
3566      {                                         \
3567        for (i = 0; i < SBC_MAX; ++i)           \
3568          if (ctype_func (i))                   \
3569            bitset_set (sbcset, trans[i]);      \
3570      }                                         \
3571    else                                        \
3572      {                                         \
3573        for (i = 0; i < SBC_MAX; ++i)           \
3574          if (ctype_func (i))                   \
3575            bitset_set (sbcset, i);             \
3576      }                                         \
3577  } while (0)
3578
3579  if (strcmp (class_name, "alnum") == 0)
3580    BUILD_CHARCLASS_LOOP (isalnum);
3581  else if (strcmp (class_name, "cntrl") == 0)
3582    BUILD_CHARCLASS_LOOP (iscntrl);
3583  else if (strcmp (class_name, "lower") == 0)
3584    BUILD_CHARCLASS_LOOP (islower);
3585  else if (strcmp (class_name, "space") == 0)
3586    BUILD_CHARCLASS_LOOP (isspace);
3587  else if (strcmp (class_name, "alpha") == 0)
3588    BUILD_CHARCLASS_LOOP (isalpha);
3589  else if (strcmp (class_name, "digit") == 0)
3590    BUILD_CHARCLASS_LOOP (isdigit);
3591  else if (strcmp (class_name, "print") == 0)
3592    BUILD_CHARCLASS_LOOP (isprint);
3593  else if (strcmp (class_name, "upper") == 0)
3594    BUILD_CHARCLASS_LOOP (isupper);
3595  else if (strcmp (class_name, "blank") == 0)
3596#ifndef GAWK
3597    BUILD_CHARCLASS_LOOP (isblank);
3598#else
3599    /* see comments above */
3600    BUILD_CHARCLASS_LOOP (is_blank);
3601#endif
3602  else if (strcmp (class_name, "graph") == 0)
3603    BUILD_CHARCLASS_LOOP (isgraph);
3604  else if (strcmp (class_name, "punct") == 0)
3605    BUILD_CHARCLASS_LOOP (ispunct);
3606  else if (strcmp (class_name, "xdigit") == 0)
3607    BUILD_CHARCLASS_LOOP (isxdigit);
3608  else
3609    return REG_ECTYPE;
3610
3611  return REG_NOERROR;
3612}
3613
3614static bin_tree_t *
3615build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3616                    const char *class_name,
3617                    const char *extra, int non_match,
3618                    reg_errcode_t *err)
3619{
3620  re_bitset_ptr_t sbcset;
3621#ifdef RE_ENABLE_I18N
3622  re_charset_t *mbcset;
3623  int alloc = 0;
3624#endif /* not RE_ENABLE_I18N */
3625  reg_errcode_t ret;
3626  re_token_t br_token;
3627  bin_tree_t *tree;
3628
3629  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3630#ifdef RE_ENABLE_I18N
3631  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3632#endif /* RE_ENABLE_I18N */
3633
3634#ifdef RE_ENABLE_I18N
3635  if (BE (sbcset == NULL || mbcset == NULL, 0))
3636#else /* not RE_ENABLE_I18N */
3637  if (BE (sbcset == NULL, 0))
3638#endif /* not RE_ENABLE_I18N */
3639    {
3640      *err = REG_ESPACE;
3641      return NULL;
3642    }
3643
3644  if (non_match)
3645    {
3646#ifdef RE_ENABLE_I18N
3647      mbcset->non_match = 1;
3648#endif /* not RE_ENABLE_I18N */
3649    }
3650
3651  /* We don't care the syntax in this case.  */
3652  ret = build_charclass (trans, sbcset,
3653#ifdef RE_ENABLE_I18N
3654                         mbcset, &alloc,
3655#endif /* RE_ENABLE_I18N */
3656                         class_name, 0);
3657
3658  if (BE (ret != REG_NOERROR, 0))
3659    {
3660      re_free (sbcset);
3661#ifdef RE_ENABLE_I18N
3662      free_charset (mbcset);
3663#endif /* RE_ENABLE_I18N */
3664      *err = ret;
3665      return NULL;
3666    }
3667  /* \w match '_' also.  */
3668  for (; *extra; extra++)
3669    bitset_set (sbcset, *extra);
3670
3671  /* If it is non-matching list.  */
3672  if (non_match)
3673    bitset_not (sbcset);
3674
3675#ifdef RE_ENABLE_I18N
3676  /* Ensure only single byte characters are set.  */
3677  if (dfa->mb_cur_max > 1)
3678    bitset_mask (sbcset, dfa->sb_char);
3679#endif
3680
3681  /* Build a tree for simple bracket.  */
3682  br_token.type = SIMPLE_BRACKET;
3683  br_token.opr.sbcset = sbcset;
3684  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685  if (BE (tree == NULL, 0))
3686    goto build_word_op_espace;
3687
3688#ifdef RE_ENABLE_I18N
3689  if (dfa->mb_cur_max > 1)
3690    {
3691      bin_tree_t *mbc_tree;
3692      /* Build a tree for complex bracket.  */
3693      br_token.type = COMPLEX_BRACKET;
3694      br_token.opr.mbcset = mbcset;
3695      dfa->has_mb_node = 1;
3696      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3697      if (BE (mbc_tree == NULL, 0))
3698        goto build_word_op_espace;
3699      /* Then join them by ALT node.  */
3700      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3701      if (BE (mbc_tree != NULL, 1))
3702        return tree;
3703    }
3704  else
3705    {
3706      free_charset (mbcset);
3707      return tree;
3708    }
3709#else /* not RE_ENABLE_I18N */
3710  return tree;
3711#endif /* not RE_ENABLE_I18N */
3712
3713 build_word_op_espace:
3714  re_free (sbcset);
3715#ifdef RE_ENABLE_I18N
3716  free_charset (mbcset);
3717#endif /* RE_ENABLE_I18N */
3718  *err = REG_ESPACE;
3719  return NULL;
3720}
3721
3722/* This is intended for the expressions like "a{1,3}".
3723   Fetch a number from `input', and return the number.
3724   Return -1, if the number field is empty like "{,1}".
3725   Return -2, if an error has occurred.  */
3726
3727static int
3728fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3729{
3730  int num = -1;
3731  unsigned char c;
3732  while (1)
3733    {
3734      fetch_token (token, input, syntax);
3735      c = token->opr.c;
3736      if (BE (token->type == END_OF_RE, 0))
3737        return -2;
3738      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3739        break;
3740      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3741             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3742      num = (num > RE_DUP_MAX) ? -2 : num;
3743    }
3744  return num;
3745}
3746\f
3747#ifdef RE_ENABLE_I18N
3748static void
3749free_charset (re_charset_t *cset)
3750{
3751  re_free (cset->mbchars);
3752# ifdef _LIBC
3753  re_free (cset->coll_syms);
3754  re_free (cset->equiv_classes);
3755  re_free (cset->range_starts);
3756  re_free (cset->range_ends);
3757# endif
3758  re_free (cset->char_classes);
3759  re_free (cset);
3760}
3761#endif /* RE_ENABLE_I18N */
3762\f
3763/* Functions for binary tree operation.  */
3764
3765/* Create a tree node.  */
3766
3767static bin_tree_t *
3768create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769             re_token_type_t type)
3770{
3771  re_token_t t;
3772  t.type = type;
3773  return create_token_tree (dfa, left, right, &t);
3774}
3775
3776static bin_tree_t *
3777create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3778                   const re_token_t *token)
3779{
3780  bin_tree_t *tree;
3781  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3782    {
3783      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3784
3785      if (storage == NULL)
3786        return NULL;
3787      storage->next = dfa->str_tree_storage;
3788      dfa->str_tree_storage = storage;
3789      dfa->str_tree_storage_idx = 0;
3790    }
3791  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3792
3793  tree->parent = NULL;
3794  tree->left = left;
3795  tree->right = right;
3796  tree->token = *token;
3797  tree->token.duplicated = 0;
3798  tree->token.opt_subexp = 0;
3799  tree->first = NULL;
3800  tree->next = NULL;
3801  tree->node_idx = -1;
3802
3803  if (left != NULL)
3804    left->parent = tree;
3805  if (right != NULL)
3806    right->parent = tree;
3807  return tree;
3808}
3809
3810/* Mark the tree SRC as an optional subexpression.
3811   To be called from preorder or postorder.  */
3812
3813static reg_errcode_t
3814mark_opt_subexp (void *extra, bin_tree_t *node)
3815{
3816  int idx = (int) (intptr_t) extra;
3817  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3818    node->token.opt_subexp = 1;
3819
3820  return REG_NOERROR;
3821}
3822
3823/* Free the allocated memory inside NODE. */
3824
3825static void
3826free_token (re_token_t *node)
3827{
3828#ifdef RE_ENABLE_I18N
3829  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3830    free_charset (node->opr.mbcset);
3831  else
3832#endif /* RE_ENABLE_I18N */
3833    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3834      re_free (node->opr.sbcset);
3835}
3836
3837/* Worker function for tree walking.  Free the allocated memory inside NODE
3838   and its children. */
3839
3840static reg_errcode_t
3841free_tree (void *extra, bin_tree_t *node)
3842{
3843  free_token (&node->token);
3844  return REG_NOERROR;
3845}
3846
3847
3848/* Duplicate the node SRC, and return new node.  This is a preorder
3849   visit similar to the one implemented by the generic visitor, but
3850   we need more infrastructure to maintain two parallel trees --- so,
3851   it's easier to duplicate.  */
3852
3853static bin_tree_t *
3854duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3855{
3856  const bin_tree_t *node;
3857  bin_tree_t *dup_root;
3858  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3859
3860  for (node = root; ; )
3861    {
3862      /* Create a new tree and link it back to the current parent.  */
3863      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3864      if (*p_new == NULL)
3865        return NULL;
3866      (*p_new)->parent = dup_node;
3867      (*p_new)->token.duplicated = 1;
3868      dup_node = *p_new;
3869
3870      /* Go to the left node, or up and to the right.  */
3871      if (node->left)
3872        {
3873          node = node->left;
3874          p_new = &dup_node->left;
3875        }
3876      else
3877        {
3878          const bin_tree_t *prev = NULL;
3879          while (node->right == prev || node->right == NULL)
3880            {
3881              prev = node;
3882              node = node->parent;
3883              dup_node = dup_node->parent;
3884              if (!node)
3885                return dup_root;
3886            }
3887          node = node->right;
3888          p_new = &dup_node->right;
3889        }
3890    }
3891}