compat / regex / regcomp.con commit sequencer (rebase -i): allow fast-forwarding for edit/reword (bcbb68b)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                          size_t length, reg_syntax_t syntax);
  23static void re_compile_fastmap_iter (regex_t *bufp,
  24                                     const re_dfastate_t *init_state,
  25                                     char *fastmap);
  26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27#ifdef RE_ENABLE_I18N
  28static void free_charset (re_charset_t *cset);
  29#endif /* RE_ENABLE_I18N */
  30static void free_workarea_compile (regex_t *preg);
  31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32#ifdef RE_ENABLE_I18N
  33static void optimize_utf8 (re_dfa_t *dfa);
  34#endif
  35static reg_errcode_t analyze (regex_t *preg);
  36static reg_errcode_t preorder (bin_tree_t *root,
  37                               reg_errcode_t (fn (void *, bin_tree_t *)),
  38                               void *extra);
  39static reg_errcode_t postorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                 bin_tree_t *node);
  46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                   unsigned int constraint);
  52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                         int node, int root);
  55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56static int fetch_number (re_string_t *input, re_token_t *token,
  57                         reg_syntax_t syntax);
  58static int peek_token (re_token_t *token, re_string_t *input,
  59                        reg_syntax_t syntax) internal_function;
  60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                          reg_syntax_t syntax, reg_errcode_t *err);
  62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                  re_token_t *token, reg_syntax_t syntax,
  64                                  int nest, reg_errcode_t *err);
  65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                 re_token_t *token, reg_syntax_t syntax,
  67                                 int nest, reg_errcode_t *err);
  68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                     re_token_t *token, reg_syntax_t syntax,
  70                                     int nest, reg_errcode_t *err);
  71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                  re_token_t *token, reg_syntax_t syntax,
  73                                  int nest, reg_errcode_t *err);
  74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                 re_dfa_t *dfa, re_token_t *token,
  76                                 reg_syntax_t syntax, reg_errcode_t *err);
  77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                      re_token_t *token, reg_syntax_t syntax,
  79                                      reg_errcode_t *err);
  80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                            re_string_t *regexp,
  82                                            re_token_t *token, int token_len,
  83                                            re_dfa_t *dfa,
  84                                            reg_syntax_t syntax,
  85                                            int accept_hyphen);
  86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                          re_string_t *regexp,
  88                                          re_token_t *token);
  89#ifdef RE_ENABLE_I18N
  90static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                        re_charset_t *mbcset,
  92                                        int *equiv_class_alloc,
  93                                        const unsigned char *name);
  94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                      bitset_t sbcset,
  96                                      re_charset_t *mbcset,
  97                                      int *char_class_alloc,
  98                                      const char *class_name,
  99                                      reg_syntax_t syntax);
 100#else  /* not RE_ENABLE_I18N */
 101static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                        const unsigned char *name);
 103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                      bitset_t sbcset,
 105                                      const char *class_name,
 106                                      reg_syntax_t syntax);
 107#endif /* not RE_ENABLE_I18N */
 108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                       RE_TRANSLATE_TYPE trans,
 110                                       const char *class_name,
 111                                       const char *extra,
 112                                       int non_match, reg_errcode_t *err);
 113static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                bin_tree_t *left, bin_tree_t *right,
 115                                re_token_type_t type);
 116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                      bin_tree_t *left, bin_tree_t *right,
 118                                      const re_token_t *token);
 119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120static void free_token (re_token_t *node);
 121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123\f
 124/* This table gives an error message for each of the error codes listed
 125   in regex.h.  Obviously the order here has to be same as there.
 126   POSIX doesn't require that we do anything for REG_NOERROR,
 127   but why not be nice?  */
 128
 129const char __re_error_msgid[] attribute_hidden =
 130  {
 131#define REG_NOERROR_IDX 0
 132    gettext_noop ("Success")    /* REG_NOERROR */
 133    "\0"
 134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135    gettext_noop ("No match")   /* REG_NOMATCH */
 136    "\0"
 137#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139    "\0"
 140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142    "\0"
 143#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145    "\0"
 146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148    "\0"
 149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151    "\0"
 152#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153    gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154    "\0"
 155#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157    "\0"
 158#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160    "\0"
 161#define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163    "\0"
 164#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165    gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166    "\0"
 167#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169    "\0"
 170#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172    "\0"
 173#define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175    "\0"
 176#define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178    "\0"
 179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181  };
 182
 183const size_t __re_error_msgid_idx[] attribute_hidden =
 184  {
 185    REG_NOERROR_IDX,
 186    REG_NOMATCH_IDX,
 187    REG_BADPAT_IDX,
 188    REG_ECOLLATE_IDX,
 189    REG_ECTYPE_IDX,
 190    REG_EESCAPE_IDX,
 191    REG_ESUBREG_IDX,
 192    REG_EBRACK_IDX,
 193    REG_EPAREN_IDX,
 194    REG_EBRACE_IDX,
 195    REG_BADBR_IDX,
 196    REG_ERANGE_IDX,
 197    REG_ESPACE_IDX,
 198    REG_BADRPT_IDX,
 199    REG_EEND_IDX,
 200    REG_ESIZE_IDX,
 201    REG_ERPAREN_IDX
 202  };
 203\f
 204/* Entry points for GNU code.  */
 205
 206
 207#ifdef ZOS_USS
 208
 209/* For ZOS USS we must define btowc */
 210
 211wchar_t 
 212btowc (int c)
 213{
 214   wchar_t wtmp[2];
 215   char tmp[2];
 216
 217   tmp[0] = c;
 218   tmp[1] = 0;
 219
 220   mbtowc (wtmp, tmp, 1);
 221   return wtmp[0];
 222}
 223#endif
 224
 225/* re_compile_pattern is the GNU regular expression compiler: it
 226   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 227   Returns 0 if the pattern was valid, otherwise an error string.
 228
 229   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 230   are set in BUFP on entry.  */
 231
 232const char *
 233re_compile_pattern (const char *pattern,
 234                    size_t length,
 235                    struct re_pattern_buffer *bufp)
 236{
 237  reg_errcode_t ret;
 238
 239  /* And GNU code determines whether or not to get register information
 240     by passing null for the REGS argument to re_match, etc., not by
 241     setting no_sub, unless RE_NO_SUB is set.  */
 242  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 243
 244  /* Match anchors at newline.  */
 245  bufp->newline_anchor = 1;
 246
 247  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 248
 249  if (!ret)
 250    return NULL;
 251  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 252}
 253#ifdef _LIBC
 254weak_alias (__re_compile_pattern, re_compile_pattern)
 255#endif
 256
 257/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 258   also be assigned to arbitrarily: each pattern buffer stores its own
 259   syntax, so it can be changed between regex compilations.  */
 260/* This has no initializer because initialized variables in Emacs
 261   become read-only after dumping.  */
 262reg_syntax_t re_syntax_options;
 263
 264
 265/* Specify the precise syntax of regexps for compilation.  This provides
 266   for compatibility for various utilities which historically have
 267   different, incompatible syntaxes.
 268
 269   The argument SYNTAX is a bit mask comprised of the various bits
 270   defined in regex.h.  We return the old syntax.  */
 271
 272reg_syntax_t
 273re_set_syntax (reg_syntax_t syntax)
 274{
 275  reg_syntax_t ret = re_syntax_options;
 276
 277  re_syntax_options = syntax;
 278  return ret;
 279}
 280#ifdef _LIBC
 281weak_alias (__re_set_syntax, re_set_syntax)
 282#endif
 283
 284int
 285re_compile_fastmap (struct re_pattern_buffer *bufp)
 286{
 287  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 288  char *fastmap = bufp->fastmap;
 289
 290  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 291  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 292  if (dfa->init_state != dfa->init_state_word)
 293    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 294  if (dfa->init_state != dfa->init_state_nl)
 295    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 296  if (dfa->init_state != dfa->init_state_begbuf)
 297    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 298  bufp->fastmap_accurate = 1;
 299  return 0;
 300}
 301#ifdef _LIBC
 302weak_alias (__re_compile_fastmap, re_compile_fastmap)
 303#endif
 304
 305static inline void
 306__attribute ((always_inline))
 307re_set_fastmap (char *fastmap, int icase, int ch)
 308{
 309  fastmap[ch] = 1;
 310  if (icase)
 311    fastmap[tolower (ch)] = 1;
 312}
 313
 314/* Helper function for re_compile_fastmap.
 315   Compile fastmap for the initial_state INIT_STATE.  */
 316
 317static void
 318re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 319                         char *fastmap)
 320{
 321  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 322  int node_cnt;
 323  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 324  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 325    {
 326      int node = init_state->nodes.elems[node_cnt];
 327      re_token_type_t type = dfa->nodes[node].type;
 328
 329      if (type == CHARACTER)
 330        {
 331          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 332#ifdef RE_ENABLE_I18N
 333          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 334            {
 335              unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 336              wchar_t wc;
 337              mbstate_t state;
 338
 339              p = buf;
 340              *p++ = dfa->nodes[node].opr.c;
 341              while (++node < dfa->nodes_len
 342                     && dfa->nodes[node].type == CHARACTER
 343                     && dfa->nodes[node].mb_partial)
 344                *p++ = dfa->nodes[node].opr.c;
 345              memset (&state, '\0', sizeof (state));
 346              if (__mbrtowc (&wc, (const char *) buf, p - buf,
 347                             &state) == p - buf
 348                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 349                      != (size_t) -1))
 350                re_set_fastmap (fastmap, 0, buf[0]);
 351              re_free (buf);
 352            }
 353#endif
 354        }
 355      else if (type == SIMPLE_BRACKET)
 356        {
 357          int i, ch;
 358          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 359            {
 360              int j;
 361              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 362              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 363                if (w & ((bitset_word_t) 1 << j))
 364                  re_set_fastmap (fastmap, icase, ch);
 365            }
 366        }
 367#ifdef RE_ENABLE_I18N
 368      else if (type == COMPLEX_BRACKET)
 369        {
 370          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 371          int i;
 372
 373# ifdef _LIBC
 374          /* See if we have to try all bytes which start multiple collation
 375             elements.
 376             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 377                  collation element, and don't catch 'b' since 'b' is
 378                  the only collation element which starts from 'b' (and
 379                  it is caught by SIMPLE_BRACKET).  */
 380              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 381                  && (cset->ncoll_syms || cset->nranges))
 382                {
 383                  const int32_t *table = (const int32_t *)
 384                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 385                  for (i = 0; i < SBC_MAX; ++i)
 386                    if (table[i] < 0)
 387                      re_set_fastmap (fastmap, icase, i);
 388                }
 389# endif /* _LIBC */
 390
 391          /* See if we have to start the match at all multibyte characters,
 392             i.e. where we would not find an invalid sequence.  This only
 393             applies to multibyte character sets; for single byte character
 394             sets, the SIMPLE_BRACKET again suffices.  */
 395          if (dfa->mb_cur_max > 1
 396              && (cset->nchar_classes || cset->non_match || cset->nranges
 397# ifdef _LIBC
 398                  || cset->nequiv_classes
 399# endif /* _LIBC */
 400                 ))
 401            {
 402              unsigned char c = 0;
 403              do
 404                {
 405                  mbstate_t mbs;
 406                  memset (&mbs, 0, sizeof (mbs));
 407                  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 408                    re_set_fastmap (fastmap, false, (int) c);
 409                }
 410              while (++c != 0);
 411            }
 412
 413          else
 414            {
 415              /* ... Else catch all bytes which can start the mbchars.  */
 416              for (i = 0; i < cset->nmbchars; ++i)
 417                {
 418                  char buf[256];
 419                  mbstate_t state;
 420                  memset (&state, '\0', sizeof (state));
 421                  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 422                    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 423                  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 424                    {
 425                      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 426                          != (size_t) -1)
 427                        re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 428                    }
 429                }
 430            }
 431        }
 432#endif /* RE_ENABLE_I18N */
 433      else if (type == OP_PERIOD
 434#ifdef RE_ENABLE_I18N
 435               || type == OP_UTF8_PERIOD
 436#endif /* RE_ENABLE_I18N */
 437               || type == END_OF_RE)
 438        {
 439          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 440          if (type == END_OF_RE)
 441            bufp->can_be_null = 1;
 442          return;
 443        }
 444    }
 445}
 446\f
 447/* Entry point for POSIX code.  */
 448/* regcomp takes a regular expression as a string and compiles it.
 449
 450   PREG is a regex_t *.  We do not expect any fields to be initialized,
 451   since POSIX says we shouldn't.  Thus, we set
 452
 453     `buffer' to the compiled pattern;
 454     `used' to the length of the compiled pattern;
 455     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 456       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 457       RE_SYNTAX_POSIX_BASIC;
 458     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 459     `fastmap' to an allocated space for the fastmap;
 460     `fastmap_accurate' to zero;
 461     `re_nsub' to the number of subexpressions in PATTERN.
 462
 463   PATTERN is the address of the pattern string.
 464
 465   CFLAGS is a series of bits which affect compilation.
 466
 467     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 468     use POSIX basic syntax.
 469
 470     If REG_NEWLINE is set, then . and [^...] don't match newline.
 471     Also, regexec will try a match beginning after every newline.
 472
 473     If REG_ICASE is set, then we considers upper- and lowercase
 474     versions of letters to be equivalent when matching.
 475
 476     If REG_NOSUB is set, then when PREG is passed to regexec, that
 477     routine will report only success or failure, and nothing about the
 478     registers.
 479
 480   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 481   the return codes and their meanings.)  */
 482
 483int
 484regcomp (regex_t *__restrict preg,
 485         const char *__restrict pattern,
 486         int cflags)
 487{
 488  reg_errcode_t ret;
 489  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 490                         : RE_SYNTAX_POSIX_BASIC);
 491
 492  preg->buffer = NULL;
 493  preg->allocated = 0;
 494  preg->used = 0;
 495
 496  /* Try to allocate space for the fastmap.  */
 497  preg->fastmap = re_malloc (char, SBC_MAX);
 498  if (BE (preg->fastmap == NULL, 0))
 499    return REG_ESPACE;
 500
 501  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 502
 503  /* If REG_NEWLINE is set, newlines are treated differently.  */
 504  if (cflags & REG_NEWLINE)
 505    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 506      syntax &= ~RE_DOT_NEWLINE;
 507      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 508      /* It also changes the matching behavior.  */
 509      preg->newline_anchor = 1;
 510    }
 511  else
 512    preg->newline_anchor = 0;
 513  preg->no_sub = !!(cflags & REG_NOSUB);
 514  preg->translate = NULL;
 515
 516  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 517
 518  /* POSIX doesn't distinguish between an unmatched open-group and an
 519     unmatched close-group: both are REG_EPAREN.  */
 520  if (ret == REG_ERPAREN)
 521    ret = REG_EPAREN;
 522
 523  /* We have already checked preg->fastmap != NULL.  */
 524  if (BE (ret == REG_NOERROR, 1))
 525    /* Compute the fastmap now, since regexec cannot modify the pattern
 526       buffer.  This function never fails in this implementation.  */
 527    (void) re_compile_fastmap (preg);
 528  else
 529    {
 530      /* Some error occurred while compiling the expression.  */
 531      re_free (preg->fastmap);
 532      preg->fastmap = NULL;
 533    }
 534
 535  return (int) ret;
 536}
 537#ifdef _LIBC
 538weak_alias (__regcomp, regcomp)
 539#endif
 540
 541/* Returns a message corresponding to an error code, ERRCODE, returned
 542   from either regcomp or regexec.   We don't use PREG here.  */
 543
 544size_t
 545regerror(int errcode, const regex_t *__restrict preg,
 546         char *__restrict errbuf, size_t errbuf_size)
 547{
 548  const char *msg;
 549  size_t msg_size;
 550
 551  if (BE (errcode < 0
 552          || errcode >= (int) (sizeof (__re_error_msgid_idx)
 553                               / sizeof (__re_error_msgid_idx[0])), 0))
 554    /* Only error codes returned by the rest of the code should be passed
 555       to this routine.  If we are given anything else, or if other regex
 556       code generates an invalid error code, then the program has a bug.
 557       Dump core so we can fix it.  */
 558    abort ();
 559
 560  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 561
 562  msg_size = strlen (msg) + 1; /* Includes the null.  */
 563
 564  if (BE (errbuf_size != 0, 1))
 565    {
 566      if (BE (msg_size > errbuf_size, 0))
 567        {
 568          memcpy (errbuf, msg, errbuf_size - 1);
 569          errbuf[errbuf_size - 1] = 0;
 570        }
 571      else
 572        memcpy (errbuf, msg, msg_size);
 573    }
 574
 575  return msg_size;
 576}
 577#ifdef _LIBC
 578weak_alias (__regerror, regerror)
 579#endif
 580
 581
 582#ifdef RE_ENABLE_I18N
 583/* This static array is used for the map to single-byte characters when
 584   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 585   it the same all the time.  UTF-8 is the preferred encoding so this is
 586   a worthwhile optimization.  */
 587#if __GNUC__ >= 3
 588static const bitset_t utf8_sb_map = {
 589  /* Set the first 128 bits.  */
 590  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 591};
 592#else /* ! (__GNUC__ >= 3) */
 593static bitset_t utf8_sb_map;
 594#endif /* __GNUC__ >= 3 */
 595#endif /* RE_ENABLE_I18N */
 596
 597
 598static void
 599free_dfa_content (re_dfa_t *dfa)
 600{
 601  int i, j;
 602
 603  if (dfa->nodes)
 604    for (i = 0; i < dfa->nodes_len; ++i)
 605      free_token (dfa->nodes + i);
 606  re_free (dfa->nexts);
 607  for (i = 0; i < dfa->nodes_len; ++i)
 608    {
 609      if (dfa->eclosures != NULL)
 610        re_node_set_free (dfa->eclosures + i);
 611      if (dfa->inveclosures != NULL)
 612        re_node_set_free (dfa->inveclosures + i);
 613      if (dfa->edests != NULL)
 614        re_node_set_free (dfa->edests + i);
 615    }
 616  re_free (dfa->edests);
 617  re_free (dfa->eclosures);
 618  re_free (dfa->inveclosures);
 619  re_free (dfa->nodes);
 620
 621  if (dfa->state_table)
 622    for (i = 0; i <= dfa->state_hash_mask; ++i)
 623      {
 624        struct re_state_table_entry *entry = dfa->state_table + i;
 625        for (j = 0; j < entry->num; ++j)
 626          {
 627            re_dfastate_t *state = entry->array[j];
 628            free_state (state);
 629          }
 630        re_free (entry->array);
 631      }
 632  re_free (dfa->state_table);
 633#ifdef RE_ENABLE_I18N
 634  if (dfa->sb_char != utf8_sb_map)
 635    re_free (dfa->sb_char);
 636#endif
 637  re_free (dfa->subexp_map);
 638#ifdef DEBUG
 639  re_free (dfa->re_str);
 640#endif
 641
 642  re_free (dfa);
 643}
 644
 645
 646/* Free dynamically allocated space used by PREG.  */
 647
 648void
 649regfree (regex_t *preg)
 650{
 651  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 652  if (BE (dfa != NULL, 1))
 653    free_dfa_content (dfa);
 654  preg->buffer = NULL;
 655  preg->allocated = 0;
 656
 657  re_free (preg->fastmap);
 658  preg->fastmap = NULL;
 659
 660  re_free (preg->translate);
 661  preg->translate = NULL;
 662}
 663#ifdef _LIBC
 664weak_alias (__regfree, regfree)
 665#endif
 666\f
 667/* Entry points compatible with 4.2 BSD regex library.  We don't define
 668   them unless specifically requested.  */
 669
 670#if defined _REGEX_RE_COMP || defined _LIBC
 671
 672/* BSD has one and only one pattern buffer.  */
 673static struct re_pattern_buffer re_comp_buf;
 674
 675char *
 676# ifdef _LIBC
 677/* Make these definitions weak in libc, so POSIX programs can redefine
 678   these names if they don't use our functions, and still use
 679   regcomp/regexec above without link errors.  */
 680weak_function
 681# endif
 682re_comp (s)
 683     const char *s;
 684{
 685  reg_errcode_t ret;
 686  char *fastmap;
 687
 688  if (!s)
 689    {
 690      if (!re_comp_buf.buffer)
 691        return gettext ("No previous regular expression");
 692      return 0;
 693    }
 694
 695  if (re_comp_buf.buffer)
 696    {
 697      fastmap = re_comp_buf.fastmap;
 698      re_comp_buf.fastmap = NULL;
 699      __regfree (&re_comp_buf);
 700      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 701      re_comp_buf.fastmap = fastmap;
 702    }
 703
 704  if (re_comp_buf.fastmap == NULL)
 705    {
 706      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 707      if (re_comp_buf.fastmap == NULL)
 708        return (char *) gettext (__re_error_msgid
 709                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
 710    }
 711
 712  /* Since `re_exec' always passes NULL for the `regs' argument, we
 713     don't need to initialize the pattern buffer fields which affect it.  */
 714
 715  /* Match anchors at newlines.  */
 716  re_comp_buf.newline_anchor = 1;
 717
 718  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 719
 720  if (!ret)
 721    return NULL;
 722
 723  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 724  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 725}
 726
 727#ifdef _LIBC
 728libc_freeres_fn (free_mem)
 729{
 730  __regfree (&re_comp_buf);
 731}
 732#endif
 733
 734#endif /* _REGEX_RE_COMP */
 735\f
 736/* Internal entry point.
 737   Compile the regular expression PATTERN, whose length is LENGTH.
 738   SYNTAX indicate regular expression's syntax.  */
 739
 740static reg_errcode_t
 741re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 742                     reg_syntax_t syntax)
 743{
 744  reg_errcode_t err = REG_NOERROR;
 745  re_dfa_t *dfa;
 746  re_string_t regexp;
 747
 748  /* Initialize the pattern buffer.  */
 749  preg->fastmap_accurate = 0;
 750  preg->syntax = syntax;
 751  preg->not_bol = preg->not_eol = 0;
 752  preg->used = 0;
 753  preg->re_nsub = 0;
 754  preg->can_be_null = 0;
 755  preg->regs_allocated = REGS_UNALLOCATED;
 756
 757  /* Initialize the dfa.  */
 758  dfa = (re_dfa_t *) preg->buffer;
 759  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 760    {
 761      /* If zero allocated, but buffer is non-null, try to realloc
 762         enough space.  This loses if buffer's address is bogus, but
 763         that is the user's responsibility.  If ->buffer is NULL this
 764         is a simple allocation.  */
 765      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 766      if (dfa == NULL)
 767        return REG_ESPACE;
 768      preg->allocated = sizeof (re_dfa_t);
 769      preg->buffer = (unsigned char *) dfa;
 770    }
 771  preg->used = sizeof (re_dfa_t);
 772
 773  err = init_dfa (dfa, length);
 774  if (BE (err != REG_NOERROR, 0))
 775    {
 776      free_dfa_content (dfa);
 777      preg->buffer = NULL;
 778      preg->allocated = 0;
 779      return err;
 780    }
 781#ifdef DEBUG
 782  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 783  dfa->re_str = re_malloc (char, length + 1);
 784  strncpy (dfa->re_str, pattern, length + 1);
 785#endif
 786
 787  __libc_lock_init (dfa->lock);
 788
 789  err = re_string_construct (&regexp, pattern, length, preg->translate,
 790                             syntax & RE_ICASE, dfa);
 791  if (BE (err != REG_NOERROR, 0))
 792    {
 793    re_compile_internal_free_return:
 794      free_workarea_compile (preg);
 795      re_string_destruct (&regexp);
 796      free_dfa_content (dfa);
 797      preg->buffer = NULL;
 798      preg->allocated = 0;
 799      return err;
 800    }
 801
 802  /* Parse the regular expression, and build a structure tree.  */
 803  preg->re_nsub = 0;
 804  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 805  if (BE (dfa->str_tree == NULL, 0))
 806    goto re_compile_internal_free_return;
 807
 808  /* Analyze the tree and create the nfa.  */
 809  err = analyze (preg);
 810  if (BE (err != REG_NOERROR, 0))
 811    goto re_compile_internal_free_return;
 812
 813#ifdef RE_ENABLE_I18N
 814  /* If possible, do searching in single byte encoding to speed things up.  */
 815  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 816    optimize_utf8 (dfa);
 817#endif
 818
 819  /* Then create the initial state of the dfa.  */
 820  err = create_initial_state (dfa);
 821
 822  /* Release work areas.  */
 823  free_workarea_compile (preg);
 824  re_string_destruct (&regexp);
 825
 826  if (BE (err != REG_NOERROR, 0))
 827    {
 828      free_dfa_content (dfa);
 829      preg->buffer = NULL;
 830      preg->allocated = 0;
 831    }
 832
 833  return err;
 834}
 835
 836/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 837   as the initial length of some arrays.  */
 838
 839static reg_errcode_t
 840init_dfa (re_dfa_t *dfa, size_t pat_len)
 841{
 842  unsigned int table_size;
 843#ifndef _LIBC
 844  char *codeset_name;
 845#endif
 846
 847  memset (dfa, '\0', sizeof (re_dfa_t));
 848
 849  /* Force allocation of str_tree_storage the first time.  */
 850  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 851
 852  /* Avoid overflows.  */
 853  if (pat_len == SIZE_MAX)
 854    return REG_ESPACE;
 855
 856  dfa->nodes_alloc = pat_len + 1;
 857  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 858
 859  /*  table_size = 2 ^ ceil(log pat_len) */
 860  for (table_size = 1; ; table_size <<= 1)
 861    if (table_size > pat_len)
 862      break;
 863
 864  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 865  dfa->state_hash_mask = table_size - 1;
 866
 867  dfa->mb_cur_max = MB_CUR_MAX;
 868#ifdef _LIBC
 869  if (dfa->mb_cur_max == 6
 870      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 871    dfa->is_utf8 = 1;
 872  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 873                       != 0);
 874#else
 875# ifdef HAVE_LANGINFO_CODESET
 876  codeset_name = nl_langinfo (CODESET);
 877# else
 878  codeset_name = getenv ("LC_ALL");
 879  if (codeset_name == NULL || codeset_name[0] == '\0')
 880    codeset_name = getenv ("LC_CTYPE");
 881  if (codeset_name == NULL || codeset_name[0] == '\0')
 882    codeset_name = getenv ("LANG");
 883  if (codeset_name == NULL)
 884    codeset_name = "";
 885  else if (strchr (codeset_name, '.') !=  NULL)
 886    codeset_name = strchr (codeset_name, '.') + 1;
 887# endif
 888
 889  /* strcasecmp isn't a standard interface. brute force check */
 890#if 0
 891  if (strcasecmp (codeset_name, "UTF-8") == 0
 892      || strcasecmp (codeset_name, "UTF8") == 0)
 893    dfa->is_utf8 = 1;
 894#else
 895  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 896      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 897      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 898      && (codeset_name[3] == '-'
 899          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 900          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 901    dfa->is_utf8 = 1;
 902#endif
 903
 904  /* We check exhaustively in the loop below if this charset is a
 905     superset of ASCII.  */
 906  dfa->map_notascii = 0;
 907#endif
 908
 909#ifdef RE_ENABLE_I18N
 910  if (dfa->mb_cur_max > 1)
 911    {
 912      if (dfa->is_utf8)
 913        {
 914#if !defined(__GNUC__) || __GNUC__ < 3
 915          static short utf8_sb_map_inited = 0;
 916
 917          if (! utf8_sb_map_inited)
 918            {
 919                int i;
 920
 921                utf8_sb_map_inited = 0;
 922                for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 923                  utf8_sb_map[i] = BITSET_WORD_MAX;
 924            }
 925#endif
 926          dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 927        }
 928      else
 929        {
 930          int i, j, ch;
 931
 932          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 933          if (BE (dfa->sb_char == NULL, 0))
 934            return REG_ESPACE;
 935
 936          /* Set the bits corresponding to single byte chars.  */
 937          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 938            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 939              {
 940                wint_t wch = __btowc (ch);
 941                if (wch != WEOF)
 942                  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 943# ifndef _LIBC
 944                if (isascii (ch) && wch != ch)
 945                  dfa->map_notascii = 1;
 946# endif
 947              }
 948        }
 949    }
 950#endif
 951
 952  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 953    return REG_ESPACE;
 954  return REG_NOERROR;
 955}
 956
 957/* Initialize WORD_CHAR table, which indicate which character is
 958   "word".  In this case "word" means that it is the word construction
 959   character used by some operators like "\<", "\>", etc.  */
 960
 961static void
 962internal_function
 963init_word_char (re_dfa_t *dfa)
 964{
 965  int i, j, ch;
 966  dfa->word_ops_used = 1;
 967  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 968    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 969      if (isalnum (ch) || ch == '_')
 970        dfa->word_char[i] |= (bitset_word_t) 1 << j;
 971}
 972
 973/* Free the work area which are only used while compiling.  */
 974
 975static void
 976free_workarea_compile (regex_t *preg)
 977{
 978  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 979  bin_tree_storage_t *storage, *next;
 980  for (storage = dfa->str_tree_storage; storage; storage = next)
 981    {
 982      next = storage->next;
 983      re_free (storage);
 984    }
 985  dfa->str_tree_storage = NULL;
 986  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 987  dfa->str_tree = NULL;
 988  re_free (dfa->org_indices);
 989  dfa->org_indices = NULL;
 990}
 991
 992/* Create initial states for all contexts.  */
 993
 994static reg_errcode_t
 995create_initial_state (re_dfa_t *dfa)
 996{
 997  int first, i;
 998  reg_errcode_t err;
 999  re_node_set init_nodes;
1000
1001  /* Initial states have the epsilon closure of the node which is
1002     the first node of the regular expression.  */
1003  first = dfa->str_tree->first->node_idx;
1004  dfa->init_node = first;
1005  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006  if (BE (err != REG_NOERROR, 0))
1007    return err;
1008
1009  /* The back-references which are in initial states can epsilon transit,
1010     since in this case all of the subexpressions can be null.
1011     Then we add epsilon closures of the nodes which are the next nodes of
1012     the back-references.  */
1013  if (dfa->nbackref > 0)
1014    for (i = 0; i < init_nodes.nelem; ++i)
1015      {
1016        int node_idx = init_nodes.elems[i];
1017        re_token_type_t type = dfa->nodes[node_idx].type;
1018
1019        int clexp_idx;
1020        if (type != OP_BACK_REF)
1021          continue;
1022        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1023          {
1024            re_token_t *clexp_node;
1025            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026            if (clexp_node->type == OP_CLOSE_SUBEXP
1027                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1028              break;
1029          }
1030        if (clexp_idx == init_nodes.nelem)
1031          continue;
1032
1033        if (type == OP_BACK_REF)
1034          {
1035            int dest_idx = dfa->edests[node_idx].elems[0];
1036            if (!re_node_set_contains (&init_nodes, dest_idx))
1037              {
1038                reg_errcode_t err = re_node_set_merge (&init_nodes,
1039                                                       dfa->eclosures
1040                                                       + dest_idx);
1041                if (err != REG_NOERROR)
1042                  return err;
1043                i = 0;
1044              }
1045          }
1046      }
1047
1048  /* It must be the first time to invoke acquire_state.  */
1049  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1050  /* We don't check ERR here, since the initial state must not be NULL.  */
1051  if (BE (dfa->init_state == NULL, 0))
1052    return err;
1053  if (dfa->init_state->has_constraint)
1054    {
1055      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1056                                                       CONTEXT_WORD);
1057      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1058                                                     CONTEXT_NEWLINE);
1059      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1060                                                         &init_nodes,
1061                                                         CONTEXT_NEWLINE
1062                                                         | CONTEXT_BEGBUF);
1063      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1064              || dfa->init_state_begbuf == NULL, 0))
1065        return err;
1066    }
1067  else
1068    dfa->init_state_word = dfa->init_state_nl
1069      = dfa->init_state_begbuf = dfa->init_state;
1070
1071  re_node_set_free (&init_nodes);
1072  return REG_NOERROR;
1073}
1074\f
1075#ifdef RE_ENABLE_I18N
1076/* If it is possible to do searching in single byte encoding instead of UTF-8
1077   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1078   DFA nodes where needed.  */
1079
1080static void
1081optimize_utf8 (re_dfa_t *dfa)
1082{
1083  int node, i, mb_chars = 0, has_period = 0;
1084
1085  for (node = 0; node < dfa->nodes_len; ++node)
1086    switch (dfa->nodes[node].type)
1087      {
1088      case CHARACTER:
1089        if (dfa->nodes[node].opr.c >= 0x80)
1090          mb_chars = 1;
1091        break;
1092      case ANCHOR:
1093        switch (dfa->nodes[node].opr.ctx_type)
1094          {
1095          case LINE_FIRST:
1096          case LINE_LAST:
1097          case BUF_FIRST:
1098          case BUF_LAST:
1099            break;
1100          default:
1101            /* Word anchors etc. cannot be handled.  It's okay to test
1102               opr.ctx_type since constraints (for all DFA nodes) are
1103               created by ORing one or more opr.ctx_type values.  */
1104            return;
1105          }
1106        break;
1107      case OP_PERIOD:
1108        has_period = 1;
1109        break;
1110      case OP_BACK_REF:
1111      case OP_ALT:
1112      case END_OF_RE:
1113      case OP_DUP_ASTERISK:
1114      case OP_OPEN_SUBEXP:
1115      case OP_CLOSE_SUBEXP:
1116        break;
1117      case COMPLEX_BRACKET:
1118        return;
1119      case SIMPLE_BRACKET:
1120        /* Just double check.  The non-ASCII range starts at 0x80.  */
1121        assert (0x80 % BITSET_WORD_BITS == 0);
1122        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1123          if (dfa->nodes[node].opr.sbcset[i])
1124            return;
1125        break;
1126      default:
1127        abort ();
1128      }
1129
1130  if (mb_chars || has_period)
1131    for (node = 0; node < dfa->nodes_len; ++node)
1132      {
1133        if (dfa->nodes[node].type == CHARACTER
1134            && dfa->nodes[node].opr.c >= 0x80)
1135          dfa->nodes[node].mb_partial = 0;
1136        else if (dfa->nodes[node].type == OP_PERIOD)
1137          dfa->nodes[node].type = OP_UTF8_PERIOD;
1138      }
1139
1140  /* The search can be in single byte locale.  */
1141  dfa->mb_cur_max = 1;
1142  dfa->is_utf8 = 0;
1143  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1144}
1145#endif
1146\f
1147/* Analyze the structure tree, and calculate "first", "next", "edest",
1148   "eclosure", and "inveclosure".  */
1149
1150static reg_errcode_t
1151analyze (regex_t *preg)
1152{
1153  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1154  reg_errcode_t ret;
1155
1156  /* Allocate arrays.  */
1157  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1158  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1159  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1160  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1161  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1162          || dfa->eclosures == NULL, 0))
1163    return REG_ESPACE;
1164
1165  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1166  if (dfa->subexp_map != NULL)
1167    {
1168      int i;
1169      for (i = 0; i < preg->re_nsub; i++)
1170        dfa->subexp_map[i] = i;
1171      preorder (dfa->str_tree, optimize_subexps, dfa);
1172      for (i = 0; i < preg->re_nsub; i++)
1173        if (dfa->subexp_map[i] != i)
1174          break;
1175      if (i == preg->re_nsub)
1176        {
1177          free (dfa->subexp_map);
1178          dfa->subexp_map = NULL;
1179        }
1180    }
1181
1182  ret = postorder (dfa->str_tree, lower_subexps, preg);
1183  if (BE (ret != REG_NOERROR, 0))
1184    return ret;
1185  ret = postorder (dfa->str_tree, calc_first, dfa);
1186  if (BE (ret != REG_NOERROR, 0))
1187    return ret;
1188  preorder (dfa->str_tree, calc_next, dfa);
1189  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1190  if (BE (ret != REG_NOERROR, 0))
1191    return ret;
1192  ret = calc_eclosure (dfa);
1193  if (BE (ret != REG_NOERROR, 0))
1194    return ret;
1195
1196  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1197     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1198  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1199      || dfa->nbackref)
1200    {
1201      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1202      if (BE (dfa->inveclosures == NULL, 0))
1203        return REG_ESPACE;
1204      ret = calc_inveclosure (dfa);
1205    }
1206
1207  return ret;
1208}
1209
1210/* Our parse trees are very unbalanced, so we cannot use a stack to
1211   implement parse tree visits.  Instead, we use parent pointers and
1212   some hairy code in these two functions.  */
1213static reg_errcode_t
1214postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1215           void *extra)
1216{
1217  bin_tree_t *node, *prev;
1218
1219  for (node = root; ; )
1220    {
1221      /* Descend down the tree, preferably to the left (or to the right
1222         if that's the only child).  */
1223      while (node->left || node->right)
1224        if (node->left)
1225          node = node->left;
1226        else
1227          node = node->right;
1228
1229      do
1230        {
1231          reg_errcode_t err = fn (extra, node);
1232          if (BE (err != REG_NOERROR, 0))
1233            return err;
1234          if (node->parent == NULL)
1235            return REG_NOERROR;
1236          prev = node;
1237          node = node->parent;
1238        }
1239      /* Go up while we have a node that is reached from the right.  */
1240      while (node->right == prev || node->right == NULL);
1241      node = node->right;
1242    }
1243}
1244
1245static reg_errcode_t
1246preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1247          void *extra)
1248{
1249  bin_tree_t *node;
1250
1251  for (node = root; ; )
1252    {
1253      reg_errcode_t err = fn (extra, node);
1254      if (BE (err != REG_NOERROR, 0))
1255        return err;
1256
1257      /* Go to the left node, or up and to the right.  */
1258      if (node->left)
1259        node = node->left;
1260      else
1261        {
1262          bin_tree_t *prev = NULL;
1263          while (node->right == prev || node->right == NULL)
1264            {
1265              prev = node;
1266              node = node->parent;
1267              if (!node)
1268                return REG_NOERROR;
1269            }
1270          node = node->right;
1271        }
1272    }
1273}
1274
1275/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1276   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1277   backreferences as well.  Requires a preorder visit.  */
1278static reg_errcode_t
1279optimize_subexps (void *extra, bin_tree_t *node)
1280{
1281  re_dfa_t *dfa = (re_dfa_t *) extra;
1282
1283  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1284    {
1285      int idx = node->token.opr.idx;
1286      node->token.opr.idx = dfa->subexp_map[idx];
1287      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1288    }
1289
1290  else if (node->token.type == SUBEXP
1291           && node->left && node->left->token.type == SUBEXP)
1292    {
1293      int other_idx = node->left->token.opr.idx;
1294
1295      node->left = node->left->left;
1296      if (node->left)
1297        node->left->parent = node;
1298
1299      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1300      if (other_idx < BITSET_WORD_BITS)
1301          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1302    }
1303
1304  return REG_NOERROR;
1305}
1306
1307/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1308   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1309static reg_errcode_t
1310lower_subexps (void *extra, bin_tree_t *node)
1311{
1312  regex_t *preg = (regex_t *) extra;
1313  reg_errcode_t err = REG_NOERROR;
1314
1315  if (node->left && node->left->token.type == SUBEXP)
1316    {
1317      node->left = lower_subexp (&err, preg, node->left);
1318      if (node->left)
1319        node->left->parent = node;
1320    }
1321  if (node->right && node->right->token.type == SUBEXP)
1322    {
1323      node->right = lower_subexp (&err, preg, node->right);
1324      if (node->right)
1325        node->right->parent = node;
1326    }
1327
1328  return err;
1329}
1330
1331static bin_tree_t *
1332lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1333{
1334  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335  bin_tree_t *body = node->left;
1336  bin_tree_t *op, *cls, *tree1, *tree;
1337
1338  if (preg->no_sub
1339      /* We do not optimize empty subexpressions, because otherwise we may
1340         have bad CONCAT nodes with NULL children.  This is obviously not
1341         very common, so we do not lose much.  An example that triggers
1342         this case is the sed "script" /\(\)/x.  */
1343      && node->left != NULL
1344      && (node->token.opr.idx >= BITSET_WORD_BITS
1345          || !(dfa->used_bkref_map
1346               & ((bitset_word_t) 1 << node->token.opr.idx))))
1347    return node->left;
1348
1349  /* Convert the SUBEXP node to the concatenation of an
1350     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1351  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1352  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1353  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1354  tree = create_tree (dfa, op, tree1, CONCAT);
1355  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1356    {
1357      *err = REG_ESPACE;
1358      return NULL;
1359    }
1360
1361  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1362  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1363  return tree;
1364}
1365
1366/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1367   nodes.  Requires a postorder visit.  */
1368static reg_errcode_t
1369calc_first (void *extra, bin_tree_t *node)
1370{
1371  re_dfa_t *dfa = (re_dfa_t *) extra;
1372  if (node->token.type == CONCAT)
1373    {
1374      node->first = node->left->first;
1375      node->node_idx = node->left->node_idx;
1376    }
1377  else
1378    {
1379      node->first = node;
1380      node->node_idx = re_dfa_add_node (dfa, node->token);
1381      if (BE (node->node_idx == -1, 0))
1382        return REG_ESPACE;
1383      if (node->token.type == ANCHOR)
1384        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1385    }
1386  return REG_NOERROR;
1387}
1388
1389/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1390static reg_errcode_t
1391calc_next (void *extra, bin_tree_t *node)
1392{
1393  switch (node->token.type)
1394    {
1395    case OP_DUP_ASTERISK:
1396      node->left->next = node;
1397      break;
1398    case CONCAT:
1399      node->left->next = node->right->first;
1400      node->right->next = node->next;
1401      break;
1402    default:
1403      if (node->left)
1404        node->left->next = node->next;
1405      if (node->right)
1406        node->right->next = node->next;
1407      break;
1408    }
1409  return REG_NOERROR;
1410}
1411
1412/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1413static reg_errcode_t
1414link_nfa_nodes (void *extra, bin_tree_t *node)
1415{
1416  re_dfa_t *dfa = (re_dfa_t *) extra;
1417  int idx = node->node_idx;
1418  reg_errcode_t err = REG_NOERROR;
1419
1420  switch (node->token.type)
1421    {
1422    case CONCAT:
1423      break;
1424
1425    case END_OF_RE:
1426      assert (node->next == NULL);
1427      break;
1428
1429    case OP_DUP_ASTERISK:
1430    case OP_ALT:
1431      {
1432        int left, right;
1433        dfa->has_plural_match = 1;
1434        if (node->left != NULL)
1435          left = node->left->first->node_idx;
1436        else
1437          left = node->next->node_idx;
1438        if (node->right != NULL)
1439          right = node->right->first->node_idx;
1440        else
1441          right = node->next->node_idx;
1442        assert (left > -1);
1443        assert (right > -1);
1444        err = re_node_set_init_2 (dfa->edests + idx, left, right);
1445      }
1446      break;
1447
1448    case ANCHOR:
1449    case OP_OPEN_SUBEXP:
1450    case OP_CLOSE_SUBEXP:
1451      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1452      break;
1453
1454    case OP_BACK_REF:
1455      dfa->nexts[idx] = node->next->node_idx;
1456      if (node->token.type == OP_BACK_REF)
1457        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1458      break;
1459
1460    default:
1461      assert (!IS_EPSILON_NODE (node->token.type));
1462      dfa->nexts[idx] = node->next->node_idx;
1463      break;
1464    }
1465
1466  return err;
1467}
1468
1469/* Duplicate the epsilon closure of the node ROOT_NODE.
1470   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1471   to their own constraint.  */
1472
1473static reg_errcode_t
1474internal_function
1475duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1476                        int root_node, unsigned int init_constraint)
1477{
1478  int org_node, clone_node, ret;
1479  unsigned int constraint = init_constraint;
1480  for (org_node = top_org_node, clone_node = top_clone_node;;)
1481    {
1482      int org_dest, clone_dest;
1483      if (dfa->nodes[org_node].type == OP_BACK_REF)
1484        {
1485          /* If the back reference epsilon-transit, its destination must
1486             also have the constraint.  Then duplicate the epsilon closure
1487             of the destination of the back reference, and store it in
1488             edests of the back reference.  */
1489          org_dest = dfa->nexts[org_node];
1490          re_node_set_empty (dfa->edests + clone_node);
1491          clone_dest = duplicate_node (dfa, org_dest, constraint);
1492          if (BE (clone_dest == -1, 0))
1493            return REG_ESPACE;
1494          dfa->nexts[clone_node] = dfa->nexts[org_node];
1495          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496          if (BE (ret < 0, 0))
1497            return REG_ESPACE;
1498        }
1499      else if (dfa->edests[org_node].nelem == 0)
1500        {
1501          /* In case of the node can't epsilon-transit, don't duplicate the
1502             destination and store the original destination as the
1503             destination of the node.  */
1504          dfa->nexts[clone_node] = dfa->nexts[org_node];
1505          break;
1506        }
1507      else if (dfa->edests[org_node].nelem == 1)
1508        {
1509          /* In case of the node can epsilon-transit, and it has only one
1510             destination.  */
1511          org_dest = dfa->edests[org_node].elems[0];
1512          re_node_set_empty (dfa->edests + clone_node);
1513          /* If the node is root_node itself, it means the epsilon clsoure
1514             has a loop.   Then tie it to the destination of the root_node.  */
1515          if (org_node == root_node && clone_node != org_node)
1516            {
1517              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518              if (BE (ret < 0, 0))
1519                return REG_ESPACE;
1520              break;
1521            }
1522          /* In case of the node has another constraint, add it.  */
1523          constraint |= dfa->nodes[org_node].constraint;
1524          clone_dest = duplicate_node (dfa, org_dest, constraint);
1525          if (BE (clone_dest == -1, 0))
1526            return REG_ESPACE;
1527          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528          if (BE (ret < 0, 0))
1529            return REG_ESPACE;
1530        }
1531      else /* dfa->edests[org_node].nelem == 2 */
1532        {
1533          /* In case of the node can epsilon-transit, and it has two
1534             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1535          org_dest = dfa->edests[org_node].elems[0];
1536          re_node_set_empty (dfa->edests + clone_node);
1537          /* Search for a duplicated node which satisfies the constraint.  */
1538          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539          if (clone_dest == -1)
1540            {
1541              /* There is no such duplicated node, create a new one.  */
1542              reg_errcode_t err;
1543              clone_dest = duplicate_node (dfa, org_dest, constraint);
1544              if (BE (clone_dest == -1, 0))
1545                return REG_ESPACE;
1546              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547              if (BE (ret < 0, 0))
1548                return REG_ESPACE;
1549              err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550                                            root_node, constraint);
1551              if (BE (err != REG_NOERROR, 0))
1552                return err;
1553            }
1554          else
1555            {
1556              /* There is a duplicated node which satisfies the constraint,
1557                 use it to avoid infinite loop.  */
1558              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559              if (BE (ret < 0, 0))
1560                return REG_ESPACE;
1561            }
1562
1563          org_dest = dfa->edests[org_node].elems[1];
1564          clone_dest = duplicate_node (dfa, org_dest, constraint);
1565          if (BE (clone_dest == -1, 0))
1566            return REG_ESPACE;
1567          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568          if (BE (ret < 0, 0))
1569            return REG_ESPACE;
1570        }
1571      org_node = org_dest;
1572      clone_node = clone_dest;
1573    }
1574  return REG_NOERROR;
1575}
1576
1577/* Search for a node which is duplicated from the node ORG_NODE, and
1578   satisfies the constraint CONSTRAINT.  */
1579
1580static int
1581search_duplicated_node (const re_dfa_t *dfa, int org_node,
1582                        unsigned int constraint)
1583{
1584  int idx;
1585  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1586    {
1587      if (org_node == dfa->org_indices[idx]
1588          && constraint == dfa->nodes[idx].constraint)
1589        return idx; /* Found.  */
1590    }
1591  return -1; /* Not found.  */
1592}
1593
1594/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595   Return the index of the new node, or -1 if insufficient storage is
1596   available.  */
1597
1598static int
1599duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1600{
1601  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602  if (BE (dup_idx != -1, 1))
1603    {
1604      dfa->nodes[dup_idx].constraint = constraint;
1605      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1606      dfa->nodes[dup_idx].duplicated = 1;
1607
1608      /* Store the index of the original node.  */
1609      dfa->org_indices[dup_idx] = org_idx;
1610    }
1611  return dup_idx;
1612}
1613
1614static reg_errcode_t
1615calc_inveclosure (re_dfa_t *dfa)
1616{
1617  int src, idx, ret;
1618  for (idx = 0; idx < dfa->nodes_len; ++idx)
1619    re_node_set_init_empty (dfa->inveclosures + idx);
1620
1621  for (src = 0; src < dfa->nodes_len; ++src)
1622    {
1623      int *elems = dfa->eclosures[src].elems;
1624      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1625        {
1626          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1627          if (BE (ret == -1, 0))
1628            return REG_ESPACE;
1629        }
1630    }
1631
1632  return REG_NOERROR;
1633}
1634
1635/* Calculate "eclosure" for all the node in DFA.  */
1636
1637static reg_errcode_t
1638calc_eclosure (re_dfa_t *dfa)
1639{
1640  int node_idx, incomplete;
1641#ifdef DEBUG
1642  assert (dfa->nodes_len > 0);
1643#endif
1644  incomplete = 0;
1645  /* For each nodes, calculate epsilon closure.  */
1646  for (node_idx = 0; ; ++node_idx)
1647    {
1648      reg_errcode_t err;
1649      re_node_set eclosure_elem;
1650      if (node_idx == dfa->nodes_len)
1651        {
1652          if (!incomplete)
1653            break;
1654          incomplete = 0;
1655          node_idx = 0;
1656        }
1657
1658#ifdef DEBUG
1659      assert (dfa->eclosures[node_idx].nelem != -1);
1660#endif
1661
1662      /* If we have already calculated, skip it.  */
1663      if (dfa->eclosures[node_idx].nelem != 0)
1664        continue;
1665      /* Calculate epsilon closure of `node_idx'.  */
1666      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1667      if (BE (err != REG_NOERROR, 0))
1668        return err;
1669
1670      if (dfa->eclosures[node_idx].nelem == 0)
1671        {
1672          incomplete = 1;
1673          re_node_set_free (&eclosure_elem);
1674        }
1675    }
1676  return REG_NOERROR;
1677}
1678
1679/* Calculate epsilon closure of NODE.  */
1680
1681static reg_errcode_t
1682calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1683{
1684  reg_errcode_t err;
1685  int i;
1686  re_node_set eclosure;
1687  int ret;
1688  int incomplete = 0;
1689  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690  if (BE (err != REG_NOERROR, 0))
1691    return err;
1692
1693  /* This indicates that we are calculating this node now.
1694     We reference this value to avoid infinite loop.  */
1695  dfa->eclosures[node].nelem = -1;
1696
1697  /* If the current node has constraints, duplicate all nodes
1698     since they must inherit the constraints.  */
1699  if (dfa->nodes[node].constraint
1700      && dfa->edests[node].nelem
1701      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702    {
1703      err = duplicate_node_closure (dfa, node, node, node,
1704                                    dfa->nodes[node].constraint);
1705      if (BE (err != REG_NOERROR, 0))
1706        return err;
1707    }
1708
1709  /* Expand each epsilon destination nodes.  */
1710  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711    for (i = 0; i < dfa->edests[node].nelem; ++i)
1712      {
1713        re_node_set eclosure_elem;
1714        int edest = dfa->edests[node].elems[i];
1715        /* If calculating the epsilon closure of `edest' is in progress,
1716           return intermediate result.  */
1717        if (dfa->eclosures[edest].nelem == -1)
1718          {
1719            incomplete = 1;
1720            continue;
1721          }
1722        /* If we haven't calculated the epsilon closure of `edest' yet,
1723           calculate now. Otherwise use calculated epsilon closure.  */
1724        if (dfa->eclosures[edest].nelem == 0)
1725          {
1726            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1727            if (BE (err != REG_NOERROR, 0))
1728              return err;
1729          }
1730        else
1731          eclosure_elem = dfa->eclosures[edest];
1732        /* Merge the epsilon closure of `edest'.  */
1733        err = re_node_set_merge (&eclosure, &eclosure_elem);
1734        if (BE (err != REG_NOERROR, 0))
1735          return err;
1736        /* If the epsilon closure of `edest' is incomplete,
1737           the epsilon closure of this node is also incomplete.  */
1738        if (dfa->eclosures[edest].nelem == 0)
1739          {
1740            incomplete = 1;
1741            re_node_set_free (&eclosure_elem);
1742          }
1743      }
1744
1745  /* An epsilon closure includes itself.  */
1746  ret = re_node_set_insert (&eclosure, node);
1747  if (BE (ret < 0, 0))
1748    return REG_ESPACE;
1749  if (incomplete && !root)
1750    dfa->eclosures[node].nelem = 0;
1751  else
1752    dfa->eclosures[node] = eclosure;
1753  *new_set = eclosure;
1754  return REG_NOERROR;
1755}
1756\f
1757/* Functions for token which are used in the parser.  */
1758
1759/* Fetch a token from INPUT.
1760   We must not use this function inside bracket expressions.  */
1761
1762static void
1763internal_function
1764fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765{
1766  re_string_skip_bytes (input, peek_token (result, input, syntax));
1767}
1768
1769/* Peek a token from INPUT, and return the length of the token.
1770   We must not use this function inside bracket expressions.  */
1771
1772static int
1773internal_function
1774peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775{
1776  unsigned char c;
1777
1778  if (re_string_eoi (input))
1779    {
1780      token->type = END_OF_RE;
1781      return 0;
1782    }
1783
1784  c = re_string_peek_byte (input, 0);
1785  token->opr.c = c;
1786
1787  token->word_char = 0;
1788#ifdef RE_ENABLE_I18N
1789  token->mb_partial = 0;
1790  if (input->mb_cur_max > 1 &&
1791      !re_string_first_byte (input, re_string_cur_idx (input)))
1792    {
1793      token->type = CHARACTER;
1794      token->mb_partial = 1;
1795      return 1;
1796    }
1797#endif
1798  if (c == '\\')
1799    {
1800      unsigned char c2;
1801      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802        {
1803          token->type = BACK_SLASH;
1804          return 1;
1805        }
1806
1807      c2 = re_string_peek_byte_case (input, 1);
1808      token->opr.c = c2;
1809      token->type = CHARACTER;
1810#ifdef RE_ENABLE_I18N
1811      if (input->mb_cur_max > 1)
1812        {
1813          wint_t wc = re_string_wchar_at (input,
1814                                          re_string_cur_idx (input) + 1);
1815          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816        }
1817      else
1818#endif
1819        token->word_char = IS_WORD_CHAR (c2) != 0;
1820
1821      switch (c2)
1822        {
1823        case '|':
1824          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825            token->type = OP_ALT;
1826          break;
1827        case '1': case '2': case '3': case '4': case '5':
1828        case '6': case '7': case '8': case '9':
1829          if (!(syntax & RE_NO_BK_REFS))
1830            {
1831              token->type = OP_BACK_REF;
1832              token->opr.idx = c2 - '1';
1833            }
1834          break;
1835        case '<':
1836          if (!(syntax & RE_NO_GNU_OPS))
1837            {
1838              token->type = ANCHOR;
1839              token->opr.ctx_type = WORD_FIRST;
1840            }
1841          break;
1842        case '>':
1843          if (!(syntax & RE_NO_GNU_OPS))
1844            {
1845              token->type = ANCHOR;
1846              token->opr.ctx_type = WORD_LAST;
1847            }
1848          break;
1849        case 'b':
1850          if (!(syntax & RE_NO_GNU_OPS))
1851            {
1852              token->type = ANCHOR;
1853              token->opr.ctx_type = WORD_DELIM;
1854            }
1855          break;
1856        case 'B':
1857          if (!(syntax & RE_NO_GNU_OPS))
1858            {
1859              token->type = ANCHOR;
1860              token->opr.ctx_type = NOT_WORD_DELIM;
1861            }
1862          break;
1863        case 'w':
1864          if (!(syntax & RE_NO_GNU_OPS))
1865            token->type = OP_WORD;
1866          break;
1867        case 'W':
1868          if (!(syntax & RE_NO_GNU_OPS))
1869            token->type = OP_NOTWORD;
1870          break;
1871        case 's':
1872          if (!(syntax & RE_NO_GNU_OPS))
1873            token->type = OP_SPACE;
1874          break;
1875        case 'S':
1876          if (!(syntax & RE_NO_GNU_OPS))
1877            token->type = OP_NOTSPACE;
1878          break;
1879        case '`':
1880          if (!(syntax & RE_NO_GNU_OPS))
1881            {
1882              token->type = ANCHOR;
1883              token->opr.ctx_type = BUF_FIRST;
1884            }
1885          break;
1886        case '\'':
1887          if (!(syntax & RE_NO_GNU_OPS))
1888            {
1889              token->type = ANCHOR;
1890              token->opr.ctx_type = BUF_LAST;
1891            }
1892          break;
1893        case '(':
1894          if (!(syntax & RE_NO_BK_PARENS))
1895            token->type = OP_OPEN_SUBEXP;
1896          break;
1897        case ')':
1898          if (!(syntax & RE_NO_BK_PARENS))
1899            token->type = OP_CLOSE_SUBEXP;
1900          break;
1901        case '+':
1902          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903            token->type = OP_DUP_PLUS;
1904          break;
1905        case '?':
1906          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907            token->type = OP_DUP_QUESTION;
1908          break;
1909        case '{':
1910          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911            token->type = OP_OPEN_DUP_NUM;
1912          break;
1913        case '}':
1914          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915            token->type = OP_CLOSE_DUP_NUM;
1916          break;
1917        default:
1918          break;
1919        }
1920      return 2;
1921    }
1922
1923  token->type = CHARACTER;
1924#ifdef RE_ENABLE_I18N
1925  if (input->mb_cur_max > 1)
1926    {
1927      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929    }
1930  else
1931#endif
1932    token->word_char = IS_WORD_CHAR (token->opr.c);
1933
1934  switch (c)
1935    {
1936    case '\n':
1937      if (syntax & RE_NEWLINE_ALT)
1938        token->type = OP_ALT;
1939      break;
1940    case '|':
1941      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942        token->type = OP_ALT;
1943      break;
1944    case '*':
1945      token->type = OP_DUP_ASTERISK;
1946      break;
1947    case '+':
1948      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949        token->type = OP_DUP_PLUS;
1950      break;
1951    case '?':
1952      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953        token->type = OP_DUP_QUESTION;
1954      break;
1955    case '{':
1956      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957        token->type = OP_OPEN_DUP_NUM;
1958      break;
1959    case '}':
1960      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961        token->type = OP_CLOSE_DUP_NUM;
1962      break;
1963    case '(':
1964      if (syntax & RE_NO_BK_PARENS)
1965        token->type = OP_OPEN_SUBEXP;
1966      break;
1967    case ')':
1968      if (syntax & RE_NO_BK_PARENS)
1969        token->type = OP_CLOSE_SUBEXP;
1970      break;
1971    case '[':
1972      token->type = OP_OPEN_BRACKET;
1973      break;
1974    case '.':
1975      token->type = OP_PERIOD;
1976      break;
1977    case '^':
1978      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979          re_string_cur_idx (input) != 0)
1980        {
1981          char prev = re_string_peek_byte (input, -1);
1982          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983            break;
1984        }
1985      token->type = ANCHOR;
1986      token->opr.ctx_type = LINE_FIRST;
1987      break;
1988    case '$':
1989      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990          re_string_cur_idx (input) + 1 != re_string_length (input))
1991        {
1992          re_token_t next;
1993          re_string_skip_bytes (input, 1);
1994          peek_token (&next, input, syntax);
1995          re_string_skip_bytes (input, -1);
1996          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997            break;
1998        }
1999      token->type = ANCHOR;
2000      token->opr.ctx_type = LINE_LAST;
2001      break;
2002    default:
2003      break;
2004    }
2005  return 1;
2006}
2007
2008/* Peek a token from INPUT, and return the length of the token.
2009   We must not use this function out of bracket expressions.  */
2010
2011static int
2012internal_function
2013peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014{
2015  unsigned char c;
2016  if (re_string_eoi (input))
2017    {
2018      token->type = END_OF_RE;
2019      return 0;
2020    }
2021  c = re_string_peek_byte (input, 0);
2022  token->opr.c = c;
2023
2024#ifdef RE_ENABLE_I18N
2025  if (input->mb_cur_max > 1 &&
2026      !re_string_first_byte (input, re_string_cur_idx (input)))
2027    {
2028      token->type = CHARACTER;
2029      return 1;
2030    }
2031#endif /* RE_ENABLE_I18N */
2032
2033  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034      && re_string_cur_idx (input) + 1 < re_string_length (input))
2035    {
2036      /* In this case, '\' escape a character.  */
2037      unsigned char c2;
2038      re_string_skip_bytes (input, 1);
2039      c2 = re_string_peek_byte (input, 0);
2040      token->opr.c = c2;
2041      token->type = CHARACTER;
2042      return 1;
2043    }
2044  if (c == '[') /* '[' is a special char in a bracket exps.  */
2045    {
2046      unsigned char c2;
2047      int token_len;
2048      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049        c2 = re_string_peek_byte (input, 1);
2050      else
2051        c2 = 0;
2052      token->opr.c = c2;
2053      token_len = 2;
2054      switch (c2)
2055        {
2056        case '.':
2057          token->type = OP_OPEN_COLL_ELEM;
2058          break;
2059        case '=':
2060          token->type = OP_OPEN_EQUIV_CLASS;
2061          break;
2062        case ':':
2063          if (syntax & RE_CHAR_CLASSES)
2064            {
2065              token->type = OP_OPEN_CHAR_CLASS;
2066              break;
2067            }
2068          /* else fall through.  */
2069        default:
2070          token->type = CHARACTER;
2071          token->opr.c = c;
2072          token_len = 1;
2073          break;
2074        }
2075      return token_len;
2076    }
2077  switch (c)
2078    {
2079    case '-':
2080      token->type = OP_CHARSET_RANGE;
2081      break;
2082    case ']':
2083      token->type = OP_CLOSE_BRACKET;
2084      break;
2085    case '^':
2086      token->type = OP_NON_MATCH_LIST;
2087      break;
2088    default:
2089      token->type = CHARACTER;
2090    }
2091  return 1;
2092}
2093\f
2094/* Functions for parser.  */
2095
2096/* Entry point of the parser.
2097   Parse the regular expression REGEXP and return the structure tree.
2098   If an error has occurred, ERR is set by error code, and return NULL.
2099   This function build the following tree, from regular expression <reg_exp>:
2100           CAT
2101           / \
2102          /   \
2103   <reg_exp>  EOR
2104
2105   CAT means concatenation.
2106   EOR means end of regular expression.  */
2107
2108static bin_tree_t *
2109parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110       reg_errcode_t *err)
2111{
2112  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113  bin_tree_t *tree, *eor, *root;
2114  re_token_t current_token;
2115  dfa->syntax = syntax;
2116  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119    return NULL;
2120  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121  if (tree != NULL)
2122    root = create_tree (dfa, tree, eor, CONCAT);
2123  else
2124    root = eor;
2125  if (BE (eor == NULL || root == NULL, 0))
2126    {
2127      *err = REG_ESPACE;
2128      return NULL;
2129    }
2130  return root;
2131}
2132
2133/* This function build the following tree, from regular expression
2134   <branch1>|<branch2>:
2135           ALT
2136           / \
2137          /   \
2138   <branch1> <branch2>
2139
2140   ALT means alternative, which represents the operator `|'.  */
2141
2142static bin_tree_t *
2143parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2145{
2146  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147  bin_tree_t *tree, *branch = NULL;
2148  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150    return NULL;
2151
2152  while (token->type == OP_ALT)
2153    {
2154      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155      if (token->type != OP_ALT && token->type != END_OF_RE
2156          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157        {
2158          branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159          if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160            return NULL;
2161        }
2162      else
2163        branch = NULL;
2164      tree = create_tree (dfa, tree, branch, OP_ALT);
2165      if (BE (tree == NULL, 0))
2166        {
2167          *err = REG_ESPACE;
2168          return NULL;
2169        }
2170    }
2171  return tree;
2172}
2173
2174/* This function build the following tree, from regular expression
2175   <exp1><exp2>:
2176        CAT
2177        / \
2178       /   \
2179   <exp1> <exp2>
2180
2181   CAT means concatenation.  */
2182
2183static bin_tree_t *
2184parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185              reg_syntax_t syntax, int nest, reg_errcode_t *err)
2186{
2187  bin_tree_t *tree, *exp;
2188  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191    return NULL;
2192
2193  while (token->type != OP_ALT && token->type != END_OF_RE
2194         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195    {
2196      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2197      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2198        {
2199          return NULL;
2200        }
2201      if (tree != NULL && exp != NULL)
2202        {
2203          tree = create_tree (dfa, tree, exp, CONCAT);
2204          if (tree == NULL)
2205            {
2206              *err = REG_ESPACE;
2207              return NULL;
2208            }
2209        }
2210      else if (tree == NULL)
2211        tree = exp;
2212      /* Otherwise exp == NULL, we don't need to create new tree.  */
2213    }
2214  return tree;
2215}
2216
2217/* This function build the following tree, from regular expression a*:
2218         *
2219         |
2220         a
2221*/
2222
2223static bin_tree_t *
2224parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225                  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226{
2227  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228  bin_tree_t *tree;
2229  switch (token->type)
2230    {
2231    case CHARACTER:
2232      tree = create_token_tree (dfa, NULL, NULL, token);
2233      if (BE (tree == NULL, 0))
2234        {
2235          *err = REG_ESPACE;
2236          return NULL;
2237        }
2238#ifdef RE_ENABLE_I18N
2239      if (dfa->mb_cur_max > 1)
2240        {
2241          while (!re_string_eoi (regexp)
2242                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243            {
2244              bin_tree_t *mbc_remain;
2245              fetch_token (token, regexp, syntax);
2246              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248              if (BE (mbc_remain == NULL || tree == NULL, 0))
2249                {
2250                  *err = REG_ESPACE;
2251                  return NULL;
2252                }
2253            }
2254        }
2255#endif
2256      break;
2257    case OP_OPEN_SUBEXP:
2258      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260        return NULL;
2261      break;
2262    case OP_OPEN_BRACKET:
2263      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265        return NULL;
2266      break;
2267    case OP_BACK_REF:
2268      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269        {
2270          *err = REG_ESUBREG;
2271          return NULL;
2272        }
2273      dfa->used_bkref_map |= 1 << token->opr.idx;
2274      tree = create_token_tree (dfa, NULL, NULL, token);
2275      if (BE (tree == NULL, 0))
2276        {
2277          *err = REG_ESPACE;
2278          return NULL;
2279        }
2280      ++dfa->nbackref;
2281      dfa->has_mb_node = 1;
2282      break;
2283    case OP_OPEN_DUP_NUM:
2284      if (syntax & RE_CONTEXT_INVALID_DUP)
2285        {
2286          *err = REG_BADRPT;
2287          return NULL;
2288        }
2289      /* FALLTHROUGH */
2290    case OP_DUP_ASTERISK:
2291    case OP_DUP_PLUS:
2292    case OP_DUP_QUESTION:
2293      if (syntax & RE_CONTEXT_INVALID_OPS)
2294        {
2295          *err = REG_BADRPT;
2296          return NULL;
2297        }
2298      else if (syntax & RE_CONTEXT_INDEP_OPS)
2299        {
2300          fetch_token (token, regexp, syntax);
2301          return parse_expression (regexp, preg, token, syntax, nest, err);
2302        }
2303      /* else fall through  */
2304    case OP_CLOSE_SUBEXP:
2305      if ((token->type == OP_CLOSE_SUBEXP) &&
2306          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307        {
2308          *err = REG_ERPAREN;
2309          return NULL;
2310        }
2311      /* else fall through  */
2312    case OP_CLOSE_DUP_NUM:
2313      /* We treat it as a normal character.  */
2314
2315      /* Then we can these characters as normal characters.  */
2316      token->type = CHARACTER;
2317      /* mb_partial and word_char bits should be initialized already
2318         by peek_token.  */
2319      tree = create_token_tree (dfa, NULL, NULL, token);
2320      if (BE (tree == NULL, 0))
2321        {
2322          *err = REG_ESPACE;
2323          return NULL;
2324        }
2325      break;
2326    case ANCHOR:
2327      if ((token->opr.ctx_type
2328           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329          && dfa->word_ops_used == 0)
2330        init_word_char (dfa);
2331      if (token->opr.ctx_type == WORD_DELIM
2332          || token->opr.ctx_type == NOT_WORD_DELIM)
2333        {
2334          bin_tree_t *tree_first, *tree_last;
2335          if (token->opr.ctx_type == WORD_DELIM)
2336            {
2337              token->opr.ctx_type = WORD_FIRST;
2338              tree_first = create_token_tree (dfa, NULL, NULL, token);
2339              token->opr.ctx_type = WORD_LAST;
2340            }
2341          else
2342            {
2343              token->opr.ctx_type = INSIDE_WORD;
2344              tree_first = create_token_tree (dfa, NULL, NULL, token);
2345              token->opr.ctx_type = INSIDE_NOTWORD;
2346            }
2347          tree_last = create_token_tree (dfa, NULL, NULL, token);
2348          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350            {
2351              *err = REG_ESPACE;
2352              return NULL;
2353            }
2354        }
2355      else
2356        {
2357          tree = create_token_tree (dfa, NULL, NULL, token);
2358          if (BE (tree == NULL, 0))
2359            {
2360              *err = REG_ESPACE;
2361              return NULL;
2362            }
2363        }
2364      /* We must return here, since ANCHORs can't be followed
2365         by repetition operators.
2366         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368      fetch_token (token, regexp, syntax);
2369      return tree;
2370    case OP_PERIOD:
2371      tree = create_token_tree (dfa, NULL, NULL, token);
2372      if (BE (tree == NULL, 0))
2373        {
2374          *err = REG_ESPACE;
2375          return NULL;
2376        }
2377      if (dfa->mb_cur_max > 1)
2378        dfa->has_mb_node = 1;
2379      break;
2380    case OP_WORD:
2381    case OP_NOTWORD:
2382      tree = build_charclass_op (dfa, regexp->trans,
2383                                 "alnum",
2384                                 "_",
2385                                 token->type == OP_NOTWORD, err);
2386      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387        return NULL;
2388      break;
2389    case OP_SPACE:
2390    case OP_NOTSPACE:
2391      tree = build_charclass_op (dfa, regexp->trans,
2392                                 "space",
2393                                 "",
2394                                 token->type == OP_NOTSPACE, err);
2395      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396        return NULL;
2397      break;
2398    case OP_ALT:
2399    case END_OF_RE:
2400      return NULL;
2401    case BACK_SLASH:
2402      *err = REG_EESCAPE;
2403      return NULL;
2404    default:
2405      /* Must not happen?  */
2406#ifdef DEBUG
2407      assert (0);
2408#endif
2409      return NULL;
2410    }
2411  fetch_token (token, regexp, syntax);
2412
2413  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415    {
2416      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418        return NULL;
2419      /* In BRE consecutive duplications are not allowed.  */
2420      if ((syntax & RE_CONTEXT_INVALID_DUP)
2421          && (token->type == OP_DUP_ASTERISK
2422              || token->type == OP_OPEN_DUP_NUM))
2423        {
2424          *err = REG_BADRPT;
2425          return NULL;
2426        }
2427    }
2428
2429  return tree;
2430}
2431
2432/* This function build the following tree, from regular expression
2433   (<reg_exp>):
2434         SUBEXP
2435            |
2436        <reg_exp>
2437*/
2438
2439static bin_tree_t *
2440parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442{
2443  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444  bin_tree_t *tree;
2445  size_t cur_nsub;
2446  cur_nsub = preg->re_nsub++;
2447
2448  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450  /* The subexpression may be a null string.  */
2451  if (token->type == OP_CLOSE_SUBEXP)
2452    tree = NULL;
2453  else
2454    {
2455      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457        *err = REG_EPAREN;
2458      if (BE (*err != REG_NOERROR, 0))
2459        return NULL;
2460    }
2461
2462  if (cur_nsub <= '9' - '1')
2463    dfa->completed_bkref_map |= 1 << cur_nsub;
2464
2465  tree = create_tree (dfa, tree, NULL, SUBEXP);
2466  if (BE (tree == NULL, 0))
2467    {
2468      *err = REG_ESPACE;
2469      return NULL;
2470    }
2471  tree->token.opr.idx = cur_nsub;
2472  return tree;
2473}
2474
2475/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2476
2477static bin_tree_t *
2478parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480{
2481  bin_tree_t *tree = NULL, *old_tree = NULL;
2482  int i, start, end, start_idx = re_string_cur_idx (regexp);
2483#ifndef RE_TOKEN_INIT_BUG
2484  re_token_t start_token = *token;
2485#else
2486  re_token_t start_token;
2487
2488  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2489#endif
2490
2491  if (token->type == OP_OPEN_DUP_NUM)
2492    {
2493      end = 0;
2494      start = fetch_number (regexp, token, syntax);
2495      if (start == -1)
2496        {
2497          if (token->type == CHARACTER && token->opr.c == ',')
2498            start = 0; /* We treat "{,m}" as "{0,m}".  */
2499          else
2500            {
2501              *err = REG_BADBR; /* <re>{} is invalid.  */
2502              return NULL;
2503            }
2504        }
2505      if (BE (start != -2, 1))
2506        {
2507          /* We treat "{n}" as "{n,n}".  */
2508          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509                 : ((token->type == CHARACTER && token->opr.c == ',')
2510                    ? fetch_number (regexp, token, syntax) : -2));
2511        }
2512      if (BE (start == -2 || end == -2, 0))
2513        {
2514          /* Invalid sequence.  */
2515          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2516            {
2517              if (token->type == END_OF_RE)
2518                *err = REG_EBRACE;
2519              else
2520                *err = REG_BADBR;
2521
2522              return NULL;
2523            }
2524
2525          /* If the syntax bit is set, rollback.  */
2526          re_string_set_index (regexp, start_idx);
2527          *token = start_token;
2528          token->type = CHARACTER;
2529          /* mb_partial and word_char bits should be already initialized by
2530             peek_token.  */
2531          return elem;
2532        }
2533
2534      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2535        {
2536          /* First number greater than second.  */
2537          *err = REG_BADBR;
2538          return NULL;
2539        }
2540    }
2541  else
2542    {
2543      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2544      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2545    }
2546
2547  fetch_token (token, regexp, syntax);
2548
2549  if (BE (elem == NULL, 0))
2550    return NULL;
2551  if (BE (start == 0 && end == 0, 0))
2552    {
2553      postorder (elem, free_tree, NULL);
2554      return NULL;
2555    }
2556
2557  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2558  if (BE (start > 0, 0))
2559    {
2560      tree = elem;
2561      for (i = 2; i <= start; ++i)
2562        {
2563          elem = duplicate_tree (elem, dfa);
2564          tree = create_tree (dfa, tree, elem, CONCAT);
2565          if (BE (elem == NULL || tree == NULL, 0))
2566            goto parse_dup_op_espace;
2567        }
2568
2569      if (start == end)
2570        return tree;
2571
2572      /* Duplicate ELEM before it is marked optional.  */
2573      elem = duplicate_tree (elem, dfa);
2574      old_tree = tree;
2575    }
2576  else
2577    old_tree = NULL;
2578
2579  if (elem->token.type == SUBEXP)
2580    postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2581
2582  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2583  if (BE (tree == NULL, 0))
2584    goto parse_dup_op_espace;
2585
2586  /* This loop is actually executed only when end != -1,
2587     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2588     already created the start+1-th copy.  */
2589  for (i = start + 2; i <= end; ++i)
2590    {
2591      elem = duplicate_tree (elem, dfa);
2592      tree = create_tree (dfa, tree, elem, CONCAT);
2593      if (BE (elem == NULL || tree == NULL, 0))
2594        goto parse_dup_op_espace;
2595
2596      tree = create_tree (dfa, tree, NULL, OP_ALT);
2597      if (BE (tree == NULL, 0))
2598        goto parse_dup_op_espace;
2599    }
2600
2601  if (old_tree)
2602    tree = create_tree (dfa, old_tree, tree, CONCAT);
2603
2604  return tree;
2605
2606 parse_dup_op_espace:
2607  *err = REG_ESPACE;
2608  return NULL;
2609}
2610
2611/* Size of the names for collating symbol/equivalence_class/character_class.
2612   I'm not sure, but maybe enough.  */
2613#define BRACKET_NAME_BUF_SIZE 32
2614
2615#ifndef _LIBC
2616  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617     Build the range expression which starts from START_ELEM, and ends
2618     at END_ELEM.  The result are written to MBCSET and SBCSET.
2619     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620     mbcset->range_ends, is a pointer argument since we may
2621     update it.  */
2622
2623static reg_errcode_t
2624internal_function
2625# ifdef RE_ENABLE_I18N
2626build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2627                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2628# else /* not RE_ENABLE_I18N */
2629build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2630                 bracket_elem_t *end_elem)
2631# endif /* not RE_ENABLE_I18N */
2632{
2633  unsigned int start_ch, end_ch;
2634  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2635  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2636          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2637          0))
2638    return REG_ERANGE;
2639
2640  /* We can handle no multi character collating elements without libc
2641     support.  */
2642  if (BE ((start_elem->type == COLL_SYM
2643           && strlen ((char *) start_elem->opr.name) > 1)
2644          || (end_elem->type == COLL_SYM
2645              && strlen ((char *) end_elem->opr.name) > 1), 0))
2646    return REG_ECOLLATE;
2647
2648# ifdef RE_ENABLE_I18N
2649  {
2650    wchar_t wc;
2651    wint_t start_wc;
2652    wint_t end_wc;
2653    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2654
2655    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2656                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2657                   : 0));
2658    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2659              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2660                 : 0));
2661#ifdef GAWK
2662    /*
2663     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2664     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2665     * unsigned, so we don't have sign extension problems.
2666     */
2667    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668                ? start_ch : start_elem->opr.wch);
2669    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670              ? end_ch : end_elem->opr.wch);
2671#else
2672    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673                ? __btowc (start_ch) : start_elem->opr.wch);
2674    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675              ? __btowc (end_ch) : end_elem->opr.wch);
2676#endif
2677    if (start_wc == WEOF || end_wc == WEOF)
2678      return REG_ECOLLATE;
2679    cmp_buf[0] = start_wc;
2680    cmp_buf[4] = end_wc;
2681    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2682      return REG_ERANGE;
2683
2684    /* Got valid collation sequence values, add them as a new entry.
2685       However, for !_LIBC we have no collation elements: if the
2686       character set is single byte, the single byte character set
2687       that we build below suffices.  parse_bracket_exp passes
2688       no MBCSET if dfa->mb_cur_max == 1.  */
2689    if (mbcset)
2690      {
2691        /* Check the space of the arrays.  */
2692        if (BE (*range_alloc == mbcset->nranges, 0))
2693          {
2694            /* There is not enough space, need realloc.  */
2695            wchar_t *new_array_start, *new_array_end;
2696            int new_nranges;
2697
2698            /* +1 in case of mbcset->nranges is 0.  */
2699            new_nranges = 2 * mbcset->nranges + 1;
2700            /* Use realloc since mbcset->range_starts and mbcset->range_ends
2701               are NULL if *range_alloc == 0.  */
2702            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2703                                          new_nranges);
2704            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2705                                        new_nranges);
2706
2707            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2708              return REG_ESPACE;
2709
2710            mbcset->range_starts = new_array_start;
2711            mbcset->range_ends = new_array_end;
2712            *range_alloc = new_nranges;
2713          }
2714
2715        mbcset->range_starts[mbcset->nranges] = start_wc;
2716        mbcset->range_ends[mbcset->nranges++] = end_wc;
2717      }
2718
2719    /* Build the table for single byte characters.  */
2720    for (wc = 0; wc < SBC_MAX; ++wc)
2721      {
2722        cmp_buf[2] = wc;
2723        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2724            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2725          bitset_set (sbcset, wc);
2726      }
2727  }
2728# else /* not RE_ENABLE_I18N */
2729  {
2730    unsigned int ch;
2731    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2732                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2733                   : 0));
2734    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2735              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2736                 : 0));
2737    if (start_ch > end_ch)
2738      return REG_ERANGE;
2739    /* Build the table for single byte characters.  */
2740    for (ch = 0; ch < SBC_MAX; ++ch)
2741      if (start_ch <= ch  && ch <= end_ch)
2742        bitset_set (sbcset, ch);
2743  }
2744# endif /* not RE_ENABLE_I18N */
2745  return REG_NOERROR;
2746}
2747#endif /* not _LIBC */
2748
2749#ifndef _LIBC
2750/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2751   Build the collating element which is represented by NAME.
2752   The result are written to MBCSET and SBCSET.
2753   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2754   pointer argument since we may update it.  */
2755
2756static reg_errcode_t
2757internal_function
2758# ifdef RE_ENABLE_I18N
2759build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2760                        int *coll_sym_alloc, const unsigned char *name)
2761# else /* not RE_ENABLE_I18N */
2762build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2763# endif /* not RE_ENABLE_I18N */
2764{
2765  size_t name_len = strlen ((const char *) name);
2766  if (BE (name_len != 1, 0))
2767    return REG_ECOLLATE;
2768  else
2769    {
2770      bitset_set (sbcset, name[0]);
2771      return REG_NOERROR;
2772    }
2773}
2774#endif /* not _LIBC */
2775
2776/* This function parse bracket expression like "[abc]", "[a-c]",
2777   "[[.a-a.]]" etc.  */
2778
2779static bin_tree_t *
2780parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2781                   reg_syntax_t syntax, reg_errcode_t *err)
2782{
2783#ifdef _LIBC
2784  const unsigned char *collseqmb;
2785  const char *collseqwc;
2786  uint32_t nrules;
2787  int32_t table_size;
2788  const int32_t *symb_table;
2789  const unsigned char *extra;
2790
2791  /* Local function for parse_bracket_exp used in _LIBC environment.
2792     Seek the collating symbol entry correspondings to NAME.
2793     Return the index of the symbol in the SYMB_TABLE.  */
2794
2795  auto inline int32_t
2796  __attribute ((always_inline))
2797  seek_collating_symbol_entry (name, name_len)
2798         const unsigned char *name;
2799         size_t name_len;
2800    {
2801      int32_t hash = elem_hash ((const char *) name, name_len);
2802      int32_t elem = hash % table_size;
2803      if (symb_table[2 * elem] != 0)
2804        {
2805          int32_t second = hash % (table_size - 2) + 1;
2806
2807          do
2808            {
2809              /* First compare the hashing value.  */
2810              if (symb_table[2 * elem] == hash
2811                  /* Compare the length of the name.  */
2812                  && name_len == extra[symb_table[2 * elem + 1]]
2813                  /* Compare the name.  */
2814                  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2815                             name_len) == 0)
2816                {
2817                  /* Yep, this is the entry.  */
2818                  break;
2819                }
2820
2821              /* Next entry.  */
2822              elem += second;
2823            }
2824          while (symb_table[2 * elem] != 0);
2825        }
2826      return elem;
2827    }
2828
2829  /* Local function for parse_bracket_exp used in _LIBC environment.
2830     Look up the collation sequence value of BR_ELEM.
2831     Return the value if succeeded, UINT_MAX otherwise.  */
2832
2833  auto inline unsigned int
2834  __attribute ((always_inline))
2835  lookup_collation_sequence_value (br_elem)
2836         bracket_elem_t *br_elem;
2837    {
2838      if (br_elem->type == SB_CHAR)
2839        {
2840          /*
2841          if (MB_CUR_MAX == 1)
2842          */
2843          if (nrules == 0)
2844            return collseqmb[br_elem->opr.ch];
2845          else
2846            {
2847              wint_t wc = __btowc (br_elem->opr.ch);
2848              return __collseq_table_lookup (collseqwc, wc);
2849            }
2850        }
2851      else if (br_elem->type == MB_CHAR)
2852        {
2853          if (nrules != 0)
2854            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2855        }
2856      else if (br_elem->type == COLL_SYM)
2857        {
2858          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2859          if (nrules != 0)
2860            {
2861              int32_t elem, idx;
2862              elem = seek_collating_symbol_entry (br_elem->opr.name,
2863                                                  sym_name_len);
2864              if (symb_table[2 * elem] != 0)
2865                {
2866                  /* We found the entry.  */
2867                  idx = symb_table[2 * elem + 1];
2868                  /* Skip the name of collating element name.  */
2869                  idx += 1 + extra[idx];
2870                  /* Skip the byte sequence of the collating element.  */
2871                  idx += 1 + extra[idx];
2872                  /* Adjust for the alignment.  */
2873                  idx = (idx + 3) & ~3;
2874                  /* Skip the multibyte collation sequence value.  */
2875                  idx += sizeof (unsigned int);
2876                  /* Skip the wide char sequence of the collating element.  */
2877                  idx += sizeof (unsigned int) *
2878                    (1 + *(unsigned int *) (extra + idx));
2879                  /* Return the collation sequence value.  */
2880                  return *(unsigned int *) (extra + idx);
2881                }
2882              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2883                {
2884                  /* No valid character.  Match it as a single byte
2885                     character.  */
2886                  return collseqmb[br_elem->opr.name[0]];
2887                }
2888            }
2889          else if (sym_name_len == 1)
2890            return collseqmb[br_elem->opr.name[0]];
2891        }
2892      return UINT_MAX;
2893    }
2894
2895  /* Local function for parse_bracket_exp used in _LIBC environment.
2896     Build the range expression which starts from START_ELEM, and ends
2897     at END_ELEM.  The result are written to MBCSET and SBCSET.
2898     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2899     mbcset->range_ends, is a pointer argument since we may
2900     update it.  */
2901
2902  auto inline reg_errcode_t
2903  __attribute ((always_inline))
2904  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2905         re_charset_t *mbcset;
2906         int *range_alloc;
2907         bitset_t sbcset;
2908         bracket_elem_t *start_elem, *end_elem;
2909    {
2910      unsigned int ch;
2911      uint32_t start_collseq;
2912      uint32_t end_collseq;
2913
2914      /* Equivalence Classes and Character Classes can't be a range
2915         start/end.  */
2916      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2917              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2918              0))
2919        return REG_ERANGE;
2920
2921      start_collseq = lookup_collation_sequence_value (start_elem);
2922      end_collseq = lookup_collation_sequence_value (end_elem);
2923      /* Check start/end collation sequence values.  */
2924      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2925        return REG_ECOLLATE;
2926      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2927        return REG_ERANGE;
2928
2929      /* Got valid collation sequence values, add them as a new entry.
2930         However, if we have no collation elements, and the character set
2931         is single byte, the single byte character set that we
2932         build below suffices. */
2933      if (nrules > 0 || dfa->mb_cur_max > 1)
2934        {
2935          /* Check the space of the arrays.  */
2936          if (BE (*range_alloc == mbcset->nranges, 0))
2937            {
2938              /* There is not enough space, need realloc.  */
2939              uint32_t *new_array_start;
2940              uint32_t *new_array_end;
2941              int new_nranges;
2942
2943              /* +1 in case of mbcset->nranges is 0.  */
2944              new_nranges = 2 * mbcset->nranges + 1;
2945              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2946                                            new_nranges);
2947              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2948                                          new_nranges);
2949
2950              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2951                return REG_ESPACE;
2952
2953              mbcset->range_starts = new_array_start;
2954              mbcset->range_ends = new_array_end;
2955              *range_alloc = new_nranges;
2956            }
2957
2958          mbcset->range_starts[mbcset->nranges] = start_collseq;
2959          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2960        }
2961
2962      /* Build the table for single byte characters.  */
2963      for (ch = 0; ch < SBC_MAX; ch++)
2964        {
2965          uint32_t ch_collseq;
2966          /*
2967          if (MB_CUR_MAX == 1)
2968          */
2969          if (nrules == 0)
2970            ch_collseq = collseqmb[ch];
2971          else
2972            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2973          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2974            bitset_set (sbcset, ch);
2975        }
2976      return REG_NOERROR;
2977    }
2978
2979  /* Local function for parse_bracket_exp used in _LIBC environment.
2980     Build the collating element which is represented by NAME.
2981     The result are written to MBCSET and SBCSET.
2982     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2983     pointer argument since we may update it.  */
2984
2985  auto inline reg_errcode_t
2986  __attribute ((always_inline))
2987  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2988         re_charset_t *mbcset;
2989         int *coll_sym_alloc;
2990         bitset_t sbcset;
2991         const unsigned char *name;
2992    {
2993      int32_t elem, idx;
2994      size_t name_len = strlen ((const char *) name);
2995      if (nrules != 0)
2996        {
2997          elem = seek_collating_symbol_entry (name, name_len);
2998          if (symb_table[2 * elem] != 0)
2999            {
3000              /* We found the entry.  */
3001              idx = symb_table[2 * elem + 1];
3002              /* Skip the name of collating element name.  */
3003              idx += 1 + extra[idx];
3004            }
3005          else if (symb_table[2 * elem] == 0 && name_len == 1)
3006            {
3007              /* No valid character, treat it as a normal
3008                 character.  */
3009              bitset_set (sbcset, name[0]);
3010              return REG_NOERROR;
3011            }
3012          else
3013            return REG_ECOLLATE;
3014
3015          /* Got valid collation sequence, add it as a new entry.  */
3016          /* Check the space of the arrays.  */
3017          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3018            {
3019              /* Not enough, realloc it.  */
3020              /* +1 in case of mbcset->ncoll_syms is 0.  */
3021              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3022              /* Use realloc since mbcset->coll_syms is NULL
3023                 if *alloc == 0.  */
3024              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3025                                                   new_coll_sym_alloc);
3026              if (BE (new_coll_syms == NULL, 0))
3027                return REG_ESPACE;
3028              mbcset->coll_syms = new_coll_syms;
3029              *coll_sym_alloc = new_coll_sym_alloc;
3030            }
3031          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3032          return REG_NOERROR;
3033        }
3034      else
3035        {
3036          if (BE (name_len != 1, 0))
3037            return REG_ECOLLATE;
3038          else
3039            {
3040              bitset_set (sbcset, name[0]);
3041              return REG_NOERROR;
3042            }
3043        }
3044    }
3045#endif
3046
3047  re_token_t br_token;
3048  re_bitset_ptr_t sbcset;
3049#ifdef RE_ENABLE_I18N
3050  re_charset_t *mbcset;
3051  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3052  int equiv_class_alloc = 0, char_class_alloc = 0;
3053#endif /* not RE_ENABLE_I18N */
3054  int non_match = 0;
3055  bin_tree_t *work_tree;
3056  int token_len;
3057  int first_round = 1;
3058#ifdef _LIBC
3059  collseqmb = (const unsigned char *)
3060    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3061  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3062  if (nrules)
3063    {
3064      /*
3065      if (MB_CUR_MAX > 1)
3066      */
3067      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3068      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3069      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3070                                                  _NL_COLLATE_SYMB_TABLEMB);
3071      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3072                                                   _NL_COLLATE_SYMB_EXTRAMB);
3073    }
3074#endif
3075  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3076#ifdef RE_ENABLE_I18N
3077  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3078#endif /* RE_ENABLE_I18N */
3079#ifdef RE_ENABLE_I18N
3080  if (BE (sbcset == NULL || mbcset == NULL, 0))
3081#else
3082  if (BE (sbcset == NULL, 0))
3083#endif /* RE_ENABLE_I18N */
3084    {
3085      *err = REG_ESPACE;
3086      return NULL;
3087    }
3088
3089  token_len = peek_token_bracket (token, regexp, syntax);
3090  if (BE (token->type == END_OF_RE, 0))
3091    {
3092      *err = REG_BADPAT;
3093      goto parse_bracket_exp_free_return;
3094    }
3095  if (token->type == OP_NON_MATCH_LIST)
3096    {
3097#ifdef RE_ENABLE_I18N
3098      mbcset->non_match = 1;
3099#endif /* not RE_ENABLE_I18N */
3100      non_match = 1;
3101      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3102        bitset_set (sbcset, '\n');
3103      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3104      token_len = peek_token_bracket (token, regexp, syntax);
3105      if (BE (token->type == END_OF_RE, 0))
3106        {
3107          *err = REG_BADPAT;
3108          goto parse_bracket_exp_free_return;
3109        }
3110    }
3111
3112  /* We treat the first ']' as a normal character.  */
3113  if (token->type == OP_CLOSE_BRACKET)
3114    token->type = CHARACTER;
3115
3116  while (1)
3117    {
3118      bracket_elem_t start_elem, end_elem;
3119      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3120      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3121      reg_errcode_t ret;
3122      int token_len2 = 0, is_range_exp = 0;
3123      re_token_t token2;
3124
3125      start_elem.opr.name = start_name_buf;
3126      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3127                                   syntax, first_round);
3128      if (BE (ret != REG_NOERROR, 0))
3129        {
3130          *err = ret;
3131          goto parse_bracket_exp_free_return;
3132        }
3133      first_round = 0;
3134
3135      /* Get information about the next token.  We need it in any case.  */
3136      token_len = peek_token_bracket (token, regexp, syntax);
3137
3138      /* Do not check for ranges if we know they are not allowed.  */
3139      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3140        {
3141          if (BE (token->type == END_OF_RE, 0))
3142            {
3143              *err = REG_EBRACK;
3144              goto parse_bracket_exp_free_return;
3145            }
3146          if (token->type == OP_CHARSET_RANGE)
3147            {
3148              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3149              token_len2 = peek_token_bracket (&token2, regexp, syntax);
3150              if (BE (token2.type == END_OF_RE, 0))
3151                {
3152                  *err = REG_EBRACK;
3153                  goto parse_bracket_exp_free_return;
3154                }
3155              if (token2.type == OP_CLOSE_BRACKET)
3156                {
3157                  /* We treat the last '-' as a normal character.  */
3158                  re_string_skip_bytes (regexp, -token_len);
3159                  token->type = CHARACTER;
3160                }
3161              else
3162                is_range_exp = 1;
3163            }
3164        }
3165
3166      if (is_range_exp == 1)
3167        {
3168          end_elem.opr.name = end_name_buf;
3169          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3170                                       dfa, syntax, 1);
3171          if (BE (ret != REG_NOERROR, 0))
3172            {
3173              *err = ret;
3174              goto parse_bracket_exp_free_return;
3175            }
3176
3177          token_len = peek_token_bracket (token, regexp, syntax);
3178
3179#ifdef _LIBC
3180          *err = build_range_exp (sbcset, mbcset, &range_alloc,
3181                                  &start_elem, &end_elem);
3182#else
3183# ifdef RE_ENABLE_I18N
3184          *err = build_range_exp (sbcset,
3185                                  dfa->mb_cur_max > 1 ? mbcset : NULL,
3186                                  &range_alloc, &start_elem, &end_elem);
3187# else
3188          *err = build_range_exp (sbcset, &start_elem, &end_elem);
3189# endif
3190#endif /* RE_ENABLE_I18N */
3191          if (BE (*err != REG_NOERROR, 0))
3192            goto parse_bracket_exp_free_return;
3193        }
3194      else
3195        {
3196          switch (start_elem.type)
3197            {
3198            case SB_CHAR:
3199              bitset_set (sbcset, start_elem.opr.ch);
3200              break;
3201#ifdef RE_ENABLE_I18N
3202            case MB_CHAR:
3203              /* Check whether the array has enough space.  */
3204              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3205                {
3206                  wchar_t *new_mbchars;
3207                  /* Not enough, realloc it.  */
3208                  /* +1 in case of mbcset->nmbchars is 0.  */
3209                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3210                  /* Use realloc since array is NULL if *alloc == 0.  */
3211                  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3212                                            mbchar_alloc);
3213                  if (BE (new_mbchars == NULL, 0))
3214                    goto parse_bracket_exp_espace;
3215                  mbcset->mbchars = new_mbchars;
3216                }
3217              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3218              break;
3219#endif /* RE_ENABLE_I18N */
3220            case EQUIV_CLASS:
3221              *err = build_equiv_class (sbcset,
3222#ifdef RE_ENABLE_I18N
3223                                        mbcset, &equiv_class_alloc,
3224#endif /* RE_ENABLE_I18N */
3225                                        start_elem.opr.name);
3226              if (BE (*err != REG_NOERROR, 0))
3227                goto parse_bracket_exp_free_return;
3228              break;
3229            case COLL_SYM:
3230              *err = build_collating_symbol (sbcset,
3231#ifdef RE_ENABLE_I18N
3232                                             mbcset, &coll_sym_alloc,
3233#endif /* RE_ENABLE_I18N */
3234                                             start_elem.opr.name);
3235              if (BE (*err != REG_NOERROR, 0))
3236                goto parse_bracket_exp_free_return;
3237              break;
3238            case CHAR_CLASS:
3239              *err = build_charclass (regexp->trans, sbcset,
3240#ifdef RE_ENABLE_I18N
3241                                      mbcset, &char_class_alloc,
3242#endif /* RE_ENABLE_I18N */
3243                                      (const char *) start_elem.opr.name, syntax);
3244              if (BE (*err != REG_NOERROR, 0))
3245               goto parse_bracket_exp_free_return;
3246              break;
3247            default:
3248              assert (0);
3249              break;
3250            }
3251        }
3252      if (BE (token->type == END_OF_RE, 0))
3253        {
3254          *err = REG_EBRACK;
3255          goto parse_bracket_exp_free_return;
3256        }
3257      if (token->type == OP_CLOSE_BRACKET)
3258        break;
3259    }
3260
3261  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3262
3263  /* If it is non-matching list.  */
3264  if (non_match)
3265    bitset_not (sbcset);
3266
3267#ifdef RE_ENABLE_I18N
3268  /* Ensure only single byte characters are set.  */
3269  if (dfa->mb_cur_max > 1)
3270    bitset_mask (sbcset, dfa->sb_char);
3271
3272  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3273      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3274                                                     || mbcset->non_match)))
3275    {
3276      bin_tree_t *mbc_tree;
3277      int sbc_idx;
3278      /* Build a tree for complex bracket.  */
3279      dfa->has_mb_node = 1;
3280      br_token.type = COMPLEX_BRACKET;
3281      br_token.opr.mbcset = mbcset;
3282      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283      if (BE (mbc_tree == NULL, 0))
3284        goto parse_bracket_exp_espace;
3285      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3286        if (sbcset[sbc_idx])
3287          break;
3288      /* If there are no bits set in sbcset, there is no point
3289         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3290      if (sbc_idx < BITSET_WORDS)
3291        {
3292          /* Build a tree for simple bracket.  */
3293          br_token.type = SIMPLE_BRACKET;
3294          br_token.opr.sbcset = sbcset;
3295          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296          if (BE (work_tree == NULL, 0))
3297            goto parse_bracket_exp_espace;
3298
3299          /* Then join them by ALT node.  */
3300          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3301          if (BE (work_tree == NULL, 0))
3302            goto parse_bracket_exp_espace;
3303        }
3304      else
3305        {
3306          re_free (sbcset);
3307          work_tree = mbc_tree;
3308        }
3309    }
3310  else
3311#endif /* not RE_ENABLE_I18N */
3312    {
3313#ifdef RE_ENABLE_I18N
3314      free_charset (mbcset);
3315#endif
3316      /* Build a tree for simple bracket.  */
3317      br_token.type = SIMPLE_BRACKET;
3318      br_token.opr.sbcset = sbcset;
3319      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3320      if (BE (work_tree == NULL, 0))
3321        goto parse_bracket_exp_espace;
3322    }
3323  return work_tree;
3324
3325 parse_bracket_exp_espace:
3326  *err = REG_ESPACE;
3327 parse_bracket_exp_free_return:
3328  re_free (sbcset);
3329#ifdef RE_ENABLE_I18N
3330  free_charset (mbcset);
3331#endif /* RE_ENABLE_I18N */
3332  return NULL;
3333}
3334
3335/* Parse an element in the bracket expression.  */
3336
3337static reg_errcode_t
3338parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3339                       re_token_t *token, int token_len, re_dfa_t *dfa,
3340                       reg_syntax_t syntax, int accept_hyphen)
3341{
3342#ifdef RE_ENABLE_I18N
3343  int cur_char_size;
3344  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3345  if (cur_char_size > 1)
3346    {
3347      elem->type = MB_CHAR;
3348      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3349      re_string_skip_bytes (regexp, cur_char_size);
3350      return REG_NOERROR;
3351    }
3352#endif /* RE_ENABLE_I18N */
3353  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3354  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3355      || token->type == OP_OPEN_EQUIV_CLASS)
3356    return parse_bracket_symbol (elem, regexp, token);
3357  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3358    {
3359      /* A '-' must only appear as anything but a range indicator before
3360         the closing bracket.  Everything else is an error.  */
3361      re_token_t token2;
3362      (void) peek_token_bracket (&token2, regexp, syntax);
3363      if (token2.type != OP_CLOSE_BRACKET)
3364        /* The actual error value is not standardized since this whole
3365           case is undefined.  But ERANGE makes good sense.  */
3366        return REG_ERANGE;
3367    }
3368  elem->type = SB_CHAR;
3369  elem->opr.ch = token->opr.c;
3370  return REG_NOERROR;
3371}
3372
3373/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3374   such as [:<character_class>:], [.<collating_element>.], and
3375   [=<equivalent_class>=].  */
3376
3377static reg_errcode_t
3378parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3379                      re_token_t *token)
3380{
3381  unsigned char ch, delim = token->opr.c;
3382  int i = 0;
3383  if (re_string_eoi(regexp))
3384    return REG_EBRACK;
3385  for (;; ++i)
3386    {
3387      if (i >= BRACKET_NAME_BUF_SIZE)
3388        return REG_EBRACK;
3389      if (token->type == OP_OPEN_CHAR_CLASS)
3390        ch = re_string_fetch_byte_case (regexp);
3391      else
3392        ch = re_string_fetch_byte (regexp);
3393      if (re_string_eoi(regexp))
3394        return REG_EBRACK;
3395      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3396        break;
3397      elem->opr.name[i] = ch;
3398    }
3399  re_string_skip_bytes (regexp, 1);
3400  elem->opr.name[i] = '\0';
3401  switch (token->type)
3402    {
3403    case OP_OPEN_COLL_ELEM:
3404      elem->type = COLL_SYM;
3405      break;
3406    case OP_OPEN_EQUIV_CLASS:
3407      elem->type = EQUIV_CLASS;
3408      break;
3409    case OP_OPEN_CHAR_CLASS:
3410      elem->type = CHAR_CLASS;
3411      break;
3412    default:
3413      break;
3414    }
3415  return REG_NOERROR;
3416}
3417
3418  /* Helper function for parse_bracket_exp.
3419     Build the equivalence class which is represented by NAME.
3420     The result are written to MBCSET and SBCSET.
3421     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3422     is a pointer argument since we may update it.  */
3423
3424static reg_errcode_t
3425#ifdef RE_ENABLE_I18N
3426build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3427                   int *equiv_class_alloc, const unsigned char *name)
3428#else /* not RE_ENABLE_I18N */
3429build_equiv_class (bitset_t sbcset, const unsigned char *name)
3430#endif /* not RE_ENABLE_I18N */
3431{
3432#ifdef _LIBC
3433  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3434  if (nrules != 0)
3435    {
3436      const int32_t *table, *indirect;
3437      const unsigned char *weights, *extra, *cp;
3438      unsigned char char_buf[2];
3439      int32_t idx1, idx2;
3440      unsigned int ch;
3441      size_t len;
3442      /* This #include defines a local function!  */
3443# include <locale/weight.h>
3444      /* Calculate the index for equivalence class.  */
3445      cp = name;
3446      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3447      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3448                                               _NL_COLLATE_WEIGHTMB);
3449      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450                                                   _NL_COLLATE_EXTRAMB);
3451      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3452                                                _NL_COLLATE_INDIRECTMB);
3453      idx1 = findidx (&cp);
3454      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3455        /* This isn't a valid character.  */
3456        return REG_ECOLLATE;
3457
3458      /* Build single byte matcing table for this equivalence class.  */
3459      char_buf[1] = (unsigned char) '\0';
3460      len = weights[idx1 & 0xffffff];
3461      for (ch = 0; ch < SBC_MAX; ++ch)
3462        {
3463          char_buf[0] = ch;
3464          cp = char_buf;
3465          idx2 = findidx (&cp);
3466/*
3467          idx2 = table[ch];
3468*/
3469          if (idx2 == 0)
3470            /* This isn't a valid character.  */
3471            continue;
3472          /* Compare only if the length matches and the collation rule
3473             index is the same.  */
3474          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3475            {
3476              int cnt = 0;
3477
3478              while (cnt <= len &&
3479                     weights[(idx1 & 0xffffff) + 1 + cnt]
3480                     == weights[(idx2 & 0xffffff) + 1 + cnt])
3481                ++cnt;
3482
3483              if (cnt > len)
3484                bitset_set (sbcset, ch);
3485            }
3486        }
3487      /* Check whether the array has enough space.  */
3488      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3489        {
3490          /* Not enough, realloc it.  */
3491          /* +1 in case of mbcset->nequiv_classes is 0.  */
3492          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3493          /* Use realloc since the array is NULL if *alloc == 0.  */
3494          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3495                                                   int32_t,
3496                                                   new_equiv_class_alloc);
3497          if (BE (new_equiv_classes == NULL, 0))
3498            return REG_ESPACE;
3499          mbcset->equiv_classes = new_equiv_classes;
3500          *equiv_class_alloc = new_equiv_class_alloc;
3501        }
3502      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3503    }
3504  else
3505#endif /* _LIBC */
3506    {
3507      if (BE (strlen ((const char *) name) != 1, 0))
3508        return REG_ECOLLATE;
3509      bitset_set (sbcset, *name);
3510    }
3511  return REG_NOERROR;
3512}
3513
3514  /* Helper function for parse_bracket_exp.
3515     Build the character class which is represented by NAME.
3516     The result are written to MBCSET and SBCSET.
3517     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3518     is a pointer argument since we may update it.  */
3519
3520static reg_errcode_t
3521#ifdef RE_ENABLE_I18N
3522build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523                 re_charset_t *mbcset, int *char_class_alloc,
3524                 const char *class_name, reg_syntax_t syntax)
3525#else /* not RE_ENABLE_I18N */
3526build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527                 const char *class_name, reg_syntax_t syntax)
3528#endif /* not RE_ENABLE_I18N */
3529{
3530  int i;
3531
3532  /* In case of REG_ICASE "upper" and "lower" match the both of
3533     upper and lower cases.  */
3534  if ((syntax & RE_ICASE)
3535      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3536    class_name = "alpha";
3537
3538#ifdef RE_ENABLE_I18N
3539  /* Check the space of the arrays.  */
3540  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3541    {
3542      /* Not enough, realloc it.  */
3543      /* +1 in case of mbcset->nchar_classes is 0.  */
3544      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3545      /* Use realloc since array is NULL if *alloc == 0.  */
3546      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3547                                               new_char_class_alloc);
3548      if (BE (new_char_classes == NULL, 0))
3549        return REG_ESPACE;
3550      mbcset->char_classes = new_char_classes;
3551      *char_class_alloc = new_char_class_alloc;
3552    }
3553  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3554#endif /* RE_ENABLE_I18N */
3555
3556#define BUILD_CHARCLASS_LOOP(ctype_func)        \
3557  do {                                          \
3558    if (BE (trans != NULL, 0))                  \
3559      {                                         \
3560        for (i = 0; i < SBC_MAX; ++i)           \
3561          if (ctype_func (i))                   \
3562            bitset_set (sbcset, trans[i]);      \
3563      }                                         \
3564    else                                        \
3565      {                                         \
3566        for (i = 0; i < SBC_MAX; ++i)           \
3567          if (ctype_func (i))                   \
3568            bitset_set (sbcset, i);             \
3569      }                                         \
3570  } while (0)
3571
3572  if (strcmp (class_name, "alnum") == 0)
3573    BUILD_CHARCLASS_LOOP (isalnum);
3574  else if (strcmp (class_name, "cntrl") == 0)
3575    BUILD_CHARCLASS_LOOP (iscntrl);
3576  else if (strcmp (class_name, "lower") == 0)
3577    BUILD_CHARCLASS_LOOP (islower);
3578  else if (strcmp (class_name, "space") == 0)
3579    BUILD_CHARCLASS_LOOP (isspace);
3580  else if (strcmp (class_name, "alpha") == 0)
3581    BUILD_CHARCLASS_LOOP (isalpha);
3582  else if (strcmp (class_name, "digit") == 0)
3583    BUILD_CHARCLASS_LOOP (isdigit);
3584  else if (strcmp (class_name, "print") == 0)
3585    BUILD_CHARCLASS_LOOP (isprint);
3586  else if (strcmp (class_name, "upper") == 0)
3587    BUILD_CHARCLASS_LOOP (isupper);
3588  else if (strcmp (class_name, "blank") == 0)
3589#ifndef GAWK
3590    BUILD_CHARCLASS_LOOP (isblank);
3591#else
3592    /* see comments above */
3593    BUILD_CHARCLASS_LOOP (is_blank);
3594#endif
3595  else if (strcmp (class_name, "graph") == 0)
3596    BUILD_CHARCLASS_LOOP (isgraph);
3597  else if (strcmp (class_name, "punct") == 0)
3598    BUILD_CHARCLASS_LOOP (ispunct);
3599  else if (strcmp (class_name, "xdigit") == 0)
3600    BUILD_CHARCLASS_LOOP (isxdigit);
3601  else
3602    return REG_ECTYPE;
3603
3604  return REG_NOERROR;
3605}
3606
3607static bin_tree_t *
3608build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3609                    const char *class_name,
3610                    const char *extra, int non_match,
3611                    reg_errcode_t *err)
3612{
3613  re_bitset_ptr_t sbcset;
3614#ifdef RE_ENABLE_I18N
3615  re_charset_t *mbcset;
3616  int alloc = 0;
3617#endif /* not RE_ENABLE_I18N */
3618  reg_errcode_t ret;
3619  re_token_t br_token;
3620  bin_tree_t *tree;
3621
3622  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3623#ifdef RE_ENABLE_I18N
3624  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3625#endif /* RE_ENABLE_I18N */
3626
3627#ifdef RE_ENABLE_I18N
3628  if (BE (sbcset == NULL || mbcset == NULL, 0))
3629#else /* not RE_ENABLE_I18N */
3630  if (BE (sbcset == NULL, 0))
3631#endif /* not RE_ENABLE_I18N */
3632    {
3633      *err = REG_ESPACE;
3634      return NULL;
3635    }
3636
3637  if (non_match)
3638    {
3639#ifdef RE_ENABLE_I18N
3640      mbcset->non_match = 1;
3641#endif /* not RE_ENABLE_I18N */
3642    }
3643
3644  /* We don't care the syntax in this case.  */
3645  ret = build_charclass (trans, sbcset,
3646#ifdef RE_ENABLE_I18N
3647                         mbcset, &alloc,
3648#endif /* RE_ENABLE_I18N */
3649                         class_name, 0);
3650
3651  if (BE (ret != REG_NOERROR, 0))
3652    {
3653      re_free (sbcset);
3654#ifdef RE_ENABLE_I18N
3655      free_charset (mbcset);
3656#endif /* RE_ENABLE_I18N */
3657      *err = ret;
3658      return NULL;
3659    }
3660  /* \w match '_' also.  */
3661  for (; *extra; extra++)
3662    bitset_set (sbcset, *extra);
3663
3664  /* If it is non-matching list.  */
3665  if (non_match)
3666    bitset_not (sbcset);
3667
3668#ifdef RE_ENABLE_I18N
3669  /* Ensure only single byte characters are set.  */
3670  if (dfa->mb_cur_max > 1)
3671    bitset_mask (sbcset, dfa->sb_char);
3672#endif
3673
3674  /* Build a tree for simple bracket.  */
3675  br_token.type = SIMPLE_BRACKET;
3676  br_token.opr.sbcset = sbcset;
3677  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3678  if (BE (tree == NULL, 0))
3679    goto build_word_op_espace;
3680
3681#ifdef RE_ENABLE_I18N
3682  if (dfa->mb_cur_max > 1)
3683    {
3684      bin_tree_t *mbc_tree;
3685      /* Build a tree for complex bracket.  */
3686      br_token.type = COMPLEX_BRACKET;
3687      br_token.opr.mbcset = mbcset;
3688      dfa->has_mb_node = 1;
3689      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3690      if (BE (mbc_tree == NULL, 0))
3691        goto build_word_op_espace;
3692      /* Then join them by ALT node.  */
3693      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3694      if (BE (mbc_tree != NULL, 1))
3695        return tree;
3696    }
3697  else
3698    {
3699      free_charset (mbcset);
3700      return tree;
3701    }
3702#else /* not RE_ENABLE_I18N */
3703  return tree;
3704#endif /* not RE_ENABLE_I18N */
3705
3706 build_word_op_espace:
3707  re_free (sbcset);
3708#ifdef RE_ENABLE_I18N
3709  free_charset (mbcset);
3710#endif /* RE_ENABLE_I18N */
3711  *err = REG_ESPACE;
3712  return NULL;
3713}
3714
3715/* This is intended for the expressions like "a{1,3}".
3716   Fetch a number from `input', and return the number.
3717   Return -1, if the number field is empty like "{,1}".
3718   Return -2, if an error has occurred.  */
3719
3720static int
3721fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3722{
3723  int num = -1;
3724  unsigned char c;
3725  while (1)
3726    {
3727      fetch_token (token, input, syntax);
3728      c = token->opr.c;
3729      if (BE (token->type == END_OF_RE, 0))
3730        return -2;
3731      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3732        break;
3733      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3734             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3735      num = (num > RE_DUP_MAX) ? -2 : num;
3736    }
3737  return num;
3738}
3739\f
3740#ifdef RE_ENABLE_I18N
3741static void
3742free_charset (re_charset_t *cset)
3743{
3744  re_free (cset->mbchars);
3745# ifdef _LIBC
3746  re_free (cset->coll_syms);
3747  re_free (cset->equiv_classes);
3748  re_free (cset->range_starts);
3749  re_free (cset->range_ends);
3750# endif
3751  re_free (cset->char_classes);
3752  re_free (cset);
3753}
3754#endif /* RE_ENABLE_I18N */
3755\f
3756/* Functions for binary tree operation.  */
3757
3758/* Create a tree node.  */
3759
3760static bin_tree_t *
3761create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3762             re_token_type_t type)
3763{
3764  re_token_t t;
3765  t.type = type;
3766  return create_token_tree (dfa, left, right, &t);
3767}
3768
3769static bin_tree_t *
3770create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3771                   const re_token_t *token)
3772{
3773  bin_tree_t *tree;
3774  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3775    {
3776      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3777
3778      if (storage == NULL)
3779        return NULL;
3780      storage->next = dfa->str_tree_storage;
3781      dfa->str_tree_storage = storage;
3782      dfa->str_tree_storage_idx = 0;
3783    }
3784  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3785
3786  tree->parent = NULL;
3787  tree->left = left;
3788  tree->right = right;
3789  tree->token = *token;
3790  tree->token.duplicated = 0;
3791  tree->token.opt_subexp = 0;
3792  tree->first = NULL;
3793  tree->next = NULL;
3794  tree->node_idx = -1;
3795
3796  if (left != NULL)
3797    left->parent = tree;
3798  if (right != NULL)
3799    right->parent = tree;
3800  return tree;
3801}
3802
3803/* Mark the tree SRC as an optional subexpression.
3804   To be called from preorder or postorder.  */
3805
3806static reg_errcode_t
3807mark_opt_subexp (void *extra, bin_tree_t *node)
3808{
3809  int idx = (int) (intptr_t) extra;
3810  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3811    node->token.opt_subexp = 1;
3812
3813  return REG_NOERROR;
3814}
3815
3816/* Free the allocated memory inside NODE. */
3817
3818static void
3819free_token (re_token_t *node)
3820{
3821#ifdef RE_ENABLE_I18N
3822  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3823    free_charset (node->opr.mbcset);
3824  else
3825#endif /* RE_ENABLE_I18N */
3826    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3827      re_free (node->opr.sbcset);
3828}
3829
3830/* Worker function for tree walking.  Free the allocated memory inside NODE
3831   and its children. */
3832
3833static reg_errcode_t
3834free_tree (void *extra, bin_tree_t *node)
3835{
3836  free_token (&node->token);
3837  return REG_NOERROR;
3838}
3839
3840
3841/* Duplicate the node SRC, and return new node.  This is a preorder
3842   visit similar to the one implemented by the generic visitor, but
3843   we need more infrastructure to maintain two parallel trees --- so,
3844   it's easier to duplicate.  */
3845
3846static bin_tree_t *
3847duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3848{
3849  const bin_tree_t *node;
3850  bin_tree_t *dup_root;
3851  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3852
3853  for (node = root; ; )
3854    {
3855      /* Create a new tree and link it back to the current parent.  */
3856      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3857      if (*p_new == NULL)
3858        return NULL;
3859      (*p_new)->parent = dup_node;
3860      (*p_new)->token.duplicated = 1;
3861      dup_node = *p_new;
3862
3863      /* Go to the left node, or up and to the right.  */
3864      if (node->left)
3865        {
3866          node = node->left;
3867          p_new = &dup_node->left;
3868        }
3869      else
3870        {
3871          const bin_tree_t *prev = NULL;
3872          while (node->right == prev || node->right == NULL)
3873            {
3874              prev = node;
3875              node = node->parent;
3876              dup_node = dup_node->parent;
3877              if (!node)
3878                return dup_root;
3879            }
3880          node = node->right;
3881          p_new = &dup_node->right;
3882        }
3883    }
3884}