compat / regex / regcomp.con commit git-fetch.txt: document fetch output (a52397c)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21#include <stdint.h>
  22
  23static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  24                                          size_t length, reg_syntax_t syntax);
  25static void re_compile_fastmap_iter (regex_t *bufp,
  26                                     const re_dfastate_t *init_state,
  27                                     char *fastmap);
  28static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  29#ifdef RE_ENABLE_I18N
  30static void free_charset (re_charset_t *cset);
  31#endif /* RE_ENABLE_I18N */
  32static void free_workarea_compile (regex_t *preg);
  33static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  34#ifdef RE_ENABLE_I18N
  35static void optimize_utf8 (re_dfa_t *dfa);
  36#endif
  37static reg_errcode_t analyze (regex_t *preg);
  38static reg_errcode_t preorder (bin_tree_t *root,
  39                               reg_errcode_t (fn (void *, bin_tree_t *)),
  40                               void *extra);
  41static reg_errcode_t postorder (bin_tree_t *root,
  42                                reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                void *extra);
  44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                 bin_tree_t *node);
  48static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  52static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  53                                   unsigned int constraint);
  54static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  55static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  56                                         int node, int root);
  57static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  58static int fetch_number (re_string_t *input, re_token_t *token,
  59                         reg_syntax_t syntax);
  60static int peek_token (re_token_t *token, re_string_t *input,
  61                        reg_syntax_t syntax) internal_function;
  62static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  63                          reg_syntax_t syntax, reg_errcode_t *err);
  64static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  68                                 re_token_t *token, reg_syntax_t syntax,
  69                                 int nest, reg_errcode_t *err);
  70static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  71                                     re_token_t *token, reg_syntax_t syntax,
  72                                     int nest, reg_errcode_t *err);
  73static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  74                                  re_token_t *token, reg_syntax_t syntax,
  75                                  int nest, reg_errcode_t *err);
  76static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  77                                 re_dfa_t *dfa, re_token_t *token,
  78                                 reg_syntax_t syntax, reg_errcode_t *err);
  79static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  80                                      re_token_t *token, reg_syntax_t syntax,
  81                                      reg_errcode_t *err);
  82static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  83                                            re_string_t *regexp,
  84                                            re_token_t *token, int token_len,
  85                                            re_dfa_t *dfa,
  86                                            reg_syntax_t syntax,
  87                                            int accept_hyphen);
  88static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  89                                          re_string_t *regexp,
  90                                          re_token_t *token);
  91#ifdef RE_ENABLE_I18N
  92static reg_errcode_t build_equiv_class (bitset_t sbcset,
  93                                        re_charset_t *mbcset,
  94                                        int *equiv_class_alloc,
  95                                        const unsigned char *name);
  96static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  97                                      bitset_t sbcset,
  98                                      re_charset_t *mbcset,
  99                                      int *char_class_alloc,
 100                                      const char *class_name,
 101                                      reg_syntax_t syntax);
 102#else  /* not RE_ENABLE_I18N */
 103static reg_errcode_t build_equiv_class (bitset_t sbcset,
 104                                        const unsigned char *name);
 105static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 106                                      bitset_t sbcset,
 107                                      const char *class_name,
 108                                      reg_syntax_t syntax);
 109#endif /* not RE_ENABLE_I18N */
 110static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 111                                       RE_TRANSLATE_TYPE trans,
 112                                       const char *class_name,
 113                                       const char *extra,
 114                                       int non_match, reg_errcode_t *err);
 115static bin_tree_t *create_tree (re_dfa_t *dfa,
 116                                bin_tree_t *left, bin_tree_t *right,
 117                                re_token_type_t type);
 118static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 119                                      bin_tree_t *left, bin_tree_t *right,
 120                                      const re_token_t *token);
 121static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 122static void free_token (re_token_t *node);
 123static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 124static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 125\f
 126/* This table gives an error message for each of the error codes listed
 127   in regex.h.  Obviously the order here has to be same as there.
 128   POSIX doesn't require that we do anything for REG_NOERROR,
 129   but why not be nice?  */
 130
 131const char __re_error_msgid[] attribute_hidden =
 132  {
 133#define REG_NOERROR_IDX 0
 134    gettext_noop ("Success")    /* REG_NOERROR */
 135    "\0"
 136#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 137    gettext_noop ("No match")   /* REG_NOMATCH */
 138    "\0"
 139#define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 140    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 141    "\0"
 142#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 143    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 144    "\0"
 145#define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 146    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 147    "\0"
 148#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 149    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 150    "\0"
 151#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 152    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 153    "\0"
 154#define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 155    gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 156    "\0"
 157#define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 158    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 159    "\0"
 160#define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 161    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 162    "\0"
 163#define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 164    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 165    "\0"
 166#define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 167    gettext_noop ("Invalid range end")  /* REG_ERANGE */
 168    "\0"
 169#define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 170    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 171    "\0"
 172#define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 173    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 174    "\0"
 175#define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 176    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 177    "\0"
 178#define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 179    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 180    "\0"
 181#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 182    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 183  };
 184
 185const size_t __re_error_msgid_idx[] attribute_hidden =
 186  {
 187    REG_NOERROR_IDX,
 188    REG_NOMATCH_IDX,
 189    REG_BADPAT_IDX,
 190    REG_ECOLLATE_IDX,
 191    REG_ECTYPE_IDX,
 192    REG_EESCAPE_IDX,
 193    REG_ESUBREG_IDX,
 194    REG_EBRACK_IDX,
 195    REG_EPAREN_IDX,
 196    REG_EBRACE_IDX,
 197    REG_BADBR_IDX,
 198    REG_ERANGE_IDX,
 199    REG_ESPACE_IDX,
 200    REG_BADRPT_IDX,
 201    REG_EEND_IDX,
 202    REG_ESIZE_IDX,
 203    REG_ERPAREN_IDX
 204  };
 205\f
 206/* Entry points for GNU code.  */
 207
 208
 209#ifdef ZOS_USS
 210
 211/* For ZOS USS we must define btowc */
 212
 213wchar_t 
 214btowc (int c)
 215{
 216   wchar_t wtmp[2];
 217   char tmp[2];
 218
 219   tmp[0] = c;
 220   tmp[1] = 0;
 221
 222   mbtowc (wtmp, tmp, 1);
 223   return wtmp[0];
 224}
 225#endif
 226
 227/* re_compile_pattern is the GNU regular expression compiler: it
 228   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 229   Returns 0 if the pattern was valid, otherwise an error string.
 230
 231   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 232   are set in BUFP on entry.  */
 233
 234const char *
 235re_compile_pattern (const char *pattern,
 236                    size_t length,
 237                    struct re_pattern_buffer *bufp)
 238{
 239  reg_errcode_t ret;
 240
 241  /* And GNU code determines whether or not to get register information
 242     by passing null for the REGS argument to re_match, etc., not by
 243     setting no_sub, unless RE_NO_SUB is set.  */
 244  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 245
 246  /* Match anchors at newline.  */
 247  bufp->newline_anchor = 1;
 248
 249  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 250
 251  if (!ret)
 252    return NULL;
 253  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 254}
 255#ifdef _LIBC
 256weak_alias (__re_compile_pattern, re_compile_pattern)
 257#endif
 258
 259/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 260   also be assigned to arbitrarily: each pattern buffer stores its own
 261   syntax, so it can be changed between regex compilations.  */
 262/* This has no initializer because initialized variables in Emacs
 263   become read-only after dumping.  */
 264reg_syntax_t re_syntax_options;
 265
 266
 267/* Specify the precise syntax of regexps for compilation.  This provides
 268   for compatibility for various utilities which historically have
 269   different, incompatible syntaxes.
 270
 271   The argument SYNTAX is a bit mask comprised of the various bits
 272   defined in regex.h.  We return the old syntax.  */
 273
 274reg_syntax_t
 275re_set_syntax (reg_syntax_t syntax)
 276{
 277  reg_syntax_t ret = re_syntax_options;
 278
 279  re_syntax_options = syntax;
 280  return ret;
 281}
 282#ifdef _LIBC
 283weak_alias (__re_set_syntax, re_set_syntax)
 284#endif
 285
 286int
 287re_compile_fastmap (struct re_pattern_buffer *bufp)
 288{
 289  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 290  char *fastmap = bufp->fastmap;
 291
 292  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 293  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 294  if (dfa->init_state != dfa->init_state_word)
 295    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 296  if (dfa->init_state != dfa->init_state_nl)
 297    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 298  if (dfa->init_state != dfa->init_state_begbuf)
 299    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 300  bufp->fastmap_accurate = 1;
 301  return 0;
 302}
 303#ifdef _LIBC
 304weak_alias (__re_compile_fastmap, re_compile_fastmap)
 305#endif
 306
 307static inline void
 308__attribute ((always_inline))
 309re_set_fastmap (char *fastmap, int icase, int ch)
 310{
 311  fastmap[ch] = 1;
 312  if (icase)
 313    fastmap[tolower (ch)] = 1;
 314}
 315
 316/* Helper function for re_compile_fastmap.
 317   Compile fastmap for the initial_state INIT_STATE.  */
 318
 319static void
 320re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 321                         char *fastmap)
 322{
 323  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 324  int node_cnt;
 325  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 326  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 327    {
 328      int node = init_state->nodes.elems[node_cnt];
 329      re_token_type_t type = dfa->nodes[node].type;
 330
 331      if (type == CHARACTER)
 332        {
 333          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 334#ifdef RE_ENABLE_I18N
 335          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 336            {
 337              unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 338              wchar_t wc;
 339              mbstate_t state;
 340
 341              p = buf;
 342              *p++ = dfa->nodes[node].opr.c;
 343              while (++node < dfa->nodes_len
 344                     && dfa->nodes[node].type == CHARACTER
 345                     && dfa->nodes[node].mb_partial)
 346                *p++ = dfa->nodes[node].opr.c;
 347              memset (&state, '\0', sizeof (state));
 348              if (__mbrtowc (&wc, (const char *) buf, p - buf,
 349                             &state) == p - buf
 350                  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 351                      != (size_t) -1))
 352                re_set_fastmap (fastmap, 0, buf[0]);
 353              re_free (buf);
 354            }
 355#endif
 356        }
 357      else if (type == SIMPLE_BRACKET)
 358        {
 359          int i, ch;
 360          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 361            {
 362              int j;
 363              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 364              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 365                if (w & ((bitset_word_t) 1 << j))
 366                  re_set_fastmap (fastmap, icase, ch);
 367            }
 368        }
 369#ifdef RE_ENABLE_I18N
 370      else if (type == COMPLEX_BRACKET)
 371        {
 372          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 373          int i;
 374
 375# ifdef _LIBC
 376          /* See if we have to try all bytes which start multiple collation
 377             elements.
 378             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 379                  collation element, and don't catch 'b' since 'b' is
 380                  the only collation element which starts from 'b' (and
 381                  it is caught by SIMPLE_BRACKET).  */
 382              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 383                  && (cset->ncoll_syms || cset->nranges))
 384                {
 385                  const int32_t *table = (const int32_t *)
 386                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 387                  for (i = 0; i < SBC_MAX; ++i)
 388                    if (table[i] < 0)
 389                      re_set_fastmap (fastmap, icase, i);
 390                }
 391# endif /* _LIBC */
 392
 393          /* See if we have to start the match at all multibyte characters,
 394             i.e. where we would not find an invalid sequence.  This only
 395             applies to multibyte character sets; for single byte character
 396             sets, the SIMPLE_BRACKET again suffices.  */
 397          if (dfa->mb_cur_max > 1
 398              && (cset->nchar_classes || cset->non_match || cset->nranges
 399# ifdef _LIBC
 400                  || cset->nequiv_classes
 401# endif /* _LIBC */
 402                 ))
 403            {
 404              unsigned char c = 0;
 405              do
 406                {
 407                  mbstate_t mbs;
 408                  memset (&mbs, 0, sizeof (mbs));
 409                  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 410                    re_set_fastmap (fastmap, false, (int) c);
 411                }
 412              while (++c != 0);
 413            }
 414
 415          else
 416            {
 417              /* ... Else catch all bytes which can start the mbchars.  */
 418              for (i = 0; i < cset->nmbchars; ++i)
 419                {
 420                  char buf[256];
 421                  mbstate_t state;
 422                  memset (&state, '\0', sizeof (state));
 423                  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 424                    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 425                  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 426                    {
 427                      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 428                          != (size_t) -1)
 429                        re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 430                    }
 431                }
 432            }
 433        }
 434#endif /* RE_ENABLE_I18N */
 435      else if (type == OP_PERIOD
 436#ifdef RE_ENABLE_I18N
 437               || type == OP_UTF8_PERIOD
 438#endif /* RE_ENABLE_I18N */
 439               || type == END_OF_RE)
 440        {
 441          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 442          if (type == END_OF_RE)
 443            bufp->can_be_null = 1;
 444          return;
 445        }
 446    }
 447}
 448\f
 449/* Entry point for POSIX code.  */
 450/* regcomp takes a regular expression as a string and compiles it.
 451
 452   PREG is a regex_t *.  We do not expect any fields to be initialized,
 453   since POSIX says we shouldn't.  Thus, we set
 454
 455     `buffer' to the compiled pattern;
 456     `used' to the length of the compiled pattern;
 457     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 458       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 459       RE_SYNTAX_POSIX_BASIC;
 460     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 461     `fastmap' to an allocated space for the fastmap;
 462     `fastmap_accurate' to zero;
 463     `re_nsub' to the number of subexpressions in PATTERN.
 464
 465   PATTERN is the address of the pattern string.
 466
 467   CFLAGS is a series of bits which affect compilation.
 468
 469     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 470     use POSIX basic syntax.
 471
 472     If REG_NEWLINE is set, then . and [^...] don't match newline.
 473     Also, regexec will try a match beginning after every newline.
 474
 475     If REG_ICASE is set, then we considers upper- and lowercase
 476     versions of letters to be equivalent when matching.
 477
 478     If REG_NOSUB is set, then when PREG is passed to regexec, that
 479     routine will report only success or failure, and nothing about the
 480     registers.
 481
 482   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 483   the return codes and their meanings.)  */
 484
 485int
 486regcomp (regex_t *__restrict preg,
 487         const char *__restrict pattern,
 488         int cflags)
 489{
 490  reg_errcode_t ret;
 491  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 492                         : RE_SYNTAX_POSIX_BASIC);
 493
 494  preg->buffer = NULL;
 495  preg->allocated = 0;
 496  preg->used = 0;
 497
 498  /* Try to allocate space for the fastmap.  */
 499  preg->fastmap = re_malloc (char, SBC_MAX);
 500  if (BE (preg->fastmap == NULL, 0))
 501    return REG_ESPACE;
 502
 503  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 504
 505  /* If REG_NEWLINE is set, newlines are treated differently.  */
 506  if (cflags & REG_NEWLINE)
 507    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 508      syntax &= ~RE_DOT_NEWLINE;
 509      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 510      /* It also changes the matching behavior.  */
 511      preg->newline_anchor = 1;
 512    }
 513  else
 514    preg->newline_anchor = 0;
 515  preg->no_sub = !!(cflags & REG_NOSUB);
 516  preg->translate = NULL;
 517
 518  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 519
 520  /* POSIX doesn't distinguish between an unmatched open-group and an
 521     unmatched close-group: both are REG_EPAREN.  */
 522  if (ret == REG_ERPAREN)
 523    ret = REG_EPAREN;
 524
 525  /* We have already checked preg->fastmap != NULL.  */
 526  if (BE (ret == REG_NOERROR, 1))
 527    /* Compute the fastmap now, since regexec cannot modify the pattern
 528       buffer.  This function never fails in this implementation.  */
 529    (void) re_compile_fastmap (preg);
 530  else
 531    {
 532      /* Some error occurred while compiling the expression.  */
 533      re_free (preg->fastmap);
 534      preg->fastmap = NULL;
 535    }
 536
 537  return (int) ret;
 538}
 539#ifdef _LIBC
 540weak_alias (__regcomp, regcomp)
 541#endif
 542
 543/* Returns a message corresponding to an error code, ERRCODE, returned
 544   from either regcomp or regexec.   We don't use PREG here.  */
 545
 546size_t
 547regerror(int errcode, const regex_t *__restrict preg,
 548         char *__restrict errbuf, size_t errbuf_size)
 549{
 550  const char *msg;
 551  size_t msg_size;
 552
 553  if (BE (errcode < 0
 554          || errcode >= (int) (sizeof (__re_error_msgid_idx)
 555                               / sizeof (__re_error_msgid_idx[0])), 0))
 556    /* Only error codes returned by the rest of the code should be passed
 557       to this routine.  If we are given anything else, or if other regex
 558       code generates an invalid error code, then the program has a bug.
 559       Dump core so we can fix it.  */
 560    abort ();
 561
 562  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 563
 564  msg_size = strlen (msg) + 1; /* Includes the null.  */
 565
 566  if (BE (errbuf_size != 0, 1))
 567    {
 568      if (BE (msg_size > errbuf_size, 0))
 569        {
 570          memcpy (errbuf, msg, errbuf_size - 1);
 571          errbuf[errbuf_size - 1] = 0;
 572        }
 573      else
 574        memcpy (errbuf, msg, msg_size);
 575    }
 576
 577  return msg_size;
 578}
 579#ifdef _LIBC
 580weak_alias (__regerror, regerror)
 581#endif
 582
 583
 584#ifdef RE_ENABLE_I18N
 585/* This static array is used for the map to single-byte characters when
 586   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 587   it the same all the time.  UTF-8 is the preferred encoding so this is
 588   a worthwhile optimization.  */
 589#if __GNUC__ >= 3
 590static const bitset_t utf8_sb_map = {
 591  /* Set the first 128 bits.  */
 592  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 593};
 594#else /* ! (__GNUC__ >= 3) */
 595static bitset_t utf8_sb_map;
 596#endif /* __GNUC__ >= 3 */
 597#endif /* RE_ENABLE_I18N */
 598
 599
 600static void
 601free_dfa_content (re_dfa_t *dfa)
 602{
 603  int i, j;
 604
 605  if (dfa->nodes)
 606    for (i = 0; i < dfa->nodes_len; ++i)
 607      free_token (dfa->nodes + i);
 608  re_free (dfa->nexts);
 609  for (i = 0; i < dfa->nodes_len; ++i)
 610    {
 611      if (dfa->eclosures != NULL)
 612        re_node_set_free (dfa->eclosures + i);
 613      if (dfa->inveclosures != NULL)
 614        re_node_set_free (dfa->inveclosures + i);
 615      if (dfa->edests != NULL)
 616        re_node_set_free (dfa->edests + i);
 617    }
 618  re_free (dfa->edests);
 619  re_free (dfa->eclosures);
 620  re_free (dfa->inveclosures);
 621  re_free (dfa->nodes);
 622
 623  if (dfa->state_table)
 624    for (i = 0; i <= dfa->state_hash_mask; ++i)
 625      {
 626        struct re_state_table_entry *entry = dfa->state_table + i;
 627        for (j = 0; j < entry->num; ++j)
 628          {
 629            re_dfastate_t *state = entry->array[j];
 630            free_state (state);
 631          }
 632        re_free (entry->array);
 633      }
 634  re_free (dfa->state_table);
 635#ifdef RE_ENABLE_I18N
 636  if (dfa->sb_char != utf8_sb_map)
 637    re_free (dfa->sb_char);
 638#endif
 639  re_free (dfa->subexp_map);
 640#ifdef DEBUG
 641  re_free (dfa->re_str);
 642#endif
 643
 644  re_free (dfa);
 645}
 646
 647
 648/* Free dynamically allocated space used by PREG.  */
 649
 650void
 651regfree (regex_t *preg)
 652{
 653  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 654  if (BE (dfa != NULL, 1))
 655    free_dfa_content (dfa);
 656  preg->buffer = NULL;
 657  preg->allocated = 0;
 658
 659  re_free (preg->fastmap);
 660  preg->fastmap = NULL;
 661
 662  re_free (preg->translate);
 663  preg->translate = NULL;
 664}
 665#ifdef _LIBC
 666weak_alias (__regfree, regfree)
 667#endif
 668\f
 669/* Entry points compatible with 4.2 BSD regex library.  We don't define
 670   them unless specifically requested.  */
 671
 672#if defined _REGEX_RE_COMP || defined _LIBC
 673
 674/* BSD has one and only one pattern buffer.  */
 675static struct re_pattern_buffer re_comp_buf;
 676
 677char *
 678# ifdef _LIBC
 679/* Make these definitions weak in libc, so POSIX programs can redefine
 680   these names if they don't use our functions, and still use
 681   regcomp/regexec above without link errors.  */
 682weak_function
 683# endif
 684re_comp (s)
 685     const char *s;
 686{
 687  reg_errcode_t ret;
 688  char *fastmap;
 689
 690  if (!s)
 691    {
 692      if (!re_comp_buf.buffer)
 693        return gettext ("No previous regular expression");
 694      return 0;
 695    }
 696
 697  if (re_comp_buf.buffer)
 698    {
 699      fastmap = re_comp_buf.fastmap;
 700      re_comp_buf.fastmap = NULL;
 701      __regfree (&re_comp_buf);
 702      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 703      re_comp_buf.fastmap = fastmap;
 704    }
 705
 706  if (re_comp_buf.fastmap == NULL)
 707    {
 708      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 709      if (re_comp_buf.fastmap == NULL)
 710        return (char *) gettext (__re_error_msgid
 711                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
 712    }
 713
 714  /* Since `re_exec' always passes NULL for the `regs' argument, we
 715     don't need to initialize the pattern buffer fields which affect it.  */
 716
 717  /* Match anchors at newlines.  */
 718  re_comp_buf.newline_anchor = 1;
 719
 720  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 721
 722  if (!ret)
 723    return NULL;
 724
 725  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 726  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 727}
 728
 729#ifdef _LIBC
 730libc_freeres_fn (free_mem)
 731{
 732  __regfree (&re_comp_buf);
 733}
 734#endif
 735
 736#endif /* _REGEX_RE_COMP */
 737\f
 738/* Internal entry point.
 739   Compile the regular expression PATTERN, whose length is LENGTH.
 740   SYNTAX indicate regular expression's syntax.  */
 741
 742static reg_errcode_t
 743re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 744                     reg_syntax_t syntax)
 745{
 746  reg_errcode_t err = REG_NOERROR;
 747  re_dfa_t *dfa;
 748  re_string_t regexp;
 749
 750  /* Initialize the pattern buffer.  */
 751  preg->fastmap_accurate = 0;
 752  preg->syntax = syntax;
 753  preg->not_bol = preg->not_eol = 0;
 754  preg->used = 0;
 755  preg->re_nsub = 0;
 756  preg->can_be_null = 0;
 757  preg->regs_allocated = REGS_UNALLOCATED;
 758
 759  /* Initialize the dfa.  */
 760  dfa = (re_dfa_t *) preg->buffer;
 761  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762    {
 763      /* If zero allocated, but buffer is non-null, try to realloc
 764         enough space.  This loses if buffer's address is bogus, but
 765         that is the user's responsibility.  If ->buffer is NULL this
 766         is a simple allocation.  */
 767      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768      if (dfa == NULL)
 769        return REG_ESPACE;
 770      preg->allocated = sizeof (re_dfa_t);
 771      preg->buffer = (unsigned char *) dfa;
 772    }
 773  preg->used = sizeof (re_dfa_t);
 774
 775  err = init_dfa (dfa, length);
 776  if (BE (err != REG_NOERROR, 0))
 777    {
 778      free_dfa_content (dfa);
 779      preg->buffer = NULL;
 780      preg->allocated = 0;
 781      return err;
 782    }
 783#ifdef DEBUG
 784  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 785  dfa->re_str = re_malloc (char, length + 1);
 786  strncpy (dfa->re_str, pattern, length + 1);
 787#endif
 788
 789  __libc_lock_init (dfa->lock);
 790
 791  err = re_string_construct (&regexp, pattern, length, preg->translate,
 792                             syntax & RE_ICASE, dfa);
 793  if (BE (err != REG_NOERROR, 0))
 794    {
 795    re_compile_internal_free_return:
 796      free_workarea_compile (preg);
 797      re_string_destruct (&regexp);
 798      free_dfa_content (dfa);
 799      preg->buffer = NULL;
 800      preg->allocated = 0;
 801      return err;
 802    }
 803
 804  /* Parse the regular expression, and build a structure tree.  */
 805  preg->re_nsub = 0;
 806  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 807  if (BE (dfa->str_tree == NULL, 0))
 808    goto re_compile_internal_free_return;
 809
 810  /* Analyze the tree and create the nfa.  */
 811  err = analyze (preg);
 812  if (BE (err != REG_NOERROR, 0))
 813    goto re_compile_internal_free_return;
 814
 815#ifdef RE_ENABLE_I18N
 816  /* If possible, do searching in single byte encoding to speed things up.  */
 817  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 818    optimize_utf8 (dfa);
 819#endif
 820
 821  /* Then create the initial state of the dfa.  */
 822  err = create_initial_state (dfa);
 823
 824  /* Release work areas.  */
 825  free_workarea_compile (preg);
 826  re_string_destruct (&regexp);
 827
 828  if (BE (err != REG_NOERROR, 0))
 829    {
 830      free_dfa_content (dfa);
 831      preg->buffer = NULL;
 832      preg->allocated = 0;
 833    }
 834
 835  return err;
 836}
 837
 838/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 839   as the initial length of some arrays.  */
 840
 841static reg_errcode_t
 842init_dfa (re_dfa_t *dfa, size_t pat_len)
 843{
 844  unsigned int table_size;
 845#ifndef _LIBC
 846  char *codeset_name;
 847#endif
 848
 849  memset (dfa, '\0', sizeof (re_dfa_t));
 850
 851  /* Force allocation of str_tree_storage the first time.  */
 852  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 853
 854  /* Avoid overflows.  */
 855  if (pat_len == SIZE_MAX)
 856    return REG_ESPACE;
 857
 858  dfa->nodes_alloc = pat_len + 1;
 859  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 860
 861  /*  table_size = 2 ^ ceil(log pat_len) */
 862  for (table_size = 1; ; table_size <<= 1)
 863    if (table_size > pat_len)
 864      break;
 865
 866  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867  dfa->state_hash_mask = table_size - 1;
 868
 869  dfa->mb_cur_max = MB_CUR_MAX;
 870#ifdef _LIBC
 871  if (dfa->mb_cur_max == 6
 872      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873    dfa->is_utf8 = 1;
 874  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                       != 0);
 876#else
 877# ifdef HAVE_LANGINFO_CODESET
 878  codeset_name = nl_langinfo (CODESET);
 879# else
 880  codeset_name = getenv ("LC_ALL");
 881  if (codeset_name == NULL || codeset_name[0] == '\0')
 882    codeset_name = getenv ("LC_CTYPE");
 883  if (codeset_name == NULL || codeset_name[0] == '\0')
 884    codeset_name = getenv ("LANG");
 885  if (codeset_name == NULL)
 886    codeset_name = "";
 887  else if (strchr (codeset_name, '.') !=  NULL)
 888    codeset_name = strchr (codeset_name, '.') + 1;
 889# endif
 890
 891  /* strcasecmp isn't a standard interface. brute force check */
 892#if 0
 893  if (strcasecmp (codeset_name, "UTF-8") == 0
 894      || strcasecmp (codeset_name, "UTF8") == 0)
 895    dfa->is_utf8 = 1;
 896#else
 897  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 898      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 899      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 900      && (codeset_name[3] == '-'
 901          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 902          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 903    dfa->is_utf8 = 1;
 904#endif
 905
 906  /* We check exhaustively in the loop below if this charset is a
 907     superset of ASCII.  */
 908  dfa->map_notascii = 0;
 909#endif
 910
 911#ifdef RE_ENABLE_I18N
 912  if (dfa->mb_cur_max > 1)
 913    {
 914      if (dfa->is_utf8)
 915        {
 916#if !defined(__GNUC__) || __GNUC__ < 3
 917          static short utf8_sb_map_inited = 0;
 918
 919          if (! utf8_sb_map_inited)
 920            {
 921                int i;
 922
 923                utf8_sb_map_inited = 0;
 924                for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 925                  utf8_sb_map[i] = BITSET_WORD_MAX;
 926            }
 927#endif
 928          dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 929        }
 930      else
 931        {
 932          int i, j, ch;
 933
 934          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 935          if (BE (dfa->sb_char == NULL, 0))
 936            return REG_ESPACE;
 937
 938          /* Set the bits corresponding to single byte chars.  */
 939          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 940            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 941              {
 942                wint_t wch = __btowc (ch);
 943                if (wch != WEOF)
 944                  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 945# ifndef _LIBC
 946                if (isascii (ch) && wch != ch)
 947                  dfa->map_notascii = 1;
 948# endif
 949              }
 950        }
 951    }
 952#endif
 953
 954  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 955    return REG_ESPACE;
 956  return REG_NOERROR;
 957}
 958
 959/* Initialize WORD_CHAR table, which indicate which character is
 960   "word".  In this case "word" means that it is the word construction
 961   character used by some operators like "\<", "\>", etc.  */
 962
 963static void
 964internal_function
 965init_word_char (re_dfa_t *dfa)
 966{
 967  int i, j, ch;
 968  dfa->word_ops_used = 1;
 969  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 970    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 971      if (isalnum (ch) || ch == '_')
 972        dfa->word_char[i] |= (bitset_word_t) 1 << j;
 973}
 974
 975/* Free the work area which are only used while compiling.  */
 976
 977static void
 978free_workarea_compile (regex_t *preg)
 979{
 980  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 981  bin_tree_storage_t *storage, *next;
 982  for (storage = dfa->str_tree_storage; storage; storage = next)
 983    {
 984      next = storage->next;
 985      re_free (storage);
 986    }
 987  dfa->str_tree_storage = NULL;
 988  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 989  dfa->str_tree = NULL;
 990  re_free (dfa->org_indices);
 991  dfa->org_indices = NULL;
 992}
 993
 994/* Create initial states for all contexts.  */
 995
 996static reg_errcode_t
 997create_initial_state (re_dfa_t *dfa)
 998{
 999  int first, i;
1000  reg_errcode_t err;
1001  re_node_set init_nodes;
1002
1003  /* Initial states have the epsilon closure of the node which is
1004     the first node of the regular expression.  */
1005  first = dfa->str_tree->first->node_idx;
1006  dfa->init_node = first;
1007  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008  if (BE (err != REG_NOERROR, 0))
1009    return err;
1010
1011  /* The back-references which are in initial states can epsilon transit,
1012     since in this case all of the subexpressions can be null.
1013     Then we add epsilon closures of the nodes which are the next nodes of
1014     the back-references.  */
1015  if (dfa->nbackref > 0)
1016    for (i = 0; i < init_nodes.nelem; ++i)
1017      {
1018        int node_idx = init_nodes.elems[i];
1019        re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021        int clexp_idx;
1022        if (type != OP_BACK_REF)
1023          continue;
1024        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025          {
1026            re_token_t *clexp_node;
1027            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028            if (clexp_node->type == OP_CLOSE_SUBEXP
1029                && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030              break;
1031          }
1032        if (clexp_idx == init_nodes.nelem)
1033          continue;
1034
1035        if (type == OP_BACK_REF)
1036          {
1037            int dest_idx = dfa->edests[node_idx].elems[0];
1038            if (!re_node_set_contains (&init_nodes, dest_idx))
1039              {
1040                reg_errcode_t err = re_node_set_merge (&init_nodes,
1041                                                       dfa->eclosures
1042                                                       + dest_idx);
1043                if (err != REG_NOERROR)
1044                  return err;
1045                i = 0;
1046              }
1047          }
1048      }
1049
1050  /* It must be the first time to invoke acquire_state.  */
1051  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1052  /* We don't check ERR here, since the initial state must not be NULL.  */
1053  if (BE (dfa->init_state == NULL, 0))
1054    return err;
1055  if (dfa->init_state->has_constraint)
1056    {
1057      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058                                                       CONTEXT_WORD);
1059      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060                                                     CONTEXT_NEWLINE);
1061      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1062                                                         &init_nodes,
1063                                                         CONTEXT_NEWLINE
1064                                                         | CONTEXT_BEGBUF);
1065      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1066              || dfa->init_state_begbuf == NULL, 0))
1067        return err;
1068    }
1069  else
1070    dfa->init_state_word = dfa->init_state_nl
1071      = dfa->init_state_begbuf = dfa->init_state;
1072
1073  re_node_set_free (&init_nodes);
1074  return REG_NOERROR;
1075}
1076\f
1077#ifdef RE_ENABLE_I18N
1078/* If it is possible to do searching in single byte encoding instead of UTF-8
1079   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080   DFA nodes where needed.  */
1081
1082static void
1083optimize_utf8 (re_dfa_t *dfa)
1084{
1085  int node, i, mb_chars = 0, has_period = 0;
1086
1087  for (node = 0; node < dfa->nodes_len; ++node)
1088    switch (dfa->nodes[node].type)
1089      {
1090      case CHARACTER:
1091        if (dfa->nodes[node].opr.c >= 0x80)
1092          mb_chars = 1;
1093        break;
1094      case ANCHOR:
1095        switch (dfa->nodes[node].opr.ctx_type)
1096          {
1097          case LINE_FIRST:
1098          case LINE_LAST:
1099          case BUF_FIRST:
1100          case BUF_LAST:
1101            break;
1102          default:
1103            /* Word anchors etc. cannot be handled.  It's okay to test
1104               opr.ctx_type since constraints (for all DFA nodes) are
1105               created by ORing one or more opr.ctx_type values.  */
1106            return;
1107          }
1108        break;
1109      case OP_PERIOD:
1110        has_period = 1;
1111        break;
1112      case OP_BACK_REF:
1113      case OP_ALT:
1114      case END_OF_RE:
1115      case OP_DUP_ASTERISK:
1116      case OP_OPEN_SUBEXP:
1117      case OP_CLOSE_SUBEXP:
1118        break;
1119      case COMPLEX_BRACKET:
1120        return;
1121      case SIMPLE_BRACKET:
1122        /* Just double check.  The non-ASCII range starts at 0x80.  */
1123        assert (0x80 % BITSET_WORD_BITS == 0);
1124        for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1125          if (dfa->nodes[node].opr.sbcset[i])
1126            return;
1127        break;
1128      default:
1129        abort ();
1130      }
1131
1132  if (mb_chars || has_period)
1133    for (node = 0; node < dfa->nodes_len; ++node)
1134      {
1135        if (dfa->nodes[node].type == CHARACTER
1136            && dfa->nodes[node].opr.c >= 0x80)
1137          dfa->nodes[node].mb_partial = 0;
1138        else if (dfa->nodes[node].type == OP_PERIOD)
1139          dfa->nodes[node].type = OP_UTF8_PERIOD;
1140      }
1141
1142  /* The search can be in single byte locale.  */
1143  dfa->mb_cur_max = 1;
1144  dfa->is_utf8 = 0;
1145  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1146}
1147#endif
1148\f
1149/* Analyze the structure tree, and calculate "first", "next", "edest",
1150   "eclosure", and "inveclosure".  */
1151
1152static reg_errcode_t
1153analyze (regex_t *preg)
1154{
1155  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1156  reg_errcode_t ret;
1157
1158  /* Allocate arrays.  */
1159  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1160  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1161  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1162  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1163  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1164          || dfa->eclosures == NULL, 0))
1165    return REG_ESPACE;
1166
1167  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1168  if (dfa->subexp_map != NULL)
1169    {
1170      int i;
1171      for (i = 0; i < preg->re_nsub; i++)
1172        dfa->subexp_map[i] = i;
1173      preorder (dfa->str_tree, optimize_subexps, dfa);
1174      for (i = 0; i < preg->re_nsub; i++)
1175        if (dfa->subexp_map[i] != i)
1176          break;
1177      if (i == preg->re_nsub)
1178        {
1179          free (dfa->subexp_map);
1180          dfa->subexp_map = NULL;
1181        }
1182    }
1183
1184  ret = postorder (dfa->str_tree, lower_subexps, preg);
1185  if (BE (ret != REG_NOERROR, 0))
1186    return ret;
1187  ret = postorder (dfa->str_tree, calc_first, dfa);
1188  if (BE (ret != REG_NOERROR, 0))
1189    return ret;
1190  preorder (dfa->str_tree, calc_next, dfa);
1191  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1192  if (BE (ret != REG_NOERROR, 0))
1193    return ret;
1194  ret = calc_eclosure (dfa);
1195  if (BE (ret != REG_NOERROR, 0))
1196    return ret;
1197
1198  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1199     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1200  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1201      || dfa->nbackref)
1202    {
1203      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1204      if (BE (dfa->inveclosures == NULL, 0))
1205        return REG_ESPACE;
1206      ret = calc_inveclosure (dfa);
1207    }
1208
1209  return ret;
1210}
1211
1212/* Our parse trees are very unbalanced, so we cannot use a stack to
1213   implement parse tree visits.  Instead, we use parent pointers and
1214   some hairy code in these two functions.  */
1215static reg_errcode_t
1216postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217           void *extra)
1218{
1219  bin_tree_t *node, *prev;
1220
1221  for (node = root; ; )
1222    {
1223      /* Descend down the tree, preferably to the left (or to the right
1224         if that's the only child).  */
1225      while (node->left || node->right)
1226        if (node->left)
1227          node = node->left;
1228        else
1229          node = node->right;
1230
1231      do
1232        {
1233          reg_errcode_t err = fn (extra, node);
1234          if (BE (err != REG_NOERROR, 0))
1235            return err;
1236          if (node->parent == NULL)
1237            return REG_NOERROR;
1238          prev = node;
1239          node = node->parent;
1240        }
1241      /* Go up while we have a node that is reached from the right.  */
1242      while (node->right == prev || node->right == NULL);
1243      node = node->right;
1244    }
1245}
1246
1247static reg_errcode_t
1248preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249          void *extra)
1250{
1251  bin_tree_t *node;
1252
1253  for (node = root; ; )
1254    {
1255      reg_errcode_t err = fn (extra, node);
1256      if (BE (err != REG_NOERROR, 0))
1257        return err;
1258
1259      /* Go to the left node, or up and to the right.  */
1260      if (node->left)
1261        node = node->left;
1262      else
1263        {
1264          bin_tree_t *prev = NULL;
1265          while (node->right == prev || node->right == NULL)
1266            {
1267              prev = node;
1268              node = node->parent;
1269              if (!node)
1270                return REG_NOERROR;
1271            }
1272          node = node->right;
1273        }
1274    }
1275}
1276
1277/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1278   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1279   backreferences as well.  Requires a preorder visit.  */
1280static reg_errcode_t
1281optimize_subexps (void *extra, bin_tree_t *node)
1282{
1283  re_dfa_t *dfa = (re_dfa_t *) extra;
1284
1285  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1286    {
1287      int idx = node->token.opr.idx;
1288      node->token.opr.idx = dfa->subexp_map[idx];
1289      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1290    }
1291
1292  else if (node->token.type == SUBEXP
1293           && node->left && node->left->token.type == SUBEXP)
1294    {
1295      int other_idx = node->left->token.opr.idx;
1296
1297      node->left = node->left->left;
1298      if (node->left)
1299        node->left->parent = node;
1300
1301      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1302      if (other_idx < BITSET_WORD_BITS)
1303          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1304    }
1305
1306  return REG_NOERROR;
1307}
1308
1309/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1310   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1311static reg_errcode_t
1312lower_subexps (void *extra, bin_tree_t *node)
1313{
1314  regex_t *preg = (regex_t *) extra;
1315  reg_errcode_t err = REG_NOERROR;
1316
1317  if (node->left && node->left->token.type == SUBEXP)
1318    {
1319      node->left = lower_subexp (&err, preg, node->left);
1320      if (node->left)
1321        node->left->parent = node;
1322    }
1323  if (node->right && node->right->token.type == SUBEXP)
1324    {
1325      node->right = lower_subexp (&err, preg, node->right);
1326      if (node->right)
1327        node->right->parent = node;
1328    }
1329
1330  return err;
1331}
1332
1333static bin_tree_t *
1334lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1335{
1336  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1337  bin_tree_t *body = node->left;
1338  bin_tree_t *op, *cls, *tree1, *tree;
1339
1340  if (preg->no_sub
1341      /* We do not optimize empty subexpressions, because otherwise we may
1342         have bad CONCAT nodes with NULL children.  This is obviously not
1343         very common, so we do not lose much.  An example that triggers
1344         this case is the sed "script" /\(\)/x.  */
1345      && node->left != NULL
1346      && (node->token.opr.idx >= BITSET_WORD_BITS
1347          || !(dfa->used_bkref_map
1348               & ((bitset_word_t) 1 << node->token.opr.idx))))
1349    return node->left;
1350
1351  /* Convert the SUBEXP node to the concatenation of an
1352     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1353  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1354  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1355  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1356  tree = create_tree (dfa, op, tree1, CONCAT);
1357  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1358    {
1359      *err = REG_ESPACE;
1360      return NULL;
1361    }
1362
1363  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1364  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1365  return tree;
1366}
1367
1368/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1369   nodes.  Requires a postorder visit.  */
1370static reg_errcode_t
1371calc_first (void *extra, bin_tree_t *node)
1372{
1373  re_dfa_t *dfa = (re_dfa_t *) extra;
1374  if (node->token.type == CONCAT)
1375    {
1376      node->first = node->left->first;
1377      node->node_idx = node->left->node_idx;
1378    }
1379  else
1380    {
1381      node->first = node;
1382      node->node_idx = re_dfa_add_node (dfa, node->token);
1383      if (BE (node->node_idx == -1, 0))
1384        return REG_ESPACE;
1385      if (node->token.type == ANCHOR)
1386        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1387    }
1388  return REG_NOERROR;
1389}
1390
1391/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1392static reg_errcode_t
1393calc_next (void *extra, bin_tree_t *node)
1394{
1395  switch (node->token.type)
1396    {
1397    case OP_DUP_ASTERISK:
1398      node->left->next = node;
1399      break;
1400    case CONCAT:
1401      node->left->next = node->right->first;
1402      node->right->next = node->next;
1403      break;
1404    default:
1405      if (node->left)
1406        node->left->next = node->next;
1407      if (node->right)
1408        node->right->next = node->next;
1409      break;
1410    }
1411  return REG_NOERROR;
1412}
1413
1414/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1415static reg_errcode_t
1416link_nfa_nodes (void *extra, bin_tree_t *node)
1417{
1418  re_dfa_t *dfa = (re_dfa_t *) extra;
1419  int idx = node->node_idx;
1420  reg_errcode_t err = REG_NOERROR;
1421
1422  switch (node->token.type)
1423    {
1424    case CONCAT:
1425      break;
1426
1427    case END_OF_RE:
1428      assert (node->next == NULL);
1429      break;
1430
1431    case OP_DUP_ASTERISK:
1432    case OP_ALT:
1433      {
1434        int left, right;
1435        dfa->has_plural_match = 1;
1436        if (node->left != NULL)
1437          left = node->left->first->node_idx;
1438        else
1439          left = node->next->node_idx;
1440        if (node->right != NULL)
1441          right = node->right->first->node_idx;
1442        else
1443          right = node->next->node_idx;
1444        assert (left > -1);
1445        assert (right > -1);
1446        err = re_node_set_init_2 (dfa->edests + idx, left, right);
1447      }
1448      break;
1449
1450    case ANCHOR:
1451    case OP_OPEN_SUBEXP:
1452    case OP_CLOSE_SUBEXP:
1453      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1454      break;
1455
1456    case OP_BACK_REF:
1457      dfa->nexts[idx] = node->next->node_idx;
1458      if (node->token.type == OP_BACK_REF)
1459        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1460      break;
1461
1462    default:
1463      assert (!IS_EPSILON_NODE (node->token.type));
1464      dfa->nexts[idx] = node->next->node_idx;
1465      break;
1466    }
1467
1468  return err;
1469}
1470
1471/* Duplicate the epsilon closure of the node ROOT_NODE.
1472   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1473   to their own constraint.  */
1474
1475static reg_errcode_t
1476internal_function
1477duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1478                        int root_node, unsigned int init_constraint)
1479{
1480  int org_node, clone_node, ret;
1481  unsigned int constraint = init_constraint;
1482  for (org_node = top_org_node, clone_node = top_clone_node;;)
1483    {
1484      int org_dest, clone_dest;
1485      if (dfa->nodes[org_node].type == OP_BACK_REF)
1486        {
1487          /* If the back reference epsilon-transit, its destination must
1488             also have the constraint.  Then duplicate the epsilon closure
1489             of the destination of the back reference, and store it in
1490             edests of the back reference.  */
1491          org_dest = dfa->nexts[org_node];
1492          re_node_set_empty (dfa->edests + clone_node);
1493          clone_dest = duplicate_node (dfa, org_dest, constraint);
1494          if (BE (clone_dest == -1, 0))
1495            return REG_ESPACE;
1496          dfa->nexts[clone_node] = dfa->nexts[org_node];
1497          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498          if (BE (ret < 0, 0))
1499            return REG_ESPACE;
1500        }
1501      else if (dfa->edests[org_node].nelem == 0)
1502        {
1503          /* In case of the node can't epsilon-transit, don't duplicate the
1504             destination and store the original destination as the
1505             destination of the node.  */
1506          dfa->nexts[clone_node] = dfa->nexts[org_node];
1507          break;
1508        }
1509      else if (dfa->edests[org_node].nelem == 1)
1510        {
1511          /* In case of the node can epsilon-transit, and it has only one
1512             destination.  */
1513          org_dest = dfa->edests[org_node].elems[0];
1514          re_node_set_empty (dfa->edests + clone_node);
1515          /* If the node is root_node itself, it means the epsilon clsoure
1516             has a loop.   Then tie it to the destination of the root_node.  */
1517          if (org_node == root_node && clone_node != org_node)
1518            {
1519              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1520              if (BE (ret < 0, 0))
1521                return REG_ESPACE;
1522              break;
1523            }
1524          /* In case of the node has another constraint, add it.  */
1525          constraint |= dfa->nodes[org_node].constraint;
1526          clone_dest = duplicate_node (dfa, org_dest, constraint);
1527          if (BE (clone_dest == -1, 0))
1528            return REG_ESPACE;
1529          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1530          if (BE (ret < 0, 0))
1531            return REG_ESPACE;
1532        }
1533      else /* dfa->edests[org_node].nelem == 2 */
1534        {
1535          /* In case of the node can epsilon-transit, and it has two
1536             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1537          org_dest = dfa->edests[org_node].elems[0];
1538          re_node_set_empty (dfa->edests + clone_node);
1539          /* Search for a duplicated node which satisfies the constraint.  */
1540          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1541          if (clone_dest == -1)
1542            {
1543              /* There is no such duplicated node, create a new one.  */
1544              reg_errcode_t err;
1545              clone_dest = duplicate_node (dfa, org_dest, constraint);
1546              if (BE (clone_dest == -1, 0))
1547                return REG_ESPACE;
1548              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549              if (BE (ret < 0, 0))
1550                return REG_ESPACE;
1551              err = duplicate_node_closure (dfa, org_dest, clone_dest,
1552                                            root_node, constraint);
1553              if (BE (err != REG_NOERROR, 0))
1554                return err;
1555            }
1556          else
1557            {
1558              /* There is a duplicated node which satisfies the constraint,
1559                 use it to avoid infinite loop.  */
1560              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561              if (BE (ret < 0, 0))
1562                return REG_ESPACE;
1563            }
1564
1565          org_dest = dfa->edests[org_node].elems[1];
1566          clone_dest = duplicate_node (dfa, org_dest, constraint);
1567          if (BE (clone_dest == -1, 0))
1568            return REG_ESPACE;
1569          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570          if (BE (ret < 0, 0))
1571            return REG_ESPACE;
1572        }
1573      org_node = org_dest;
1574      clone_node = clone_dest;
1575    }
1576  return REG_NOERROR;
1577}
1578
1579/* Search for a node which is duplicated from the node ORG_NODE, and
1580   satisfies the constraint CONSTRAINT.  */
1581
1582static int
1583search_duplicated_node (const re_dfa_t *dfa, int org_node,
1584                        unsigned int constraint)
1585{
1586  int idx;
1587  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588    {
1589      if (org_node == dfa->org_indices[idx]
1590          && constraint == dfa->nodes[idx].constraint)
1591        return idx; /* Found.  */
1592    }
1593  return -1; /* Not found.  */
1594}
1595
1596/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1597   Return the index of the new node, or -1 if insufficient storage is
1598   available.  */
1599
1600static int
1601duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1602{
1603  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604  if (BE (dup_idx != -1, 1))
1605    {
1606      dfa->nodes[dup_idx].constraint = constraint;
1607      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1608      dfa->nodes[dup_idx].duplicated = 1;
1609
1610      /* Store the index of the original node.  */
1611      dfa->org_indices[dup_idx] = org_idx;
1612    }
1613  return dup_idx;
1614}
1615
1616static reg_errcode_t
1617calc_inveclosure (re_dfa_t *dfa)
1618{
1619  int src, idx, ret;
1620  for (idx = 0; idx < dfa->nodes_len; ++idx)
1621    re_node_set_init_empty (dfa->inveclosures + idx);
1622
1623  for (src = 0; src < dfa->nodes_len; ++src)
1624    {
1625      int *elems = dfa->eclosures[src].elems;
1626      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1627        {
1628          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1629          if (BE (ret == -1, 0))
1630            return REG_ESPACE;
1631        }
1632    }
1633
1634  return REG_NOERROR;
1635}
1636
1637/* Calculate "eclosure" for all the node in DFA.  */
1638
1639static reg_errcode_t
1640calc_eclosure (re_dfa_t *dfa)
1641{
1642  int node_idx, incomplete;
1643#ifdef DEBUG
1644  assert (dfa->nodes_len > 0);
1645#endif
1646  incomplete = 0;
1647  /* For each nodes, calculate epsilon closure.  */
1648  for (node_idx = 0; ; ++node_idx)
1649    {
1650      reg_errcode_t err;
1651      re_node_set eclosure_elem;
1652      if (node_idx == dfa->nodes_len)
1653        {
1654          if (!incomplete)
1655            break;
1656          incomplete = 0;
1657          node_idx = 0;
1658        }
1659
1660#ifdef DEBUG
1661      assert (dfa->eclosures[node_idx].nelem != -1);
1662#endif
1663
1664      /* If we have already calculated, skip it.  */
1665      if (dfa->eclosures[node_idx].nelem != 0)
1666        continue;
1667      /* Calculate epsilon closure of `node_idx'.  */
1668      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1669      if (BE (err != REG_NOERROR, 0))
1670        return err;
1671
1672      if (dfa->eclosures[node_idx].nelem == 0)
1673        {
1674          incomplete = 1;
1675          re_node_set_free (&eclosure_elem);
1676        }
1677    }
1678  return REG_NOERROR;
1679}
1680
1681/* Calculate epsilon closure of NODE.  */
1682
1683static reg_errcode_t
1684calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1685{
1686  reg_errcode_t err;
1687  int i;
1688  re_node_set eclosure;
1689  int ret;
1690  int incomplete = 0;
1691  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1692  if (BE (err != REG_NOERROR, 0))
1693    return err;
1694
1695  /* This indicates that we are calculating this node now.
1696     We reference this value to avoid infinite loop.  */
1697  dfa->eclosures[node].nelem = -1;
1698
1699  /* If the current node has constraints, duplicate all nodes
1700     since they must inherit the constraints.  */
1701  if (dfa->nodes[node].constraint
1702      && dfa->edests[node].nelem
1703      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1704    {
1705      err = duplicate_node_closure (dfa, node, node, node,
1706                                    dfa->nodes[node].constraint);
1707      if (BE (err != REG_NOERROR, 0))
1708        return err;
1709    }
1710
1711  /* Expand each epsilon destination nodes.  */
1712  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713    for (i = 0; i < dfa->edests[node].nelem; ++i)
1714      {
1715        re_node_set eclosure_elem;
1716        int edest = dfa->edests[node].elems[i];
1717        /* If calculating the epsilon closure of `edest' is in progress,
1718           return intermediate result.  */
1719        if (dfa->eclosures[edest].nelem == -1)
1720          {
1721            incomplete = 1;
1722            continue;
1723          }
1724        /* If we haven't calculated the epsilon closure of `edest' yet,
1725           calculate now. Otherwise use calculated epsilon closure.  */
1726        if (dfa->eclosures[edest].nelem == 0)
1727          {
1728            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1729            if (BE (err != REG_NOERROR, 0))
1730              return err;
1731          }
1732        else
1733          eclosure_elem = dfa->eclosures[edest];
1734        /* Merge the epsilon closure of `edest'.  */
1735        err = re_node_set_merge (&eclosure, &eclosure_elem);
1736        if (BE (err != REG_NOERROR, 0))
1737          return err;
1738        /* If the epsilon closure of `edest' is incomplete,
1739           the epsilon closure of this node is also incomplete.  */
1740        if (dfa->eclosures[edest].nelem == 0)
1741          {
1742            incomplete = 1;
1743            re_node_set_free (&eclosure_elem);
1744          }
1745      }
1746
1747  /* An epsilon closure includes itself.  */
1748  ret = re_node_set_insert (&eclosure, node);
1749  if (BE (ret < 0, 0))
1750    return REG_ESPACE;
1751  if (incomplete && !root)
1752    dfa->eclosures[node].nelem = 0;
1753  else
1754    dfa->eclosures[node] = eclosure;
1755  *new_set = eclosure;
1756  return REG_NOERROR;
1757}
1758\f
1759/* Functions for token which are used in the parser.  */
1760
1761/* Fetch a token from INPUT.
1762   We must not use this function inside bracket expressions.  */
1763
1764static void
1765internal_function
1766fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1767{
1768  re_string_skip_bytes (input, peek_token (result, input, syntax));
1769}
1770
1771/* Peek a token from INPUT, and return the length of the token.
1772   We must not use this function inside bracket expressions.  */
1773
1774static int
1775internal_function
1776peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1777{
1778  unsigned char c;
1779
1780  if (re_string_eoi (input))
1781    {
1782      token->type = END_OF_RE;
1783      return 0;
1784    }
1785
1786  c = re_string_peek_byte (input, 0);
1787  token->opr.c = c;
1788
1789  token->word_char = 0;
1790#ifdef RE_ENABLE_I18N
1791  token->mb_partial = 0;
1792  if (input->mb_cur_max > 1 &&
1793      !re_string_first_byte (input, re_string_cur_idx (input)))
1794    {
1795      token->type = CHARACTER;
1796      token->mb_partial = 1;
1797      return 1;
1798    }
1799#endif
1800  if (c == '\\')
1801    {
1802      unsigned char c2;
1803      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1804        {
1805          token->type = BACK_SLASH;
1806          return 1;
1807        }
1808
1809      c2 = re_string_peek_byte_case (input, 1);
1810      token->opr.c = c2;
1811      token->type = CHARACTER;
1812#ifdef RE_ENABLE_I18N
1813      if (input->mb_cur_max > 1)
1814        {
1815          wint_t wc = re_string_wchar_at (input,
1816                                          re_string_cur_idx (input) + 1);
1817          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1818        }
1819      else
1820#endif
1821        token->word_char = IS_WORD_CHAR (c2) != 0;
1822
1823      switch (c2)
1824        {
1825        case '|':
1826          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1827            token->type = OP_ALT;
1828          break;
1829        case '1': case '2': case '3': case '4': case '5':
1830        case '6': case '7': case '8': case '9':
1831          if (!(syntax & RE_NO_BK_REFS))
1832            {
1833              token->type = OP_BACK_REF;
1834              token->opr.idx = c2 - '1';
1835            }
1836          break;
1837        case '<':
1838          if (!(syntax & RE_NO_GNU_OPS))
1839            {
1840              token->type = ANCHOR;
1841              token->opr.ctx_type = WORD_FIRST;
1842            }
1843          break;
1844        case '>':
1845          if (!(syntax & RE_NO_GNU_OPS))
1846            {
1847              token->type = ANCHOR;
1848              token->opr.ctx_type = WORD_LAST;
1849            }
1850          break;
1851        case 'b':
1852          if (!(syntax & RE_NO_GNU_OPS))
1853            {
1854              token->type = ANCHOR;
1855              token->opr.ctx_type = WORD_DELIM;
1856            }
1857          break;
1858        case 'B':
1859          if (!(syntax & RE_NO_GNU_OPS))
1860            {
1861              token->type = ANCHOR;
1862              token->opr.ctx_type = NOT_WORD_DELIM;
1863            }
1864          break;
1865        case 'w':
1866          if (!(syntax & RE_NO_GNU_OPS))
1867            token->type = OP_WORD;
1868          break;
1869        case 'W':
1870          if (!(syntax & RE_NO_GNU_OPS))
1871            token->type = OP_NOTWORD;
1872          break;
1873        case 's':
1874          if (!(syntax & RE_NO_GNU_OPS))
1875            token->type = OP_SPACE;
1876          break;
1877        case 'S':
1878          if (!(syntax & RE_NO_GNU_OPS))
1879            token->type = OP_NOTSPACE;
1880          break;
1881        case '`':
1882          if (!(syntax & RE_NO_GNU_OPS))
1883            {
1884              token->type = ANCHOR;
1885              token->opr.ctx_type = BUF_FIRST;
1886            }
1887          break;
1888        case '\'':
1889          if (!(syntax & RE_NO_GNU_OPS))
1890            {
1891              token->type = ANCHOR;
1892              token->opr.ctx_type = BUF_LAST;
1893            }
1894          break;
1895        case '(':
1896          if (!(syntax & RE_NO_BK_PARENS))
1897            token->type = OP_OPEN_SUBEXP;
1898          break;
1899        case ')':
1900          if (!(syntax & RE_NO_BK_PARENS))
1901            token->type = OP_CLOSE_SUBEXP;
1902          break;
1903        case '+':
1904          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905            token->type = OP_DUP_PLUS;
1906          break;
1907        case '?':
1908          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1909            token->type = OP_DUP_QUESTION;
1910          break;
1911        case '{':
1912          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913            token->type = OP_OPEN_DUP_NUM;
1914          break;
1915        case '}':
1916          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1917            token->type = OP_CLOSE_DUP_NUM;
1918          break;
1919        default:
1920          break;
1921        }
1922      return 2;
1923    }
1924
1925  token->type = CHARACTER;
1926#ifdef RE_ENABLE_I18N
1927  if (input->mb_cur_max > 1)
1928    {
1929      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1930      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1931    }
1932  else
1933#endif
1934    token->word_char = IS_WORD_CHAR (token->opr.c);
1935
1936  switch (c)
1937    {
1938    case '\n':
1939      if (syntax & RE_NEWLINE_ALT)
1940        token->type = OP_ALT;
1941      break;
1942    case '|':
1943      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1944        token->type = OP_ALT;
1945      break;
1946    case '*':
1947      token->type = OP_DUP_ASTERISK;
1948      break;
1949    case '+':
1950      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951        token->type = OP_DUP_PLUS;
1952      break;
1953    case '?':
1954      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1955        token->type = OP_DUP_QUESTION;
1956      break;
1957    case '{':
1958      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959        token->type = OP_OPEN_DUP_NUM;
1960      break;
1961    case '}':
1962      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1963        token->type = OP_CLOSE_DUP_NUM;
1964      break;
1965    case '(':
1966      if (syntax & RE_NO_BK_PARENS)
1967        token->type = OP_OPEN_SUBEXP;
1968      break;
1969    case ')':
1970      if (syntax & RE_NO_BK_PARENS)
1971        token->type = OP_CLOSE_SUBEXP;
1972      break;
1973    case '[':
1974      token->type = OP_OPEN_BRACKET;
1975      break;
1976    case '.':
1977      token->type = OP_PERIOD;
1978      break;
1979    case '^':
1980      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1981          re_string_cur_idx (input) != 0)
1982        {
1983          char prev = re_string_peek_byte (input, -1);
1984          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985            break;
1986        }
1987      token->type = ANCHOR;
1988      token->opr.ctx_type = LINE_FIRST;
1989      break;
1990    case '$':
1991      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1992          re_string_cur_idx (input) + 1 != re_string_length (input))
1993        {
1994          re_token_t next;
1995          re_string_skip_bytes (input, 1);
1996          peek_token (&next, input, syntax);
1997          re_string_skip_bytes (input, -1);
1998          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999            break;
2000        }
2001      token->type = ANCHOR;
2002      token->opr.ctx_type = LINE_LAST;
2003      break;
2004    default:
2005      break;
2006    }
2007  return 1;
2008}
2009
2010/* Peek a token from INPUT, and return the length of the token.
2011   We must not use this function out of bracket expressions.  */
2012
2013static int
2014internal_function
2015peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016{
2017  unsigned char c;
2018  if (re_string_eoi (input))
2019    {
2020      token->type = END_OF_RE;
2021      return 0;
2022    }
2023  c = re_string_peek_byte (input, 0);
2024  token->opr.c = c;
2025
2026#ifdef RE_ENABLE_I18N
2027  if (input->mb_cur_max > 1 &&
2028      !re_string_first_byte (input, re_string_cur_idx (input)))
2029    {
2030      token->type = CHARACTER;
2031      return 1;
2032    }
2033#endif /* RE_ENABLE_I18N */
2034
2035  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2036      && re_string_cur_idx (input) + 1 < re_string_length (input))
2037    {
2038      /* In this case, '\' escape a character.  */
2039      unsigned char c2;
2040      re_string_skip_bytes (input, 1);
2041      c2 = re_string_peek_byte (input, 0);
2042      token->opr.c = c2;
2043      token->type = CHARACTER;
2044      return 1;
2045    }
2046  if (c == '[') /* '[' is a special char in a bracket exps.  */
2047    {
2048      unsigned char c2;
2049      int token_len;
2050      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2051        c2 = re_string_peek_byte (input, 1);
2052      else
2053        c2 = 0;
2054      token->opr.c = c2;
2055      token_len = 2;
2056      switch (c2)
2057        {
2058        case '.':
2059          token->type = OP_OPEN_COLL_ELEM;
2060          break;
2061        case '=':
2062          token->type = OP_OPEN_EQUIV_CLASS;
2063          break;
2064        case ':':
2065          if (syntax & RE_CHAR_CLASSES)
2066            {
2067              token->type = OP_OPEN_CHAR_CLASS;
2068              break;
2069            }
2070          /* else fall through.  */
2071        default:
2072          token->type = CHARACTER;
2073          token->opr.c = c;
2074          token_len = 1;
2075          break;
2076        }
2077      return token_len;
2078    }
2079  switch (c)
2080    {
2081    case '-':
2082      token->type = OP_CHARSET_RANGE;
2083      break;
2084    case ']':
2085      token->type = OP_CLOSE_BRACKET;
2086      break;
2087    case '^':
2088      token->type = OP_NON_MATCH_LIST;
2089      break;
2090    default:
2091      token->type = CHARACTER;
2092    }
2093  return 1;
2094}
2095\f
2096/* Functions for parser.  */
2097
2098/* Entry point of the parser.
2099   Parse the regular expression REGEXP and return the structure tree.
2100   If an error has occurred, ERR is set by error code, and return NULL.
2101   This function build the following tree, from regular expression <reg_exp>:
2102           CAT
2103           / \
2104          /   \
2105   <reg_exp>  EOR
2106
2107   CAT means concatenation.
2108   EOR means end of regular expression.  */
2109
2110static bin_tree_t *
2111parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112       reg_errcode_t *err)
2113{
2114  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2115  bin_tree_t *tree, *eor, *root;
2116  re_token_t current_token;
2117  dfa->syntax = syntax;
2118  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2119  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2120  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121    return NULL;
2122  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2123  if (tree != NULL)
2124    root = create_tree (dfa, tree, eor, CONCAT);
2125  else
2126    root = eor;
2127  if (BE (eor == NULL || root == NULL, 0))
2128    {
2129      *err = REG_ESPACE;
2130      return NULL;
2131    }
2132  return root;
2133}
2134
2135/* This function build the following tree, from regular expression
2136   <branch1>|<branch2>:
2137           ALT
2138           / \
2139          /   \
2140   <branch1> <branch2>
2141
2142   ALT means alternative, which represents the operator `|'.  */
2143
2144static bin_tree_t *
2145parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2146               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2147{
2148  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2149  bin_tree_t *tree, *branch = NULL;
2150  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2151  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152    return NULL;
2153
2154  while (token->type == OP_ALT)
2155    {
2156      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2157      if (token->type != OP_ALT && token->type != END_OF_RE
2158          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159        {
2160          branch = parse_branch (regexp, preg, token, syntax, nest, err);
2161          if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162            return NULL;
2163        }
2164      else
2165        branch = NULL;
2166      tree = create_tree (dfa, tree, branch, OP_ALT);
2167      if (BE (tree == NULL, 0))
2168        {
2169          *err = REG_ESPACE;
2170          return NULL;
2171        }
2172    }
2173  return tree;
2174}
2175
2176/* This function build the following tree, from regular expression
2177   <exp1><exp2>:
2178        CAT
2179        / \
2180       /   \
2181   <exp1> <exp2>
2182
2183   CAT means concatenation.  */
2184
2185static bin_tree_t *
2186parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2187              reg_syntax_t syntax, int nest, reg_errcode_t *err)
2188{
2189  bin_tree_t *tree, *exp;
2190  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2191  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193    return NULL;
2194
2195  while (token->type != OP_ALT && token->type != END_OF_RE
2196         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2197    {
2198      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2199      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2200        {
2201          return NULL;
2202        }
2203      if (tree != NULL && exp != NULL)
2204        {
2205          tree = create_tree (dfa, tree, exp, CONCAT);
2206          if (tree == NULL)
2207            {
2208              *err = REG_ESPACE;
2209              return NULL;
2210            }
2211        }
2212      else if (tree == NULL)
2213        tree = exp;
2214      /* Otherwise exp == NULL, we don't need to create new tree.  */
2215    }
2216  return tree;
2217}
2218
2219/* This function build the following tree, from regular expression a*:
2220         *
2221         |
2222         a
2223*/
2224
2225static bin_tree_t *
2226parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227                  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2228{
2229  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2230  bin_tree_t *tree;
2231  switch (token->type)
2232    {
2233    case CHARACTER:
2234      tree = create_token_tree (dfa, NULL, NULL, token);
2235      if (BE (tree == NULL, 0))
2236        {
2237          *err = REG_ESPACE;
2238          return NULL;
2239        }
2240#ifdef RE_ENABLE_I18N
2241      if (dfa->mb_cur_max > 1)
2242        {
2243          while (!re_string_eoi (regexp)
2244                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2245            {
2246              bin_tree_t *mbc_remain;
2247              fetch_token (token, regexp, syntax);
2248              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2249              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2250              if (BE (mbc_remain == NULL || tree == NULL, 0))
2251                {
2252                  *err = REG_ESPACE;
2253                  return NULL;
2254                }
2255            }
2256        }
2257#endif
2258      break;
2259    case OP_OPEN_SUBEXP:
2260      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2261      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262        return NULL;
2263      break;
2264    case OP_OPEN_BRACKET:
2265      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2266      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267        return NULL;
2268      break;
2269    case OP_BACK_REF:
2270      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271        {
2272          *err = REG_ESUBREG;
2273          return NULL;
2274        }
2275      dfa->used_bkref_map |= 1 << token->opr.idx;
2276      tree = create_token_tree (dfa, NULL, NULL, token);
2277      if (BE (tree == NULL, 0))
2278        {
2279          *err = REG_ESPACE;
2280          return NULL;
2281        }
2282      ++dfa->nbackref;
2283      dfa->has_mb_node = 1;
2284      break;
2285    case OP_OPEN_DUP_NUM:
2286      if (syntax & RE_CONTEXT_INVALID_DUP)
2287        {
2288          *err = REG_BADRPT;
2289          return NULL;
2290        }
2291      /* FALLTHROUGH */
2292    case OP_DUP_ASTERISK:
2293    case OP_DUP_PLUS:
2294    case OP_DUP_QUESTION:
2295      if (syntax & RE_CONTEXT_INVALID_OPS)
2296        {
2297          *err = REG_BADRPT;
2298          return NULL;
2299        }
2300      else if (syntax & RE_CONTEXT_INDEP_OPS)
2301        {
2302          fetch_token (token, regexp, syntax);
2303          return parse_expression (regexp, preg, token, syntax, nest, err);
2304        }
2305      /* else fall through  */
2306    case OP_CLOSE_SUBEXP:
2307      if ((token->type == OP_CLOSE_SUBEXP) &&
2308          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309        {
2310          *err = REG_ERPAREN;
2311          return NULL;
2312        }
2313      /* else fall through  */
2314    case OP_CLOSE_DUP_NUM:
2315      /* We treat it as a normal character.  */
2316
2317      /* Then we can these characters as normal characters.  */
2318      token->type = CHARACTER;
2319      /* mb_partial and word_char bits should be initialized already
2320         by peek_token.  */
2321      tree = create_token_tree (dfa, NULL, NULL, token);
2322      if (BE (tree == NULL, 0))
2323        {
2324          *err = REG_ESPACE;
2325          return NULL;
2326        }
2327      break;
2328    case ANCHOR:
2329      if ((token->opr.ctx_type
2330           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2331          && dfa->word_ops_used == 0)
2332        init_word_char (dfa);
2333      if (token->opr.ctx_type == WORD_DELIM
2334          || token->opr.ctx_type == NOT_WORD_DELIM)
2335        {
2336          bin_tree_t *tree_first, *tree_last;
2337          if (token->opr.ctx_type == WORD_DELIM)
2338            {
2339              token->opr.ctx_type = WORD_FIRST;
2340              tree_first = create_token_tree (dfa, NULL, NULL, token);
2341              token->opr.ctx_type = WORD_LAST;
2342            }
2343          else
2344            {
2345              token->opr.ctx_type = INSIDE_WORD;
2346              tree_first = create_token_tree (dfa, NULL, NULL, token);
2347              token->opr.ctx_type = INSIDE_NOTWORD;
2348            }
2349          tree_last = create_token_tree (dfa, NULL, NULL, token);
2350          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2352            {
2353              *err = REG_ESPACE;
2354              return NULL;
2355            }
2356        }
2357      else
2358        {
2359          tree = create_token_tree (dfa, NULL, NULL, token);
2360          if (BE (tree == NULL, 0))
2361            {
2362              *err = REG_ESPACE;
2363              return NULL;
2364            }
2365        }
2366      /* We must return here, since ANCHORs can't be followed
2367         by repetition operators.
2368         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2369             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2370      fetch_token (token, regexp, syntax);
2371      return tree;
2372    case OP_PERIOD:
2373      tree = create_token_tree (dfa, NULL, NULL, token);
2374      if (BE (tree == NULL, 0))
2375        {
2376          *err = REG_ESPACE;
2377          return NULL;
2378        }
2379      if (dfa->mb_cur_max > 1)
2380        dfa->has_mb_node = 1;
2381      break;
2382    case OP_WORD:
2383    case OP_NOTWORD:
2384      tree = build_charclass_op (dfa, regexp->trans,
2385                                 "alnum",
2386                                 "_",
2387                                 token->type == OP_NOTWORD, err);
2388      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389        return NULL;
2390      break;
2391    case OP_SPACE:
2392    case OP_NOTSPACE:
2393      tree = build_charclass_op (dfa, regexp->trans,
2394                                 "space",
2395                                 "",
2396                                 token->type == OP_NOTSPACE, err);
2397      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398        return NULL;
2399      break;
2400    case OP_ALT:
2401    case END_OF_RE:
2402      return NULL;
2403    case BACK_SLASH:
2404      *err = REG_EESCAPE;
2405      return NULL;
2406    default:
2407      /* Must not happen?  */
2408#ifdef DEBUG
2409      assert (0);
2410#endif
2411      return NULL;
2412    }
2413  fetch_token (token, regexp, syntax);
2414
2415  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2416         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2417    {
2418      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2419      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420        return NULL;
2421      /* In BRE consecutive duplications are not allowed.  */
2422      if ((syntax & RE_CONTEXT_INVALID_DUP)
2423          && (token->type == OP_DUP_ASTERISK
2424              || token->type == OP_OPEN_DUP_NUM))
2425        {
2426          *err = REG_BADRPT;
2427          return NULL;
2428        }
2429    }
2430
2431  return tree;
2432}
2433
2434/* This function build the following tree, from regular expression
2435   (<reg_exp>):
2436         SUBEXP
2437            |
2438        <reg_exp>
2439*/
2440
2441static bin_tree_t *
2442parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2443               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2444{
2445  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446  bin_tree_t *tree;
2447  size_t cur_nsub;
2448  cur_nsub = preg->re_nsub++;
2449
2450  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2451
2452  /* The subexpression may be a null string.  */
2453  if (token->type == OP_CLOSE_SUBEXP)
2454    tree = NULL;
2455  else
2456    {
2457      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2459        *err = REG_EPAREN;
2460      if (BE (*err != REG_NOERROR, 0))
2461        return NULL;
2462    }
2463
2464  if (cur_nsub <= '9' - '1')
2465    dfa->completed_bkref_map |= 1 << cur_nsub;
2466
2467  tree = create_tree (dfa, tree, NULL, SUBEXP);
2468  if (BE (tree == NULL, 0))
2469    {
2470      *err = REG_ESPACE;
2471      return NULL;
2472    }
2473  tree->token.opr.idx = cur_nsub;
2474  return tree;
2475}
2476
2477/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2478
2479static bin_tree_t *
2480parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2481              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2482{
2483  bin_tree_t *tree = NULL, *old_tree = NULL;
2484  int i, start, end, start_idx = re_string_cur_idx (regexp);
2485#ifndef RE_TOKEN_INIT_BUG
2486  re_token_t start_token = *token;
2487#else
2488  re_token_t start_token;
2489
2490  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2491#endif
2492
2493  if (token->type == OP_OPEN_DUP_NUM)
2494    {
2495      end = 0;
2496      start = fetch_number (regexp, token, syntax);
2497      if (start == -1)
2498        {
2499          if (token->type == CHARACTER && token->opr.c == ',')
2500            start = 0; /* We treat "{,m}" as "{0,m}".  */
2501          else
2502            {
2503              *err = REG_BADBR; /* <re>{} is invalid.  */
2504              return NULL;
2505            }
2506        }
2507      if (BE (start != -2, 1))
2508        {
2509          /* We treat "{n}" as "{n,n}".  */
2510          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2511                 : ((token->type == CHARACTER && token->opr.c == ',')
2512                    ? fetch_number (regexp, token, syntax) : -2));
2513        }
2514      if (BE (start == -2 || end == -2, 0))
2515        {
2516          /* Invalid sequence.  */
2517          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2518            {
2519              if (token->type == END_OF_RE)
2520                *err = REG_EBRACE;
2521              else
2522                *err = REG_BADBR;
2523
2524              return NULL;
2525            }
2526
2527          /* If the syntax bit is set, rollback.  */
2528          re_string_set_index (regexp, start_idx);
2529          *token = start_token;
2530          token->type = CHARACTER;
2531          /* mb_partial and word_char bits should be already initialized by
2532             peek_token.  */
2533          return elem;
2534        }
2535
2536      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2537        {
2538          /* First number greater than second.  */
2539          *err = REG_BADBR;
2540          return NULL;
2541        }
2542    }
2543  else
2544    {
2545      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2546      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2547    }
2548
2549  fetch_token (token, regexp, syntax);
2550
2551  if (BE (elem == NULL, 0))
2552    return NULL;
2553  if (BE (start == 0 && end == 0, 0))
2554    {
2555      postorder (elem, free_tree, NULL);
2556      return NULL;
2557    }
2558
2559  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2560  if (BE (start > 0, 0))
2561    {
2562      tree = elem;
2563      for (i = 2; i <= start; ++i)
2564        {
2565          elem = duplicate_tree (elem, dfa);
2566          tree = create_tree (dfa, tree, elem, CONCAT);
2567          if (BE (elem == NULL || tree == NULL, 0))
2568            goto parse_dup_op_espace;
2569        }
2570
2571      if (start == end)
2572        return tree;
2573
2574      /* Duplicate ELEM before it is marked optional.  */
2575      elem = duplicate_tree (elem, dfa);
2576      old_tree = tree;
2577    }
2578  else
2579    old_tree = NULL;
2580
2581  if (elem->token.type == SUBEXP)
2582    postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2583
2584  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2585  if (BE (tree == NULL, 0))
2586    goto parse_dup_op_espace;
2587
2588  /* This loop is actually executed only when end != -1,
2589     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2590     already created the start+1-th copy.  */
2591  for (i = start + 2; i <= end; ++i)
2592    {
2593      elem = duplicate_tree (elem, dfa);
2594      tree = create_tree (dfa, tree, elem, CONCAT);
2595      if (BE (elem == NULL || tree == NULL, 0))
2596        goto parse_dup_op_espace;
2597
2598      tree = create_tree (dfa, tree, NULL, OP_ALT);
2599      if (BE (tree == NULL, 0))
2600        goto parse_dup_op_espace;
2601    }
2602
2603  if (old_tree)
2604    tree = create_tree (dfa, old_tree, tree, CONCAT);
2605
2606  return tree;
2607
2608 parse_dup_op_espace:
2609  *err = REG_ESPACE;
2610  return NULL;
2611}
2612
2613/* Size of the names for collating symbol/equivalence_class/character_class.
2614   I'm not sure, but maybe enough.  */
2615#define BRACKET_NAME_BUF_SIZE 32
2616
2617#ifndef _LIBC
2618  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2619     Build the range expression which starts from START_ELEM, and ends
2620     at END_ELEM.  The result are written to MBCSET and SBCSET.
2621     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2622     mbcset->range_ends, is a pointer argument since we may
2623     update it.  */
2624
2625static reg_errcode_t
2626internal_function
2627# ifdef RE_ENABLE_I18N
2628build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2629                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2630# else /* not RE_ENABLE_I18N */
2631build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2632                 bracket_elem_t *end_elem)
2633# endif /* not RE_ENABLE_I18N */
2634{
2635  unsigned int start_ch, end_ch;
2636  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2637  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2638          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2639          0))
2640    return REG_ERANGE;
2641
2642  /* We can handle no multi character collating elements without libc
2643     support.  */
2644  if (BE ((start_elem->type == COLL_SYM
2645           && strlen ((char *) start_elem->opr.name) > 1)
2646          || (end_elem->type == COLL_SYM
2647              && strlen ((char *) end_elem->opr.name) > 1), 0))
2648    return REG_ECOLLATE;
2649
2650# ifdef RE_ENABLE_I18N
2651  {
2652    wchar_t wc;
2653    wint_t start_wc;
2654    wint_t end_wc;
2655    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2656
2657    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2658                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2659                   : 0));
2660    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2661              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2662                 : 0));
2663#ifdef GAWK
2664    /*
2665     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2666     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2667     * unsigned, so we don't have sign extension problems.
2668     */
2669    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2670                ? start_ch : start_elem->opr.wch);
2671    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2672              ? end_ch : end_elem->opr.wch);
2673#else
2674    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675                ? __btowc (start_ch) : start_elem->opr.wch);
2676    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677              ? __btowc (end_ch) : end_elem->opr.wch);
2678#endif
2679    if (start_wc == WEOF || end_wc == WEOF)
2680      return REG_ECOLLATE;
2681    cmp_buf[0] = start_wc;
2682    cmp_buf[4] = end_wc;
2683    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2684      return REG_ERANGE;
2685
2686    /* Got valid collation sequence values, add them as a new entry.
2687       However, for !_LIBC we have no collation elements: if the
2688       character set is single byte, the single byte character set
2689       that we build below suffices.  parse_bracket_exp passes
2690       no MBCSET if dfa->mb_cur_max == 1.  */
2691    if (mbcset)
2692      {
2693        /* Check the space of the arrays.  */
2694        if (BE (*range_alloc == mbcset->nranges, 0))
2695          {
2696            /* There is not enough space, need realloc.  */
2697            wchar_t *new_array_start, *new_array_end;
2698            int new_nranges;
2699
2700            /* +1 in case of mbcset->nranges is 0.  */
2701            new_nranges = 2 * mbcset->nranges + 1;
2702            /* Use realloc since mbcset->range_starts and mbcset->range_ends
2703               are NULL if *range_alloc == 0.  */
2704            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2705                                          new_nranges);
2706            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2707                                        new_nranges);
2708
2709            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2710              return REG_ESPACE;
2711
2712            mbcset->range_starts = new_array_start;
2713            mbcset->range_ends = new_array_end;
2714            *range_alloc = new_nranges;
2715          }
2716
2717        mbcset->range_starts[mbcset->nranges] = start_wc;
2718        mbcset->range_ends[mbcset->nranges++] = end_wc;
2719      }
2720
2721    /* Build the table for single byte characters.  */
2722    for (wc = 0; wc < SBC_MAX; ++wc)
2723      {
2724        cmp_buf[2] = wc;
2725        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2726            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2727          bitset_set (sbcset, wc);
2728      }
2729  }
2730# else /* not RE_ENABLE_I18N */
2731  {
2732    unsigned int ch;
2733    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2734                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2735                   : 0));
2736    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2737              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2738                 : 0));
2739    if (start_ch > end_ch)
2740      return REG_ERANGE;
2741    /* Build the table for single byte characters.  */
2742    for (ch = 0; ch < SBC_MAX; ++ch)
2743      if (start_ch <= ch  && ch <= end_ch)
2744        bitset_set (sbcset, ch);
2745  }
2746# endif /* not RE_ENABLE_I18N */
2747  return REG_NOERROR;
2748}
2749#endif /* not _LIBC */
2750
2751#ifndef _LIBC
2752/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2753   Build the collating element which is represented by NAME.
2754   The result are written to MBCSET and SBCSET.
2755   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2756   pointer argument since we may update it.  */
2757
2758static reg_errcode_t
2759internal_function
2760# ifdef RE_ENABLE_I18N
2761build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2762                        int *coll_sym_alloc, const unsigned char *name)
2763# else /* not RE_ENABLE_I18N */
2764build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2765# endif /* not RE_ENABLE_I18N */
2766{
2767  size_t name_len = strlen ((const char *) name);
2768  if (BE (name_len != 1, 0))
2769    return REG_ECOLLATE;
2770  else
2771    {
2772      bitset_set (sbcset, name[0]);
2773      return REG_NOERROR;
2774    }
2775}
2776#endif /* not _LIBC */
2777
2778/* This function parse bracket expression like "[abc]", "[a-c]",
2779   "[[.a-a.]]" etc.  */
2780
2781static bin_tree_t *
2782parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2783                   reg_syntax_t syntax, reg_errcode_t *err)
2784{
2785#ifdef _LIBC
2786  const unsigned char *collseqmb;
2787  const char *collseqwc;
2788  uint32_t nrules;
2789  int32_t table_size;
2790  const int32_t *symb_table;
2791  const unsigned char *extra;
2792
2793  /* Local function for parse_bracket_exp used in _LIBC environment.
2794     Seek the collating symbol entry correspondings to NAME.
2795     Return the index of the symbol in the SYMB_TABLE.  */
2796
2797  auto inline int32_t
2798  __attribute ((always_inline))
2799  seek_collating_symbol_entry (name, name_len)
2800         const unsigned char *name;
2801         size_t name_len;
2802    {
2803      int32_t hash = elem_hash ((const char *) name, name_len);
2804      int32_t elem = hash % table_size;
2805      if (symb_table[2 * elem] != 0)
2806        {
2807          int32_t second = hash % (table_size - 2) + 1;
2808
2809          do
2810            {
2811              /* First compare the hashing value.  */
2812              if (symb_table[2 * elem] == hash
2813                  /* Compare the length of the name.  */
2814                  && name_len == extra[symb_table[2 * elem + 1]]
2815                  /* Compare the name.  */
2816                  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2817                             name_len) == 0)
2818                {
2819                  /* Yep, this is the entry.  */
2820                  break;
2821                }
2822
2823              /* Next entry.  */
2824              elem += second;
2825            }
2826          while (symb_table[2 * elem] != 0);
2827        }
2828      return elem;
2829    }
2830
2831  /* Local function for parse_bracket_exp used in _LIBC environment.
2832     Look up the collation sequence value of BR_ELEM.
2833     Return the value if succeeded, UINT_MAX otherwise.  */
2834
2835  auto inline unsigned int
2836  __attribute ((always_inline))
2837  lookup_collation_sequence_value (br_elem)
2838         bracket_elem_t *br_elem;
2839    {
2840      if (br_elem->type == SB_CHAR)
2841        {
2842          /*
2843          if (MB_CUR_MAX == 1)
2844          */
2845          if (nrules == 0)
2846            return collseqmb[br_elem->opr.ch];
2847          else
2848            {
2849              wint_t wc = __btowc (br_elem->opr.ch);
2850              return __collseq_table_lookup (collseqwc, wc);
2851            }
2852        }
2853      else if (br_elem->type == MB_CHAR)
2854        {
2855          if (nrules != 0)
2856            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2857        }
2858      else if (br_elem->type == COLL_SYM)
2859        {
2860          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2861          if (nrules != 0)
2862            {
2863              int32_t elem, idx;
2864              elem = seek_collating_symbol_entry (br_elem->opr.name,
2865                                                  sym_name_len);
2866              if (symb_table[2 * elem] != 0)
2867                {
2868                  /* We found the entry.  */
2869                  idx = symb_table[2 * elem + 1];
2870                  /* Skip the name of collating element name.  */
2871                  idx += 1 + extra[idx];
2872                  /* Skip the byte sequence of the collating element.  */
2873                  idx += 1 + extra[idx];
2874                  /* Adjust for the alignment.  */
2875                  idx = (idx + 3) & ~3;
2876                  /* Skip the multibyte collation sequence value.  */
2877                  idx += sizeof (unsigned int);
2878                  /* Skip the wide char sequence of the collating element.  */
2879                  idx += sizeof (unsigned int) *
2880                    (1 + *(unsigned int *) (extra + idx));
2881                  /* Return the collation sequence value.  */
2882                  return *(unsigned int *) (extra + idx);
2883                }
2884              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2885                {
2886                  /* No valid character.  Match it as a single byte
2887                     character.  */
2888                  return collseqmb[br_elem->opr.name[0]];
2889                }
2890            }
2891          else if (sym_name_len == 1)
2892            return collseqmb[br_elem->opr.name[0]];
2893        }
2894      return UINT_MAX;
2895    }
2896
2897  /* Local function for parse_bracket_exp used in _LIBC environment.
2898     Build the range expression which starts from START_ELEM, and ends
2899     at END_ELEM.  The result are written to MBCSET and SBCSET.
2900     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2901     mbcset->range_ends, is a pointer argument since we may
2902     update it.  */
2903
2904  auto inline reg_errcode_t
2905  __attribute ((always_inline))
2906  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2907         re_charset_t *mbcset;
2908         int *range_alloc;
2909         bitset_t sbcset;
2910         bracket_elem_t *start_elem, *end_elem;
2911    {
2912      unsigned int ch;
2913      uint32_t start_collseq;
2914      uint32_t end_collseq;
2915
2916      /* Equivalence Classes and Character Classes can't be a range
2917         start/end.  */
2918      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2919              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2920              0))
2921        return REG_ERANGE;
2922
2923      start_collseq = lookup_collation_sequence_value (start_elem);
2924      end_collseq = lookup_collation_sequence_value (end_elem);
2925      /* Check start/end collation sequence values.  */
2926      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2927        return REG_ECOLLATE;
2928      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2929        return REG_ERANGE;
2930
2931      /* Got valid collation sequence values, add them as a new entry.
2932         However, if we have no collation elements, and the character set
2933         is single byte, the single byte character set that we
2934         build below suffices. */
2935      if (nrules > 0 || dfa->mb_cur_max > 1)
2936        {
2937          /* Check the space of the arrays.  */
2938          if (BE (*range_alloc == mbcset->nranges, 0))
2939            {
2940              /* There is not enough space, need realloc.  */
2941              uint32_t *new_array_start;
2942              uint32_t *new_array_end;
2943              int new_nranges;
2944
2945              /* +1 in case of mbcset->nranges is 0.  */
2946              new_nranges = 2 * mbcset->nranges + 1;
2947              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2948                                            new_nranges);
2949              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2950                                          new_nranges);
2951
2952              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2953                return REG_ESPACE;
2954
2955              mbcset->range_starts = new_array_start;
2956              mbcset->range_ends = new_array_end;
2957              *range_alloc = new_nranges;
2958            }
2959
2960          mbcset->range_starts[mbcset->nranges] = start_collseq;
2961          mbcset->range_ends[mbcset->nranges++] = end_collseq;
2962        }
2963
2964      /* Build the table for single byte characters.  */
2965      for (ch = 0; ch < SBC_MAX; ch++)
2966        {
2967          uint32_t ch_collseq;
2968          /*
2969          if (MB_CUR_MAX == 1)
2970          */
2971          if (nrules == 0)
2972            ch_collseq = collseqmb[ch];
2973          else
2974            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2975          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2976            bitset_set (sbcset, ch);
2977        }
2978      return REG_NOERROR;
2979    }
2980
2981  /* Local function for parse_bracket_exp used in _LIBC environment.
2982     Build the collating element which is represented by NAME.
2983     The result are written to MBCSET and SBCSET.
2984     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2985     pointer argument since we may update it.  */
2986
2987  auto inline reg_errcode_t
2988  __attribute ((always_inline))
2989  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2990         re_charset_t *mbcset;
2991         int *coll_sym_alloc;
2992         bitset_t sbcset;
2993         const unsigned char *name;
2994    {
2995      int32_t elem, idx;
2996      size_t name_len = strlen ((const char *) name);
2997      if (nrules != 0)
2998        {
2999          elem = seek_collating_symbol_entry (name, name_len);
3000          if (symb_table[2 * elem] != 0)
3001            {
3002              /* We found the entry.  */
3003              idx = symb_table[2 * elem + 1];
3004              /* Skip the name of collating element name.  */
3005              idx += 1 + extra[idx];
3006            }
3007          else if (symb_table[2 * elem] == 0 && name_len == 1)
3008            {
3009              /* No valid character, treat it as a normal
3010                 character.  */
3011              bitset_set (sbcset, name[0]);
3012              return REG_NOERROR;
3013            }
3014          else
3015            return REG_ECOLLATE;
3016
3017          /* Got valid collation sequence, add it as a new entry.  */
3018          /* Check the space of the arrays.  */
3019          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3020            {
3021              /* Not enough, realloc it.  */
3022              /* +1 in case of mbcset->ncoll_syms is 0.  */
3023              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3024              /* Use realloc since mbcset->coll_syms is NULL
3025                 if *alloc == 0.  */
3026              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3027                                                   new_coll_sym_alloc);
3028              if (BE (new_coll_syms == NULL, 0))
3029                return REG_ESPACE;
3030              mbcset->coll_syms = new_coll_syms;
3031              *coll_sym_alloc = new_coll_sym_alloc;
3032            }
3033          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3034          return REG_NOERROR;
3035        }
3036      else
3037        {
3038          if (BE (name_len != 1, 0))
3039            return REG_ECOLLATE;
3040          else
3041            {
3042              bitset_set (sbcset, name[0]);
3043              return REG_NOERROR;
3044            }
3045        }
3046    }
3047#endif
3048
3049  re_token_t br_token;
3050  re_bitset_ptr_t sbcset;
3051#ifdef RE_ENABLE_I18N
3052  re_charset_t *mbcset;
3053  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3054  int equiv_class_alloc = 0, char_class_alloc = 0;
3055#endif /* not RE_ENABLE_I18N */
3056  int non_match = 0;
3057  bin_tree_t *work_tree;
3058  int token_len;
3059  int first_round = 1;
3060#ifdef _LIBC
3061  collseqmb = (const unsigned char *)
3062    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3063  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3064  if (nrules)
3065    {
3066      /*
3067      if (MB_CUR_MAX > 1)
3068      */
3069      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3070      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3071      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3072                                                  _NL_COLLATE_SYMB_TABLEMB);
3073      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3074                                                   _NL_COLLATE_SYMB_EXTRAMB);
3075    }
3076#endif
3077  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3078#ifdef RE_ENABLE_I18N
3079  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3080#endif /* RE_ENABLE_I18N */
3081#ifdef RE_ENABLE_I18N
3082  if (BE (sbcset == NULL || mbcset == NULL, 0))
3083#else
3084  if (BE (sbcset == NULL, 0))
3085#endif /* RE_ENABLE_I18N */
3086    {
3087      *err = REG_ESPACE;
3088      return NULL;
3089    }
3090
3091  token_len = peek_token_bracket (token, regexp, syntax);
3092  if (BE (token->type == END_OF_RE, 0))
3093    {
3094      *err = REG_BADPAT;
3095      goto parse_bracket_exp_free_return;
3096    }
3097  if (token->type == OP_NON_MATCH_LIST)
3098    {
3099#ifdef RE_ENABLE_I18N
3100      mbcset->non_match = 1;
3101#endif /* not RE_ENABLE_I18N */
3102      non_match = 1;
3103      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3104        bitset_set (sbcset, '\n');
3105      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3106      token_len = peek_token_bracket (token, regexp, syntax);
3107      if (BE (token->type == END_OF_RE, 0))
3108        {
3109          *err = REG_BADPAT;
3110          goto parse_bracket_exp_free_return;
3111        }
3112    }
3113
3114  /* We treat the first ']' as a normal character.  */
3115  if (token->type == OP_CLOSE_BRACKET)
3116    token->type = CHARACTER;
3117
3118  while (1)
3119    {
3120      bracket_elem_t start_elem, end_elem;
3121      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3122      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3123      reg_errcode_t ret;
3124      int token_len2 = 0, is_range_exp = 0;
3125      re_token_t token2;
3126
3127      start_elem.opr.name = start_name_buf;
3128      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3129                                   syntax, first_round);
3130      if (BE (ret != REG_NOERROR, 0))
3131        {
3132          *err = ret;
3133          goto parse_bracket_exp_free_return;
3134        }
3135      first_round = 0;
3136
3137      /* Get information about the next token.  We need it in any case.  */
3138      token_len = peek_token_bracket (token, regexp, syntax);
3139
3140      /* Do not check for ranges if we know they are not allowed.  */
3141      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3142        {
3143          if (BE (token->type == END_OF_RE, 0))
3144            {
3145              *err = REG_EBRACK;
3146              goto parse_bracket_exp_free_return;
3147            }
3148          if (token->type == OP_CHARSET_RANGE)
3149            {
3150              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3151              token_len2 = peek_token_bracket (&token2, regexp, syntax);
3152              if (BE (token2.type == END_OF_RE, 0))
3153                {
3154                  *err = REG_EBRACK;
3155                  goto parse_bracket_exp_free_return;
3156                }
3157              if (token2.type == OP_CLOSE_BRACKET)
3158                {
3159                  /* We treat the last '-' as a normal character.  */
3160                  re_string_skip_bytes (regexp, -token_len);
3161                  token->type = CHARACTER;
3162                }
3163              else
3164                is_range_exp = 1;
3165            }
3166        }
3167
3168      if (is_range_exp == 1)
3169        {
3170          end_elem.opr.name = end_name_buf;
3171          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3172                                       dfa, syntax, 1);
3173          if (BE (ret != REG_NOERROR, 0))
3174            {
3175              *err = ret;
3176              goto parse_bracket_exp_free_return;
3177            }
3178
3179          token_len = peek_token_bracket (token, regexp, syntax);
3180
3181#ifdef _LIBC
3182          *err = build_range_exp (sbcset, mbcset, &range_alloc,
3183                                  &start_elem, &end_elem);
3184#else
3185# ifdef RE_ENABLE_I18N
3186          *err = build_range_exp (sbcset,
3187                                  dfa->mb_cur_max > 1 ? mbcset : NULL,
3188                                  &range_alloc, &start_elem, &end_elem);
3189# else
3190          *err = build_range_exp (sbcset, &start_elem, &end_elem);
3191# endif
3192#endif /* RE_ENABLE_I18N */
3193          if (BE (*err != REG_NOERROR, 0))
3194            goto parse_bracket_exp_free_return;
3195        }
3196      else
3197        {
3198          switch (start_elem.type)
3199            {
3200            case SB_CHAR:
3201              bitset_set (sbcset, start_elem.opr.ch);
3202              break;
3203#ifdef RE_ENABLE_I18N
3204            case MB_CHAR:
3205              /* Check whether the array has enough space.  */
3206              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3207                {
3208                  wchar_t *new_mbchars;
3209                  /* Not enough, realloc it.  */
3210                  /* +1 in case of mbcset->nmbchars is 0.  */
3211                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3212                  /* Use realloc since array is NULL if *alloc == 0.  */
3213                  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3214                                            mbchar_alloc);
3215                  if (BE (new_mbchars == NULL, 0))
3216                    goto parse_bracket_exp_espace;
3217                  mbcset->mbchars = new_mbchars;
3218                }
3219              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3220              break;
3221#endif /* RE_ENABLE_I18N */
3222            case EQUIV_CLASS:
3223              *err = build_equiv_class (sbcset,
3224#ifdef RE_ENABLE_I18N
3225                                        mbcset, &equiv_class_alloc,
3226#endif /* RE_ENABLE_I18N */
3227                                        start_elem.opr.name);
3228              if (BE (*err != REG_NOERROR, 0))
3229                goto parse_bracket_exp_free_return;
3230              break;
3231            case COLL_SYM:
3232              *err = build_collating_symbol (sbcset,
3233#ifdef RE_ENABLE_I18N
3234                                             mbcset, &coll_sym_alloc,
3235#endif /* RE_ENABLE_I18N */
3236                                             start_elem.opr.name);
3237              if (BE (*err != REG_NOERROR, 0))
3238                goto parse_bracket_exp_free_return;
3239              break;
3240            case CHAR_CLASS:
3241              *err = build_charclass (regexp->trans, sbcset,
3242#ifdef RE_ENABLE_I18N
3243                                      mbcset, &char_class_alloc,
3244#endif /* RE_ENABLE_I18N */
3245                                      (const char *) start_elem.opr.name, syntax);
3246              if (BE (*err != REG_NOERROR, 0))
3247               goto parse_bracket_exp_free_return;
3248              break;
3249            default:
3250              assert (0);
3251              break;
3252            }
3253        }
3254      if (BE (token->type == END_OF_RE, 0))
3255        {
3256          *err = REG_EBRACK;
3257          goto parse_bracket_exp_free_return;
3258        }
3259      if (token->type == OP_CLOSE_BRACKET)
3260        break;
3261    }
3262
3263  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3264
3265  /* If it is non-matching list.  */
3266  if (non_match)
3267    bitset_not (sbcset);
3268
3269#ifdef RE_ENABLE_I18N
3270  /* Ensure only single byte characters are set.  */
3271  if (dfa->mb_cur_max > 1)
3272    bitset_mask (sbcset, dfa->sb_char);
3273
3274  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3275      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3276                                                     || mbcset->non_match)))
3277    {
3278      bin_tree_t *mbc_tree;
3279      int sbc_idx;
3280      /* Build a tree for complex bracket.  */
3281      dfa->has_mb_node = 1;
3282      br_token.type = COMPLEX_BRACKET;
3283      br_token.opr.mbcset = mbcset;
3284      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3285      if (BE (mbc_tree == NULL, 0))
3286        goto parse_bracket_exp_espace;
3287      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3288        if (sbcset[sbc_idx])
3289          break;
3290      /* If there are no bits set in sbcset, there is no point
3291         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3292      if (sbc_idx < BITSET_WORDS)
3293        {
3294          /* Build a tree for simple bracket.  */
3295          br_token.type = SIMPLE_BRACKET;
3296          br_token.opr.sbcset = sbcset;
3297          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3298          if (BE (work_tree == NULL, 0))
3299            goto parse_bracket_exp_espace;
3300
3301          /* Then join them by ALT node.  */
3302          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3303          if (BE (work_tree == NULL, 0))
3304            goto parse_bracket_exp_espace;
3305        }
3306      else
3307        {
3308          re_free (sbcset);
3309          work_tree = mbc_tree;
3310        }
3311    }
3312  else
3313#endif /* not RE_ENABLE_I18N */
3314    {
3315#ifdef RE_ENABLE_I18N
3316      free_charset (mbcset);
3317#endif
3318      /* Build a tree for simple bracket.  */
3319      br_token.type = SIMPLE_BRACKET;
3320      br_token.opr.sbcset = sbcset;
3321      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322      if (BE (work_tree == NULL, 0))
3323        goto parse_bracket_exp_espace;
3324    }
3325  return work_tree;
3326
3327 parse_bracket_exp_espace:
3328  *err = REG_ESPACE;
3329 parse_bracket_exp_free_return:
3330  re_free (sbcset);
3331#ifdef RE_ENABLE_I18N
3332  free_charset (mbcset);
3333#endif /* RE_ENABLE_I18N */
3334  return NULL;
3335}
3336
3337/* Parse an element in the bracket expression.  */
3338
3339static reg_errcode_t
3340parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3341                       re_token_t *token, int token_len, re_dfa_t *dfa,
3342                       reg_syntax_t syntax, int accept_hyphen)
3343{
3344#ifdef RE_ENABLE_I18N
3345  int cur_char_size;
3346  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3347  if (cur_char_size > 1)
3348    {
3349      elem->type = MB_CHAR;
3350      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3351      re_string_skip_bytes (regexp, cur_char_size);
3352      return REG_NOERROR;
3353    }
3354#endif /* RE_ENABLE_I18N */
3355  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3356  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3357      || token->type == OP_OPEN_EQUIV_CLASS)
3358    return parse_bracket_symbol (elem, regexp, token);
3359  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3360    {
3361      /* A '-' must only appear as anything but a range indicator before
3362         the closing bracket.  Everything else is an error.  */
3363      re_token_t token2;
3364      (void) peek_token_bracket (&token2, regexp, syntax);
3365      if (token2.type != OP_CLOSE_BRACKET)
3366        /* The actual error value is not standardized since this whole
3367           case is undefined.  But ERANGE makes good sense.  */
3368        return REG_ERANGE;
3369    }
3370  elem->type = SB_CHAR;
3371  elem->opr.ch = token->opr.c;
3372  return REG_NOERROR;
3373}
3374
3375/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3376   such as [:<character_class>:], [.<collating_element>.], and
3377   [=<equivalent_class>=].  */
3378
3379static reg_errcode_t
3380parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3381                      re_token_t *token)
3382{
3383  unsigned char ch, delim = token->opr.c;
3384  int i = 0;
3385  if (re_string_eoi(regexp))
3386    return REG_EBRACK;
3387  for (;; ++i)
3388    {
3389      if (i >= BRACKET_NAME_BUF_SIZE)
3390        return REG_EBRACK;
3391      if (token->type == OP_OPEN_CHAR_CLASS)
3392        ch = re_string_fetch_byte_case (regexp);
3393      else
3394        ch = re_string_fetch_byte (regexp);
3395      if (re_string_eoi(regexp))
3396        return REG_EBRACK;
3397      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3398        break;
3399      elem->opr.name[i] = ch;
3400    }
3401  re_string_skip_bytes (regexp, 1);
3402  elem->opr.name[i] = '\0';
3403  switch (token->type)
3404    {
3405    case OP_OPEN_COLL_ELEM:
3406      elem->type = COLL_SYM;
3407      break;
3408    case OP_OPEN_EQUIV_CLASS:
3409      elem->type = EQUIV_CLASS;
3410      break;
3411    case OP_OPEN_CHAR_CLASS:
3412      elem->type = CHAR_CLASS;
3413      break;
3414    default:
3415      break;
3416    }
3417  return REG_NOERROR;
3418}
3419
3420  /* Helper function for parse_bracket_exp.
3421     Build the equivalence class which is represented by NAME.
3422     The result are written to MBCSET and SBCSET.
3423     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3424     is a pointer argument since we may update it.  */
3425
3426static reg_errcode_t
3427#ifdef RE_ENABLE_I18N
3428build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3429                   int *equiv_class_alloc, const unsigned char *name)
3430#else /* not RE_ENABLE_I18N */
3431build_equiv_class (bitset_t sbcset, const unsigned char *name)
3432#endif /* not RE_ENABLE_I18N */
3433{
3434#ifdef _LIBC
3435  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3436  if (nrules != 0)
3437    {
3438      const int32_t *table, *indirect;
3439      const unsigned char *weights, *extra, *cp;
3440      unsigned char char_buf[2];
3441      int32_t idx1, idx2;
3442      unsigned int ch;
3443      size_t len;
3444      /* This #include defines a local function!  */
3445# include <locale/weight.h>
3446      /* Calculate the index for equivalence class.  */
3447      cp = name;
3448      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3449      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450                                               _NL_COLLATE_WEIGHTMB);
3451      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3452                                                   _NL_COLLATE_EXTRAMB);
3453      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3454                                                _NL_COLLATE_INDIRECTMB);
3455      idx1 = findidx (&cp);
3456      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3457        /* This isn't a valid character.  */
3458        return REG_ECOLLATE;
3459
3460      /* Build single byte matcing table for this equivalence class.  */
3461      char_buf[1] = (unsigned char) '\0';
3462      len = weights[idx1 & 0xffffff];
3463      for (ch = 0; ch < SBC_MAX; ++ch)
3464        {
3465          char_buf[0] = ch;
3466          cp = char_buf;
3467          idx2 = findidx (&cp);
3468/*
3469          idx2 = table[ch];
3470*/
3471          if (idx2 == 0)
3472            /* This isn't a valid character.  */
3473            continue;
3474          /* Compare only if the length matches and the collation rule
3475             index is the same.  */
3476          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3477            {
3478              int cnt = 0;
3479
3480              while (cnt <= len &&
3481                     weights[(idx1 & 0xffffff) + 1 + cnt]
3482                     == weights[(idx2 & 0xffffff) + 1 + cnt])
3483                ++cnt;
3484
3485              if (cnt > len)
3486                bitset_set (sbcset, ch);
3487            }
3488        }
3489      /* Check whether the array has enough space.  */
3490      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3491        {
3492          /* Not enough, realloc it.  */
3493          /* +1 in case of mbcset->nequiv_classes is 0.  */
3494          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3495          /* Use realloc since the array is NULL if *alloc == 0.  */
3496          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3497                                                   int32_t,
3498                                                   new_equiv_class_alloc);
3499          if (BE (new_equiv_classes == NULL, 0))
3500            return REG_ESPACE;
3501          mbcset->equiv_classes = new_equiv_classes;
3502          *equiv_class_alloc = new_equiv_class_alloc;
3503        }
3504      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3505    }
3506  else
3507#endif /* _LIBC */
3508    {
3509      if (BE (strlen ((const char *) name) != 1, 0))
3510        return REG_ECOLLATE;
3511      bitset_set (sbcset, *name);
3512    }
3513  return REG_NOERROR;
3514}
3515
3516  /* Helper function for parse_bracket_exp.
3517     Build the character class which is represented by NAME.
3518     The result are written to MBCSET and SBCSET.
3519     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520     is a pointer argument since we may update it.  */
3521
3522static reg_errcode_t
3523#ifdef RE_ENABLE_I18N
3524build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3525                 re_charset_t *mbcset, int *char_class_alloc,
3526                 const char *class_name, reg_syntax_t syntax)
3527#else /* not RE_ENABLE_I18N */
3528build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3529                 const char *class_name, reg_syntax_t syntax)
3530#endif /* not RE_ENABLE_I18N */
3531{
3532  int i;
3533
3534  /* In case of REG_ICASE "upper" and "lower" match the both of
3535     upper and lower cases.  */
3536  if ((syntax & RE_ICASE)
3537      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3538    class_name = "alpha";
3539
3540#ifdef RE_ENABLE_I18N
3541  /* Check the space of the arrays.  */
3542  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3543    {
3544      /* Not enough, realloc it.  */
3545      /* +1 in case of mbcset->nchar_classes is 0.  */
3546      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3547      /* Use realloc since array is NULL if *alloc == 0.  */
3548      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3549                                               new_char_class_alloc);
3550      if (BE (new_char_classes == NULL, 0))
3551        return REG_ESPACE;
3552      mbcset->char_classes = new_char_classes;
3553      *char_class_alloc = new_char_class_alloc;
3554    }
3555  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3556#endif /* RE_ENABLE_I18N */
3557
3558#define BUILD_CHARCLASS_LOOP(ctype_func)        \
3559  do {                                          \
3560    if (BE (trans != NULL, 0))                  \
3561      {                                         \
3562        for (i = 0; i < SBC_MAX; ++i)           \
3563          if (ctype_func (i))                   \
3564            bitset_set (sbcset, trans[i]);      \
3565      }                                         \
3566    else                                        \
3567      {                                         \
3568        for (i = 0; i < SBC_MAX; ++i)           \
3569          if (ctype_func (i))                   \
3570            bitset_set (sbcset, i);             \
3571      }                                         \
3572  } while (0)
3573
3574  if (strcmp (class_name, "alnum") == 0)
3575    BUILD_CHARCLASS_LOOP (isalnum);
3576  else if (strcmp (class_name, "cntrl") == 0)
3577    BUILD_CHARCLASS_LOOP (iscntrl);
3578  else if (strcmp (class_name, "lower") == 0)
3579    BUILD_CHARCLASS_LOOP (islower);
3580  else if (strcmp (class_name, "space") == 0)
3581    BUILD_CHARCLASS_LOOP (isspace);
3582  else if (strcmp (class_name, "alpha") == 0)
3583    BUILD_CHARCLASS_LOOP (isalpha);
3584  else if (strcmp (class_name, "digit") == 0)
3585    BUILD_CHARCLASS_LOOP (isdigit);
3586  else if (strcmp (class_name, "print") == 0)
3587    BUILD_CHARCLASS_LOOP (isprint);
3588  else if (strcmp (class_name, "upper") == 0)
3589    BUILD_CHARCLASS_LOOP (isupper);
3590  else if (strcmp (class_name, "blank") == 0)
3591#ifndef GAWK
3592    BUILD_CHARCLASS_LOOP (isblank);
3593#else
3594    /* see comments above */
3595    BUILD_CHARCLASS_LOOP (is_blank);
3596#endif
3597  else if (strcmp (class_name, "graph") == 0)
3598    BUILD_CHARCLASS_LOOP (isgraph);
3599  else if (strcmp (class_name, "punct") == 0)
3600    BUILD_CHARCLASS_LOOP (ispunct);
3601  else if (strcmp (class_name, "xdigit") == 0)
3602    BUILD_CHARCLASS_LOOP (isxdigit);
3603  else
3604    return REG_ECTYPE;
3605
3606  return REG_NOERROR;
3607}
3608
3609static bin_tree_t *
3610build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3611                    const char *class_name,
3612                    const char *extra, int non_match,
3613                    reg_errcode_t *err)
3614{
3615  re_bitset_ptr_t sbcset;
3616#ifdef RE_ENABLE_I18N
3617  re_charset_t *mbcset;
3618  int alloc = 0;
3619#endif /* not RE_ENABLE_I18N */
3620  reg_errcode_t ret;
3621  re_token_t br_token;
3622  bin_tree_t *tree;
3623
3624  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3625#ifdef RE_ENABLE_I18N
3626  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3627#endif /* RE_ENABLE_I18N */
3628
3629#ifdef RE_ENABLE_I18N
3630  if (BE (sbcset == NULL || mbcset == NULL, 0))
3631#else /* not RE_ENABLE_I18N */
3632  if (BE (sbcset == NULL, 0))
3633#endif /* not RE_ENABLE_I18N */
3634    {
3635      *err = REG_ESPACE;
3636      return NULL;
3637    }
3638
3639  if (non_match)
3640    {
3641#ifdef RE_ENABLE_I18N
3642      mbcset->non_match = 1;
3643#endif /* not RE_ENABLE_I18N */
3644    }
3645
3646  /* We don't care the syntax in this case.  */
3647  ret = build_charclass (trans, sbcset,
3648#ifdef RE_ENABLE_I18N
3649                         mbcset, &alloc,
3650#endif /* RE_ENABLE_I18N */
3651                         class_name, 0);
3652
3653  if (BE (ret != REG_NOERROR, 0))
3654    {
3655      re_free (sbcset);
3656#ifdef RE_ENABLE_I18N
3657      free_charset (mbcset);
3658#endif /* RE_ENABLE_I18N */
3659      *err = ret;
3660      return NULL;
3661    }
3662  /* \w match '_' also.  */
3663  for (; *extra; extra++)
3664    bitset_set (sbcset, *extra);
3665
3666  /* If it is non-matching list.  */
3667  if (non_match)
3668    bitset_not (sbcset);
3669
3670#ifdef RE_ENABLE_I18N
3671  /* Ensure only single byte characters are set.  */
3672  if (dfa->mb_cur_max > 1)
3673    bitset_mask (sbcset, dfa->sb_char);
3674#endif
3675
3676  /* Build a tree for simple bracket.  */
3677  br_token.type = SIMPLE_BRACKET;
3678  br_token.opr.sbcset = sbcset;
3679  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680  if (BE (tree == NULL, 0))
3681    goto build_word_op_espace;
3682
3683#ifdef RE_ENABLE_I18N
3684  if (dfa->mb_cur_max > 1)
3685    {
3686      bin_tree_t *mbc_tree;
3687      /* Build a tree for complex bracket.  */
3688      br_token.type = COMPLEX_BRACKET;
3689      br_token.opr.mbcset = mbcset;
3690      dfa->has_mb_node = 1;
3691      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3692      if (BE (mbc_tree == NULL, 0))
3693        goto build_word_op_espace;
3694      /* Then join them by ALT node.  */
3695      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3696      if (BE (mbc_tree != NULL, 1))
3697        return tree;
3698    }
3699  else
3700    {
3701      free_charset (mbcset);
3702      return tree;
3703    }
3704#else /* not RE_ENABLE_I18N */
3705  return tree;
3706#endif /* not RE_ENABLE_I18N */
3707
3708 build_word_op_espace:
3709  re_free (sbcset);
3710#ifdef RE_ENABLE_I18N
3711  free_charset (mbcset);
3712#endif /* RE_ENABLE_I18N */
3713  *err = REG_ESPACE;
3714  return NULL;
3715}
3716
3717/* This is intended for the expressions like "a{1,3}".
3718   Fetch a number from `input', and return the number.
3719   Return -1, if the number field is empty like "{,1}".
3720   Return -2, if an error has occurred.  */
3721
3722static int
3723fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3724{
3725  int num = -1;
3726  unsigned char c;
3727  while (1)
3728    {
3729      fetch_token (token, input, syntax);
3730      c = token->opr.c;
3731      if (BE (token->type == END_OF_RE, 0))
3732        return -2;
3733      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3734        break;
3735      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3736             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3737      num = (num > RE_DUP_MAX) ? -2 : num;
3738    }
3739  return num;
3740}
3741\f
3742#ifdef RE_ENABLE_I18N
3743static void
3744free_charset (re_charset_t *cset)
3745{
3746  re_free (cset->mbchars);
3747# ifdef _LIBC
3748  re_free (cset->coll_syms);
3749  re_free (cset->equiv_classes);
3750  re_free (cset->range_starts);
3751  re_free (cset->range_ends);
3752# endif
3753  re_free (cset->char_classes);
3754  re_free (cset);
3755}
3756#endif /* RE_ENABLE_I18N */
3757\f
3758/* Functions for binary tree operation.  */
3759
3760/* Create a tree node.  */
3761
3762static bin_tree_t *
3763create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764             re_token_type_t type)
3765{
3766  re_token_t t;
3767  t.type = type;
3768  return create_token_tree (dfa, left, right, &t);
3769}
3770
3771static bin_tree_t *
3772create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3773                   const re_token_t *token)
3774{
3775  bin_tree_t *tree;
3776  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3777    {
3778      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3779
3780      if (storage == NULL)
3781        return NULL;
3782      storage->next = dfa->str_tree_storage;
3783      dfa->str_tree_storage = storage;
3784      dfa->str_tree_storage_idx = 0;
3785    }
3786  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3787
3788  tree->parent = NULL;
3789  tree->left = left;
3790  tree->right = right;
3791  tree->token = *token;
3792  tree->token.duplicated = 0;
3793  tree->token.opt_subexp = 0;
3794  tree->first = NULL;
3795  tree->next = NULL;
3796  tree->node_idx = -1;
3797
3798  if (left != NULL)
3799    left->parent = tree;
3800  if (right != NULL)
3801    right->parent = tree;
3802  return tree;
3803}
3804
3805/* Mark the tree SRC as an optional subexpression.
3806   To be called from preorder or postorder.  */
3807
3808static reg_errcode_t
3809mark_opt_subexp (void *extra, bin_tree_t *node)
3810{
3811  int idx = (int) (intptr_t) extra;
3812  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3813    node->token.opt_subexp = 1;
3814
3815  return REG_NOERROR;
3816}
3817
3818/* Free the allocated memory inside NODE. */
3819
3820static void
3821free_token (re_token_t *node)
3822{
3823#ifdef RE_ENABLE_I18N
3824  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3825    free_charset (node->opr.mbcset);
3826  else
3827#endif /* RE_ENABLE_I18N */
3828    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3829      re_free (node->opr.sbcset);
3830}
3831
3832/* Worker function for tree walking.  Free the allocated memory inside NODE
3833   and its children. */
3834
3835static reg_errcode_t
3836free_tree (void *extra, bin_tree_t *node)
3837{
3838  free_token (&node->token);
3839  return REG_NOERROR;
3840}
3841
3842
3843/* Duplicate the node SRC, and return new node.  This is a preorder
3844   visit similar to the one implemented by the generic visitor, but
3845   we need more infrastructure to maintain two parallel trees --- so,
3846   it's easier to duplicate.  */
3847
3848static bin_tree_t *
3849duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3850{
3851  const bin_tree_t *node;
3852  bin_tree_t *dup_root;
3853  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3854
3855  for (node = root; ; )
3856    {
3857      /* Create a new tree and link it back to the current parent.  */
3858      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3859      if (*p_new == NULL)
3860        return NULL;
3861      (*p_new)->parent = dup_node;
3862      (*p_new)->token.duplicated = 1;
3863      dup_node = *p_new;
3864
3865      /* Go to the left node, or up and to the right.  */
3866      if (node->left)
3867        {
3868          node = node->left;
3869          p_new = &dup_node->left;
3870        }
3871      else
3872        {
3873          const bin_tree_t *prev = NULL;
3874          while (node->right == prev || node->right == NULL)
3875            {
3876              prev = node;
3877              node = node->parent;
3878              dup_node = dup_node->parent;
3879              if (!node)
3880                return dup_root;
3881            }
3882          node = node->right;
3883          p_new = &dup_node->right;
3884        }
3885    }
3886}