compat / regex / regexec.con commit compat/regex: use the regex engine from gawk for compat (d18f76d)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                     int n) internal_function;
  23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24static void match_ctx_free (re_match_context_t *cache) internal_function;
  25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                          int str_idx, int from, int to)
  27     internal_function;
  28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29     internal_function;
  30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                           int str_idx) internal_function;
  32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                   int node, int str_idx)
  34     internal_function;
  35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                           re_dfastate_t **limited_sts, int last_node,
  37                           int last_str_idx)
  38     internal_function;
  39static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                         const char *string, int length,
  41                                         int start, int range, int stop,
  42                                         size_t nmatch, regmatch_t pmatch[],
  43                                         int eflags) internal_function;
  44static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                             const char *string1, int length1,
  46                             const char *string2, int length2,
  47                             int start, int range, struct re_registers *regs,
  48                             int stop, int ret_len) internal_function;
  49static int re_search_stub (struct re_pattern_buffer *bufp,
  50                           const char *string, int length, int start,
  51                           int range, int stop, struct re_registers *regs,
  52                           int ret_len) internal_function;
  53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                              int nregs, int regs_allocated) internal_function;
  55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56     internal_function;
  57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                           int *p_match_first) internal_function;
  59static int check_halt_state_context (const re_match_context_t *mctx,
  60                                     const re_dfastate_t *state, int idx)
  61     internal_function;
  62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                         regmatch_t *prev_idx_match, int cur_node,
  64                         int cur_idx, int nmatch) internal_function;
  65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                      int str_idx, int dest_node, int nregs,
  67                                      regmatch_t *regs,
  68                                      re_node_set *eps_via_nodes)
  69     internal_function;
  70static reg_errcode_t set_regs (const regex_t *preg,
  71                               const re_match_context_t *mctx,
  72                               size_t nmatch, regmatch_t *pmatch,
  73                               int fl_backtrack) internal_function;
  74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75     internal_function;
  76
  77#ifdef RE_ENABLE_I18N
  78static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                re_sift_context_t *sctx,
  80                                int node_idx, int str_idx, int max_str_idx)
  81     internal_function;
  82#endif /* RE_ENABLE_I18N */
  83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                           re_sift_context_t *sctx)
  85     internal_function;
  86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                          re_sift_context_t *sctx, int str_idx,
  88                                          re_node_set *cur_dest)
  89     internal_function;
  90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                              re_sift_context_t *sctx,
  92                                              int str_idx,
  93                                              re_node_set *dest_nodes)
  94     internal_function;
  95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                            re_node_set *dest_nodes,
  97                                            const re_node_set *candidates)
  98     internal_function;
  99static int check_dst_limits (const re_match_context_t *mctx,
 100                             re_node_set *limits,
 101                             int dst_node, int dst_idx, int src_node,
 102                             int src_idx) internal_function;
 103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                        int boundaries, int subexp_idx,
 105                                        int from_node, int bkref_idx)
 106     internal_function;
 107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                      int limit, int subexp_idx,
 109                                      int node, int str_idx,
 110                                      int bkref_idx) internal_function;
 111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                          re_node_set *dest_nodes,
 113                                          const re_node_set *candidates,
 114                                          re_node_set *limits,
 115                                          struct re_backref_cache_entry *bkref_ents,
 116                                          int str_idx) internal_function;
 117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                        re_sift_context_t *sctx,
 119                                        int str_idx, const re_node_set *candidates)
 120     internal_function;
 121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                        re_dfastate_t **dst,
 123                                        re_dfastate_t **src, int num)
 124     internal_function;
 125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                         re_match_context_t *mctx) internal_function;
 127static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                     re_match_context_t *mctx,
 129                                     re_dfastate_t *state) internal_function;
 130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                            re_match_context_t *mctx,
 132                                            re_dfastate_t *next_state)
 133     internal_function;
 134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                re_node_set *cur_nodes,
 136                                                int str_idx) internal_function;
 137#if 0
 138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                        re_match_context_t *mctx,
 140                                        re_dfastate_t *pstate)
 141     internal_function;
 142#endif
 143#ifdef RE_ENABLE_I18N
 144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                       re_dfastate_t *pstate)
 146     internal_function;
 147#endif /* RE_ENABLE_I18N */
 148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                          const re_node_set *nodes)
 150     internal_function;
 151static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                 int bkref_node, int bkref_str_idx)
 153     internal_function;
 154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                     const re_sub_match_top_t *sub_top,
 156                                     re_sub_match_last_t *sub_last,
 157                                     int bkref_node, int bkref_str)
 158     internal_function;
 159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                             int subexp_idx, int type) internal_function;
 161static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                    state_array_t *path, int top_node,
 163                                    int top_str, int last_node, int last_str,
 164                                    int type) internal_function;
 165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                   int str_idx,
 167                                                   re_node_set *cur_nodes,
 168                                                   re_node_set *next_nodes)
 169     internal_function;
 170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                               re_node_set *cur_nodes,
 172                                               int ex_subexp, int type)
 173     internal_function;
 174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                   re_node_set *dst_nodes,
 176                                                   int target, int ex_subexp,
 177                                                   int type) internal_function;
 178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                         re_node_set *cur_nodes, int cur_str,
 180                                         int subexp_num, int type)
 181     internal_function;
 182static int build_trtable (const re_dfa_t *dfa,
 183                          re_dfastate_t *state) internal_function;
 184#ifdef RE_ENABLE_I18N
 185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                    const re_string_t *input, int idx)
 187     internal_function;
 188# ifdef _LIBC
 189static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                   size_t name_len)
 191     internal_function;
 192# endif /* _LIBC */
 193#endif /* RE_ENABLE_I18N */
 194static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                       const re_dfastate_t *state,
 196                                       re_node_set *states_node,
 197                                       bitset_t *states_ch) internal_function;
 198static int check_node_accept (const re_match_context_t *mctx,
 199                              const re_token_t *node, int idx)
 200     internal_function;
 201static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202     internal_function;
 203\f
 204/* Entry point for POSIX code.  */
 205
 206/* regexec searches for a given pattern, specified by PREG, in the
 207   string STRING.
 208
 209   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211   least NMATCH elements, and we set them to the offsets of the
 212   corresponding matched substrings.
 213
 214   EFLAGS specifies `execution flags' which affect matching: if
 215   REG_NOTBOL is set, then ^ does not match at the beginning of the
 216   string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218   We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220int
 221regexec (preg, string, nmatch, pmatch, eflags)
 222    const regex_t *__restrict preg;
 223    const char *__restrict string;
 224    size_t nmatch;
 225    regmatch_t pmatch[];
 226    int eflags;
 227{
 228  reg_errcode_t err;
 229  int start, length;
 230
 231  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 232    return REG_BADPAT;
 233
 234  if (eflags & REG_STARTEND)
 235    {
 236      start = pmatch[0].rm_so;
 237      length = pmatch[0].rm_eo;
 238    }
 239  else
 240    {
 241      start = 0;
 242      length = strlen (string);
 243    }
 244
 245  __libc_lock_lock (dfa->lock);
 246  if (preg->no_sub)
 247    err = re_search_internal (preg, string, length, start, length - start,
 248                              length, 0, NULL, eflags);
 249  else
 250    err = re_search_internal (preg, string, length, start, length - start,
 251                              length, nmatch, pmatch, eflags);
 252  __libc_lock_unlock (dfa->lock);
 253  return err != REG_NOERROR;
 254}
 255
 256#ifdef _LIBC
 257# include <shlib-compat.h>
 258versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 259
 260# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 261__typeof__ (__regexec) __compat_regexec;
 262
 263int
 264attribute_compat_text_section
 265__compat_regexec (const regex_t *__restrict preg,
 266                  const char *__restrict string, size_t nmatch,
 267                  regmatch_t pmatch[], int eflags)
 268{
 269  return regexec (preg, string, nmatch, pmatch,
 270                  eflags & (REG_NOTBOL | REG_NOTEOL));
 271}
 272compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 273# endif
 274#endif
 275
 276/* Entry points for GNU code.  */
 277
 278/* re_match, re_search, re_match_2, re_search_2
 279
 280   The former two functions operate on STRING with length LENGTH,
 281   while the later two operate on concatenation of STRING1 and STRING2
 282   with lengths LENGTH1 and LENGTH2, respectively.
 283
 284   re_match() matches the compiled pattern in BUFP against the string,
 285   starting at index START.
 286
 287   re_search() first tries matching at index START, then it tries to match
 288   starting from index START + 1, and so on.  The last start position tried
 289   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 290   way as re_match().)
 291
 292   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 293   the first STOP characters of the concatenation of the strings should be
 294   concerned.
 295
 296   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 297   and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 298   computed relative to the concatenation, not relative to the individual
 299   strings.)
 300
 301   On success, re_match* functions return the length of the match, re_search*
 302   return the position of the start of the match.  Return value -1 means no
 303   match was found and -2 indicates an internal error.  */
 304
 305int
 306re_match (bufp, string, length, start, regs)
 307    struct re_pattern_buffer *bufp;
 308    const char *string;
 309    int length, start;
 310    struct re_registers *regs;
 311{
 312  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 313}
 314#ifdef _LIBC
 315weak_alias (__re_match, re_match)
 316#endif
 317
 318int
 319re_search (bufp, string, length, start, range, regs)
 320    struct re_pattern_buffer *bufp;
 321    const char *string;
 322    int length, start, range;
 323    struct re_registers *regs;
 324{
 325  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 326}
 327#ifdef _LIBC
 328weak_alias (__re_search, re_search)
 329#endif
 330
 331int
 332re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 333    struct re_pattern_buffer *bufp;
 334    const char *string1, *string2;
 335    int length1, length2, start, stop;
 336    struct re_registers *regs;
 337{
 338  return re_search_2_stub (bufp, string1, length1, string2, length2,
 339                           start, 0, regs, stop, 1);
 340}
 341#ifdef _LIBC
 342weak_alias (__re_match_2, re_match_2)
 343#endif
 344
 345int
 346re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 347    struct re_pattern_buffer *bufp;
 348    const char *string1, *string2;
 349    int length1, length2, start, range, stop;
 350    struct re_registers *regs;
 351{
 352  return re_search_2_stub (bufp, string1, length1, string2, length2,
 353                           start, range, regs, stop, 0);
 354}
 355#ifdef _LIBC
 356weak_alias (__re_search_2, re_search_2)
 357#endif
 358
 359static int
 360re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 361                  stop, ret_len)
 362    struct re_pattern_buffer *bufp;
 363    const char *string1, *string2;
 364    int length1, length2, start, range, stop, ret_len;
 365    struct re_registers *regs;
 366{
 367  const char *str;
 368  int rval;
 369  int len = length1 + length2;
 370  int free_str = 0;
 371
 372  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 373    return -2;
 374
 375  /* Concatenate the strings.  */
 376  if (length2 > 0)
 377    if (length1 > 0)
 378      {
 379        char *s = re_malloc (char, len);
 380
 381        if (BE (s == NULL, 0))
 382          return -2;
 383        memcpy (s, string1, length1);
 384        memcpy (s + length1, string2, length2);
 385        str = s;
 386        free_str = 1;
 387      }
 388    else
 389      str = string2;
 390  else
 391    str = string1;
 392
 393  rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 394  if (free_str)
 395    re_free ((char *) str);
 396  return rval;
 397}
 398
 399/* The parameters have the same meaning as those of re_search.
 400   Additional parameters:
 401   If RET_LEN is nonzero the length of the match is returned (re_match style);
 402   otherwise the position of the match is returned.  */
 403
 404static int
 405re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 406    struct re_pattern_buffer *bufp;
 407    const char *string;
 408    int length, start, range, stop, ret_len;
 409    struct re_registers *regs;
 410{
 411  reg_errcode_t result;
 412  regmatch_t *pmatch;
 413  int nregs, rval;
 414  int eflags = 0;
 415
 416  /* Check for out-of-range.  */
 417  if (BE (start < 0 || start > length, 0))
 418    return -1;
 419  if (BE (start + range > length, 0))
 420    range = length - start;
 421  else if (BE (start + range < 0, 0))
 422    range = -start;
 423
 424  __libc_lock_lock (dfa->lock);
 425
 426  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 427  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 428
 429  /* Compile fastmap if we haven't yet.  */
 430  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 431    re_compile_fastmap (bufp);
 432
 433  if (BE (bufp->no_sub, 0))
 434    regs = NULL;
 435
 436  /* We need at least 1 register.  */
 437  if (regs == NULL)
 438    nregs = 1;
 439  else if (BE (bufp->regs_allocated == REGS_FIXED &&
 440               regs->num_regs < bufp->re_nsub + 1, 0))
 441    {
 442      nregs = regs->num_regs;
 443      if (BE (nregs < 1, 0))
 444        {
 445          /* Nothing can be copied to regs.  */
 446          regs = NULL;
 447          nregs = 1;
 448        }
 449    }
 450  else
 451    nregs = bufp->re_nsub + 1;
 452  pmatch = re_malloc (regmatch_t, nregs);
 453  if (BE (pmatch == NULL, 0))
 454    {
 455      rval = -2;
 456      goto out;
 457    }
 458
 459  result = re_search_internal (bufp, string, length, start, range, stop,
 460                               nregs, pmatch, eflags);
 461
 462  rval = 0;
 463
 464  /* I hope we needn't fill ther regs with -1's when no match was found.  */
 465  if (result != REG_NOERROR)
 466    rval = -1;
 467  else if (regs != NULL)
 468    {
 469      /* If caller wants register contents data back, copy them.  */
 470      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 471                                           bufp->regs_allocated);
 472      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 473        rval = -2;
 474    }
 475
 476  if (BE (rval == 0, 1))
 477    {
 478      if (ret_len)
 479        {
 480          assert (pmatch[0].rm_so == start);
 481          rval = pmatch[0].rm_eo - start;
 482        }
 483      else
 484        rval = pmatch[0].rm_so;
 485    }
 486  re_free (pmatch);
 487 out:
 488  __libc_lock_unlock (dfa->lock);
 489  return rval;
 490}
 491
 492static unsigned
 493re_copy_regs (regs, pmatch, nregs, regs_allocated)
 494    struct re_registers *regs;
 495    regmatch_t *pmatch;
 496    int nregs, regs_allocated;
 497{
 498  int rval = REGS_REALLOCATE;
 499  int i;
 500  int need_regs = nregs + 1;
 501  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 502     uses.  */
 503
 504  /* Have the register data arrays been allocated?  */
 505  if (regs_allocated == REGS_UNALLOCATED)
 506    { /* No.  So allocate them with malloc.  */
 507      regs->start = re_malloc (regoff_t, need_regs);
 508      if (BE (regs->start == NULL, 0))
 509        return REGS_UNALLOCATED;
 510      regs->end = re_malloc (regoff_t, need_regs);
 511      if (BE (regs->end == NULL, 0))
 512        {
 513          re_free (regs->start);
 514          return REGS_UNALLOCATED;
 515        }
 516      regs->num_regs = need_regs;
 517    }
 518  else if (regs_allocated == REGS_REALLOCATE)
 519    { /* Yes.  If we need more elements than were already
 520         allocated, reallocate them.  If we need fewer, just
 521         leave it alone.  */
 522      if (BE (need_regs > regs->num_regs, 0))
 523        {
 524          regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 525          regoff_t *new_end;
 526          if (BE (new_start == NULL, 0))
 527            return REGS_UNALLOCATED;
 528          new_end = re_realloc (regs->end, regoff_t, need_regs);
 529          if (BE (new_end == NULL, 0))
 530            {
 531              re_free (new_start);
 532              return REGS_UNALLOCATED;
 533            }
 534          regs->start = new_start;
 535          regs->end = new_end;
 536          regs->num_regs = need_regs;
 537        }
 538    }
 539  else
 540    {
 541      assert (regs_allocated == REGS_FIXED);
 542      /* This function may not be called with REGS_FIXED and nregs too big.  */
 543      assert (regs->num_regs >= nregs);
 544      rval = REGS_FIXED;
 545    }
 546
 547  /* Copy the regs.  */
 548  for (i = 0; i < nregs; ++i)
 549    {
 550      regs->start[i] = pmatch[i].rm_so;
 551      regs->end[i] = pmatch[i].rm_eo;
 552    }
 553  for ( ; i < regs->num_regs; ++i)
 554    regs->start[i] = regs->end[i] = -1;
 555
 556  return rval;
 557}
 558
 559/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 560   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 561   this memory for recording register information.  STARTS and ENDS
 562   must be allocated using the malloc library routine, and must each
 563   be at least NUM_REGS * sizeof (regoff_t) bytes long.
 564
 565   If NUM_REGS == 0, then subsequent matches should allocate their own
 566   register data.
 567
 568   Unless this function is called, the first search or match using
 569   PATTERN_BUFFER will allocate its own register data, without
 570   freeing the old data.  */
 571
 572void
 573re_set_registers (bufp, regs, num_regs, starts, ends)
 574    struct re_pattern_buffer *bufp;
 575    struct re_registers *regs;
 576    unsigned num_regs;
 577    regoff_t *starts, *ends;
 578{
 579  if (num_regs)
 580    {
 581      bufp->regs_allocated = REGS_REALLOCATE;
 582      regs->num_regs = num_regs;
 583      regs->start = starts;
 584      regs->end = ends;
 585    }
 586  else
 587    {
 588      bufp->regs_allocated = REGS_UNALLOCATED;
 589      regs->num_regs = 0;
 590      regs->start = regs->end = (regoff_t *) 0;
 591    }
 592}
 593#ifdef _LIBC
 594weak_alias (__re_set_registers, re_set_registers)
 595#endif
 596\f
 597/* Entry points compatible with 4.2 BSD regex library.  We don't define
 598   them unless specifically requested.  */
 599
 600#if defined _REGEX_RE_COMP || defined _LIBC
 601int
 602# ifdef _LIBC
 603weak_function
 604# endif
 605re_exec (s)
 606     const char *s;
 607{
 608  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 609}
 610#endif /* _REGEX_RE_COMP */
 611\f
 612/* Internal entry point.  */
 613
 614/* Searches for a compiled pattern PREG in the string STRING, whose
 615   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 616   mingings with regexec.  START, and RANGE have the same meanings
 617   with re_search.
 618   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 619   otherwise return the error code.
 620   Note: We assume front end functions already check ranges.
 621   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 622
 623static reg_errcode_t
 624re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 625                    eflags)
 626    const regex_t *preg;
 627    const char *string;
 628    int length, start, range, stop, eflags;
 629    size_t nmatch;
 630    regmatch_t pmatch[];
 631{
 632  reg_errcode_t err;
 633  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 634  int left_lim, right_lim, incr;
 635  int fl_longest_match, match_first, match_kind, match_last = -1;
 636  int extra_nmatch;
 637  int sb, ch;
 638#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 639  re_match_context_t mctx = { .dfa = dfa };
 640#else
 641  re_match_context_t mctx;
 642#endif
 643  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 644                   && range && !preg->can_be_null) ? preg->fastmap : NULL;
 645  RE_TRANSLATE_TYPE t = preg->translate;
 646
 647#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 648  memset (&mctx, '\0', sizeof (re_match_context_t));
 649  mctx.dfa = dfa;
 650#endif
 651
 652  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 653  nmatch -= extra_nmatch;
 654
 655  /* Check if the DFA haven't been compiled.  */
 656  if (BE (preg->used == 0 || dfa->init_state == NULL
 657          || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 658          || dfa->init_state_begbuf == NULL, 0))
 659    return REG_NOMATCH;
 660
 661#ifdef DEBUG
 662  /* We assume front-end functions already check them.  */
 663  assert (start + range >= 0 && start + range <= length);
 664#endif
 665
 666  /* If initial states with non-begbuf contexts have no elements,
 667     the regex must be anchored.  If preg->newline_anchor is set,
 668     we'll never use init_state_nl, so do not check it.  */
 669  if (dfa->init_state->nodes.nelem == 0
 670      && dfa->init_state_word->nodes.nelem == 0
 671      && (dfa->init_state_nl->nodes.nelem == 0
 672          || !preg->newline_anchor))
 673    {
 674      if (start != 0 && start + range != 0)
 675        return REG_NOMATCH;
 676      start = range = 0;
 677    }
 678
 679  /* We must check the longest matching, if nmatch > 0.  */
 680  fl_longest_match = (nmatch != 0 || dfa->nbackref);
 681
 682  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 683                            preg->translate, preg->syntax & RE_ICASE, dfa);
 684  if (BE (err != REG_NOERROR, 0))
 685    goto free_return;
 686  mctx.input.stop = stop;
 687  mctx.input.raw_stop = stop;
 688  mctx.input.newline_anchor = preg->newline_anchor;
 689
 690  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 691  if (BE (err != REG_NOERROR, 0))
 692    goto free_return;
 693
 694  /* We will log all the DFA states through which the dfa pass,
 695     if nmatch > 1, or this dfa has "multibyte node", which is a
 696     back-reference or a node which can accept multibyte character or
 697     multi character collating element.  */
 698  if (nmatch > 1 || dfa->has_mb_node)
 699    {
 700      /* Avoid overflow.  */
 701      if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 702        {
 703          err = REG_ESPACE;
 704          goto free_return;
 705        }
 706
 707      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 708      if (BE (mctx.state_log == NULL, 0))
 709        {
 710          err = REG_ESPACE;
 711          goto free_return;
 712        }
 713    }
 714  else
 715    mctx.state_log = NULL;
 716
 717  match_first = start;
 718  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 719                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 720
 721  /* Check incrementally whether of not the input string match.  */
 722  incr = (range < 0) ? -1 : 1;
 723  left_lim = (range < 0) ? start + range : start;
 724  right_lim = (range < 0) ? start : start + range;
 725  sb = dfa->mb_cur_max == 1;
 726  match_kind =
 727    (fastmap
 728     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 729        | (range >= 0 ? 2 : 0)
 730        | (t != NULL ? 1 : 0))
 731     : 8);
 732
 733  for (;; match_first += incr)
 734    {
 735      err = REG_NOMATCH;
 736      if (match_first < left_lim || right_lim < match_first)
 737        goto free_return;
 738
 739      /* Advance as rapidly as possible through the string, until we
 740         find a plausible place to start matching.  This may be done
 741         with varying efficiency, so there are various possibilities:
 742         only the most common of them are specialized, in order to
 743         save on code size.  We use a switch statement for speed.  */
 744      switch (match_kind)
 745        {
 746        case 8:
 747          /* No fastmap.  */
 748          break;
 749
 750        case 7:
 751          /* Fastmap with single-byte translation, match forward.  */
 752          while (BE (match_first < right_lim, 1)
 753                 && !fastmap[t[(unsigned char) string[match_first]]])
 754            ++match_first;
 755          goto forward_match_found_start_or_reached_end;
 756
 757        case 6:
 758          /* Fastmap without translation, match forward.  */
 759          while (BE (match_first < right_lim, 1)
 760                 && !fastmap[(unsigned char) string[match_first]])
 761            ++match_first;
 762
 763        forward_match_found_start_or_reached_end:
 764          if (BE (match_first == right_lim, 0))
 765            {
 766              ch = match_first >= length
 767                       ? 0 : (unsigned char) string[match_first];
 768              if (!fastmap[t ? t[ch] : ch])
 769                goto free_return;
 770            }
 771          break;
 772
 773        case 4:
 774        case 5:
 775          /* Fastmap without multi-byte translation, match backwards.  */
 776          while (match_first >= left_lim)
 777            {
 778              ch = match_first >= length
 779                       ? 0 : (unsigned char) string[match_first];
 780              if (fastmap[t ? t[ch] : ch])
 781                break;
 782              --match_first;
 783            }
 784          if (match_first < left_lim)
 785            goto free_return;
 786          break;
 787
 788        default:
 789          /* In this case, we can't determine easily the current byte,
 790             since it might be a component byte of a multibyte
 791             character.  Then we use the constructed buffer instead.  */
 792          for (;;)
 793            {
 794              /* If MATCH_FIRST is out of the valid range, reconstruct the
 795                 buffers.  */
 796              unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 797              if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 798                {
 799                  err = re_string_reconstruct (&mctx.input, match_first,
 800                                               eflags);
 801                  if (BE (err != REG_NOERROR, 0))
 802                    goto free_return;
 803
 804                  offset = match_first - mctx.input.raw_mbs_idx;
 805                }
 806              /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 807                 Note that MATCH_FIRST must not be smaller than 0.  */
 808              ch = (match_first >= length
 809                    ? 0 : re_string_byte_at (&mctx.input, offset));
 810              if (fastmap[ch])
 811                break;
 812              match_first += incr;
 813              if (match_first < left_lim || match_first > right_lim)
 814                {
 815                  err = REG_NOMATCH;
 816                  goto free_return;
 817                }
 818            }
 819          break;
 820        }
 821
 822      /* Reconstruct the buffers so that the matcher can assume that
 823         the matching starts from the beginning of the buffer.  */
 824      err = re_string_reconstruct (&mctx.input, match_first, eflags);
 825      if (BE (err != REG_NOERROR, 0))
 826        goto free_return;
 827
 828#ifdef RE_ENABLE_I18N
 829     /* Don't consider this char as a possible match start if it part,
 830        yet isn't the head, of a multibyte character.  */
 831      if (!sb && !re_string_first_byte (&mctx.input, 0))
 832        continue;
 833#endif
 834
 835      /* It seems to be appropriate one, then use the matcher.  */
 836      /* We assume that the matching starts from 0.  */
 837      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 838      match_last = check_matching (&mctx, fl_longest_match,
 839                                   range >= 0 ? &match_first : NULL);
 840      if (match_last != -1)
 841        {
 842          if (BE (match_last == -2, 0))
 843            {
 844              err = REG_ESPACE;
 845              goto free_return;
 846            }
 847          else
 848            {
 849              mctx.match_last = match_last;
 850              if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 851                {
 852                  re_dfastate_t *pstate = mctx.state_log[match_last];
 853                  mctx.last_node = check_halt_state_context (&mctx, pstate,
 854                                                             match_last);
 855                }
 856              if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 857                  || dfa->nbackref)
 858                {
 859                  err = prune_impossible_nodes (&mctx);
 860                  if (err == REG_NOERROR)
 861                    break;
 862                  if (BE (err != REG_NOMATCH, 0))
 863                    goto free_return;
 864                  match_last = -1;
 865                }
 866              else
 867                break; /* We found a match.  */
 868            }
 869        }
 870
 871      match_ctx_clean (&mctx);
 872    }
 873
 874#ifdef DEBUG
 875  assert (match_last != -1);
 876  assert (err == REG_NOERROR);
 877#endif
 878
 879  /* Set pmatch[] if we need.  */
 880  if (nmatch > 0)
 881    {
 882      int reg_idx;
 883
 884      /* Initialize registers.  */
 885      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 886        pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 887
 888      /* Set the points where matching start/end.  */
 889      pmatch[0].rm_so = 0;
 890      pmatch[0].rm_eo = mctx.match_last;
 891
 892      if (!preg->no_sub && nmatch > 1)
 893        {
 894          err = set_regs (preg, &mctx, nmatch, pmatch,
 895                          dfa->has_plural_match && dfa->nbackref > 0);
 896          if (BE (err != REG_NOERROR, 0))
 897            goto free_return;
 898        }
 899
 900      /* At last, add the offset to the each registers, since we slided
 901         the buffers so that we could assume that the matching starts
 902         from 0.  */
 903      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 904        if (pmatch[reg_idx].rm_so != -1)
 905          {
 906#ifdef RE_ENABLE_I18N
 907            if (BE (mctx.input.offsets_needed != 0, 0))
 908              {
 909                pmatch[reg_idx].rm_so =
 910                  (pmatch[reg_idx].rm_so == mctx.input.valid_len
 911                   ? mctx.input.valid_raw_len
 912                   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 913                pmatch[reg_idx].rm_eo =
 914                  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 915                   ? mctx.input.valid_raw_len
 916                   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 917              }
 918#else
 919            assert (mctx.input.offsets_needed == 0);
 920#endif
 921            pmatch[reg_idx].rm_so += match_first;
 922            pmatch[reg_idx].rm_eo += match_first;
 923          }
 924      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 925        {
 926          pmatch[nmatch + reg_idx].rm_so = -1;
 927          pmatch[nmatch + reg_idx].rm_eo = -1;
 928        }
 929
 930      if (dfa->subexp_map)
 931        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 932          if (dfa->subexp_map[reg_idx] != reg_idx)
 933            {
 934              pmatch[reg_idx + 1].rm_so
 935                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 936              pmatch[reg_idx + 1].rm_eo
 937                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 938            }
 939    }
 940
 941 free_return:
 942  re_free (mctx.state_log);
 943  if (dfa->nbackref)
 944    match_ctx_free (&mctx);
 945  re_string_destruct (&mctx.input);
 946  return err;
 947}
 948
 949static reg_errcode_t
 950prune_impossible_nodes (mctx)
 951     re_match_context_t *mctx;
 952{
 953  const re_dfa_t *const dfa = mctx->dfa;
 954  int halt_node, match_last;
 955  reg_errcode_t ret;
 956  re_dfastate_t **sifted_states;
 957  re_dfastate_t **lim_states = NULL;
 958  re_sift_context_t sctx;
 959#ifdef DEBUG
 960  assert (mctx->state_log != NULL);
 961#endif
 962  match_last = mctx->match_last;
 963  halt_node = mctx->last_node;
 964
 965  /* Avoid overflow.  */
 966  if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 967    return REG_ESPACE;
 968
 969  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 970  if (BE (sifted_states == NULL, 0))
 971    {
 972      ret = REG_ESPACE;
 973      goto free_return;
 974    }
 975  if (dfa->nbackref)
 976    {
 977      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 978      if (BE (lim_states == NULL, 0))
 979        {
 980          ret = REG_ESPACE;
 981          goto free_return;
 982        }
 983      while (1)
 984        {
 985          memset (lim_states, '\0',
 986                  sizeof (re_dfastate_t *) * (match_last + 1));
 987          sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 988                         match_last);
 989          ret = sift_states_backward (mctx, &sctx);
 990          re_node_set_free (&sctx.limits);
 991          if (BE (ret != REG_NOERROR, 0))
 992              goto free_return;
 993          if (sifted_states[0] != NULL || lim_states[0] != NULL)
 994            break;
 995          do
 996            {
 997              --match_last;
 998              if (match_last < 0)
 999                {
1000                  ret = REG_NOMATCH;
1001                  goto free_return;
1002                }
1003            } while (mctx->state_log[match_last] == NULL
1004                     || !mctx->state_log[match_last]->halt);
1005          halt_node = check_halt_state_context (mctx,
1006                                                mctx->state_log[match_last],
1007                                                match_last);
1008        }
1009      ret = merge_state_array (dfa, sifted_states, lim_states,
1010                               match_last + 1);
1011      re_free (lim_states);
1012      lim_states = NULL;
1013      if (BE (ret != REG_NOERROR, 0))
1014        goto free_return;
1015    }
1016  else
1017    {
1018      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019      ret = sift_states_backward (mctx, &sctx);
1020      re_node_set_free (&sctx.limits);
1021      if (BE (ret != REG_NOERROR, 0))
1022        goto free_return;
1023      if (sifted_states[0] == NULL)
1024        {
1025          ret = REG_NOMATCH;
1026          goto free_return;
1027        }
1028    }
1029  re_free (mctx->state_log);
1030  mctx->state_log = sifted_states;
1031  sifted_states = NULL;
1032  mctx->last_node = halt_node;
1033  mctx->match_last = match_last;
1034  ret = REG_NOERROR;
1035 free_return:
1036  re_free (sifted_states);
1037  re_free (lim_states);
1038  return ret;
1039}
1040
1041/* Acquire an initial state and return it.
1042   We must select appropriate initial state depending on the context,
1043   since initial states may have constraints like "\<", "^", etc..  */
1044
1045static inline re_dfastate_t *
1046__attribute ((always_inline)) internal_function
1047acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1048                            int idx)
1049{
1050  const re_dfa_t *const dfa = mctx->dfa;
1051  if (dfa->init_state->has_constraint)
1052    {
1053      unsigned int context;
1054      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1055      if (IS_WORD_CONTEXT (context))
1056        return dfa->init_state_word;
1057      else if (IS_ORDINARY_CONTEXT (context))
1058        return dfa->init_state;
1059      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1060        return dfa->init_state_begbuf;
1061      else if (IS_NEWLINE_CONTEXT (context))
1062        return dfa->init_state_nl;
1063      else if (IS_BEGBUF_CONTEXT (context))
1064        {
1065          /* It is relatively rare case, then calculate on demand.  */
1066          return re_acquire_state_context (err, dfa,
1067                                           dfa->init_state->entrance_nodes,
1068                                           context);
1069        }
1070      else
1071        /* Must not happen?  */
1072        return dfa->init_state;
1073    }
1074  else
1075    return dfa->init_state;
1076}
1077
1078/* Check whether the regular expression match input string INPUT or not,
1079   and return the index where the matching end, return -1 if not match,
1080   or return -2 in case of an error.
1081   FL_LONGEST_MATCH means we want the POSIX longest matching.
1082   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1083   next place where we may want to try matching.
1084   Note that the matcher assume that the maching starts from the current
1085   index of the buffer.  */
1086
1087static int
1088internal_function
1089check_matching (re_match_context_t *mctx, int fl_longest_match,
1090                int *p_match_first)
1091{
1092  const re_dfa_t *const dfa = mctx->dfa;
1093  reg_errcode_t err;
1094  int match = 0;
1095  int match_last = -1;
1096  int cur_str_idx = re_string_cur_idx (&mctx->input);
1097  re_dfastate_t *cur_state;
1098  int at_init_state = p_match_first != NULL;
1099  int next_start_idx = cur_str_idx;
1100
1101  err = REG_NOERROR;
1102  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1103  /* An initial state must not be NULL (invalid).  */
1104  if (BE (cur_state == NULL, 0))
1105    {
1106      assert (err == REG_ESPACE);
1107      return -2;
1108    }
1109
1110  if (mctx->state_log != NULL)
1111    {
1112      mctx->state_log[cur_str_idx] = cur_state;
1113
1114      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115         later.  E.g. Processing back references.  */
1116      if (BE (dfa->nbackref, 0))
1117        {
1118          at_init_state = 0;
1119          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1120          if (BE (err != REG_NOERROR, 0))
1121            return err;
1122
1123          if (cur_state->has_backref)
1124            {
1125              err = transit_state_bkref (mctx, &cur_state->nodes);
1126              if (BE (err != REG_NOERROR, 0))
1127                return err;
1128            }
1129        }
1130    }
1131
1132  /* If the RE accepts NULL string.  */
1133  if (BE (cur_state->halt, 0))
1134    {
1135      if (!cur_state->has_constraint
1136          || check_halt_state_context (mctx, cur_state, cur_str_idx))
1137        {
1138          if (!fl_longest_match)
1139            return cur_str_idx;
1140          else
1141            {
1142              match_last = cur_str_idx;
1143              match = 1;
1144            }
1145        }
1146    }
1147
1148  while (!re_string_eoi (&mctx->input))
1149    {
1150      re_dfastate_t *old_state = cur_state;
1151      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1152
1153      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1154          || (BE (next_char_idx >= mctx->input.valid_len, 0)
1155              && mctx->input.valid_len < mctx->input.len))
1156        {
1157          err = extend_buffers (mctx);
1158          if (BE (err != REG_NOERROR, 0))
1159            {
1160              assert (err == REG_ESPACE);
1161              return -2;
1162            }
1163        }
1164
1165      cur_state = transit_state (&err, mctx, cur_state);
1166      if (mctx->state_log != NULL)
1167        cur_state = merge_state_with_log (&err, mctx, cur_state);
1168
1169      if (cur_state == NULL)
1170        {
1171          /* Reached the invalid state or an error.  Try to recover a valid
1172             state using the state log, if available and if we have not
1173             already found a valid (even if not the longest) match.  */
1174          if (BE (err != REG_NOERROR, 0))
1175            return -2;
1176
1177          if (mctx->state_log == NULL
1178              || (match && !fl_longest_match)
1179              || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180            break;
1181        }
1182
1183      if (BE (at_init_state, 0))
1184        {
1185          if (old_state == cur_state)
1186            next_start_idx = next_char_idx;
1187          else
1188            at_init_state = 0;
1189        }
1190
1191      if (cur_state->halt)
1192        {
1193          /* Reached a halt state.
1194             Check the halt state can satisfy the current context.  */
1195          if (!cur_state->has_constraint
1196              || check_halt_state_context (mctx, cur_state,
1197                                           re_string_cur_idx (&mctx->input)))
1198            {
1199              /* We found an appropriate halt state.  */
1200              match_last = re_string_cur_idx (&mctx->input);
1201              match = 1;
1202
1203              /* We found a match, do not modify match_first below.  */
1204              p_match_first = NULL;
1205              if (!fl_longest_match)
1206                break;
1207            }
1208        }
1209    }
1210
1211  if (p_match_first)
1212    *p_match_first += next_start_idx;
1213
1214  return match_last;
1215}
1216
1217/* Check NODE match the current context.  */
1218
1219static int
1220internal_function
1221check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1222{
1223  re_token_type_t type = dfa->nodes[node].type;
1224  unsigned int constraint = dfa->nodes[node].constraint;
1225  if (type != END_OF_RE)
1226    return 0;
1227  if (!constraint)
1228    return 1;
1229  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1230    return 0;
1231  return 1;
1232}
1233
1234/* Check the halt state STATE match the current context.
1235   Return 0 if not match, if the node, STATE has, is a halt node and
1236   match the context, return the node.  */
1237
1238static int
1239internal_function
1240check_halt_state_context (const re_match_context_t *mctx,
1241                          const re_dfastate_t *state, int idx)
1242{
1243  int i;
1244  unsigned int context;
1245#ifdef DEBUG
1246  assert (state->halt);
1247#endif
1248  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1249  for (i = 0; i < state->nodes.nelem; ++i)
1250    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1251      return state->nodes.elems[i];
1252  return 0;
1253}
1254
1255/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1256   corresponding to the DFA).
1257   Return the destination node, and update EPS_VIA_NODES, return -1 in case
1258   of errors.  */
1259
1260static int
1261internal_function
1262proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1263                   int *pidx, int node, re_node_set *eps_via_nodes,
1264                   struct re_fail_stack_t *fs)
1265{
1266  const re_dfa_t *const dfa = mctx->dfa;
1267  int i, err;
1268  if (IS_EPSILON_NODE (dfa->nodes[node].type))
1269    {
1270      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1271      re_node_set *edests = &dfa->edests[node];
1272      int dest_node;
1273      err = re_node_set_insert (eps_via_nodes, node);
1274      if (BE (err < 0, 0))
1275        return -2;
1276      /* Pick up a valid destination, or return -1 if none is found.  */
1277      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1278        {
1279          int candidate = edests->elems[i];
1280          if (!re_node_set_contains (cur_nodes, candidate))
1281            continue;
1282          if (dest_node == -1)
1283            dest_node = candidate;
1284
1285          else
1286            {
1287              /* In order to avoid infinite loop like "(a*)*", return the second
1288                 epsilon-transition if the first was already considered.  */
1289              if (re_node_set_contains (eps_via_nodes, dest_node))
1290                return candidate;
1291
1292              /* Otherwise, push the second epsilon-transition on the fail stack.  */
1293              else if (fs != NULL
1294                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1295                                           eps_via_nodes))
1296                return -2;
1297
1298              /* We know we are going to exit.  */
1299              break;
1300            }
1301        }
1302      return dest_node;
1303    }
1304  else
1305    {
1306      int naccepted = 0;
1307      re_token_type_t type = dfa->nodes[node].type;
1308
1309#ifdef RE_ENABLE_I18N
1310      if (dfa->nodes[node].accept_mb)
1311        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1312      else
1313#endif /* RE_ENABLE_I18N */
1314      if (type == OP_BACK_REF)
1315        {
1316          int subexp_idx = dfa->nodes[node].opr.idx + 1;
1317          naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1318          if (fs != NULL)
1319            {
1320              if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1321                return -1;
1322              else if (naccepted)
1323                {
1324                  char *buf = (char *) re_string_get_buffer (&mctx->input);
1325                  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1326                              naccepted) != 0)
1327                    return -1;
1328                }
1329            }
1330
1331          if (naccepted == 0)
1332            {
1333              int dest_node;
1334              err = re_node_set_insert (eps_via_nodes, node);
1335              if (BE (err < 0, 0))
1336                return -2;
1337              dest_node = dfa->edests[node].elems[0];
1338              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1339                                        dest_node))
1340                return dest_node;
1341            }
1342        }
1343
1344      if (naccepted != 0
1345          || check_node_accept (mctx, dfa->nodes + node, *pidx))
1346        {
1347          int dest_node = dfa->nexts[node];
1348          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1349          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1350                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1351                                               dest_node)))
1352            return -1;
1353          re_node_set_empty (eps_via_nodes);
1354          return dest_node;
1355        }
1356    }
1357  return -1;
1358}
1359
1360static reg_errcode_t
1361internal_function
1362push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1363                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1364{
1365  reg_errcode_t err;
1366  int num = fs->num++;
1367  if (fs->num == fs->alloc)
1368    {
1369      struct re_fail_stack_ent_t *new_array;
1370      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1371                                       * fs->alloc * 2));
1372      if (new_array == NULL)
1373        return REG_ESPACE;
1374      fs->alloc *= 2;
1375      fs->stack = new_array;
1376    }
1377  fs->stack[num].idx = str_idx;
1378  fs->stack[num].node = dest_node;
1379  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1380  if (fs->stack[num].regs == NULL)
1381    return REG_ESPACE;
1382  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1383  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1384  return err;
1385}
1386
1387static int
1388internal_function
1389pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1390                regmatch_t *regs, re_node_set *eps_via_nodes)
1391{
1392  int num = --fs->num;
1393  assert (num >= 0);
1394  *pidx = fs->stack[num].idx;
1395  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1396  re_node_set_free (eps_via_nodes);
1397  re_free (fs->stack[num].regs);
1398  *eps_via_nodes = fs->stack[num].eps_via_nodes;
1399  return fs->stack[num].node;
1400}
1401
1402/* Set the positions where the subexpressions are starts/ends to registers
1403   PMATCH.
1404   Note: We assume that pmatch[0] is already set, and
1405   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1406
1407static reg_errcode_t
1408internal_function
1409set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1410          regmatch_t *pmatch, int fl_backtrack)
1411{
1412  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1413  int idx, cur_node;
1414  re_node_set eps_via_nodes;
1415  struct re_fail_stack_t *fs;
1416  struct re_fail_stack_t fs_body = { 0, 2, NULL };
1417  regmatch_t *prev_idx_match;
1418  int prev_idx_match_malloced = 0;
1419
1420#ifdef DEBUG
1421  assert (nmatch > 1);
1422  assert (mctx->state_log != NULL);
1423#endif
1424  if (fl_backtrack)
1425    {
1426      fs = &fs_body;
1427      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1428      if (fs->stack == NULL)
1429        return REG_ESPACE;
1430    }
1431  else
1432    fs = NULL;
1433
1434  cur_node = dfa->init_node;
1435  re_node_set_init_empty (&eps_via_nodes);
1436
1437#ifdef HAVE_ALLOCA
1438  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1439    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440  else
1441#endif
1442    {
1443      prev_idx_match = re_malloc (regmatch_t, nmatch);
1444      if (prev_idx_match == NULL)
1445        {
1446          free_fail_stack_return (fs);
1447          return REG_ESPACE;
1448        }
1449      prev_idx_match_malloced = 1;
1450    }
1451  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1452
1453  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1454    {
1455      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1456
1457      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1458        {
1459          int reg_idx;
1460          if (fs)
1461            {
1462              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1463                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1464                  break;
1465              if (reg_idx == nmatch)
1466                {
1467                  re_node_set_free (&eps_via_nodes);
1468                  if (prev_idx_match_malloced)
1469                    re_free (prev_idx_match);
1470                  return free_fail_stack_return (fs);
1471                }
1472              cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1473                                         &eps_via_nodes);
1474            }
1475          else
1476            {
1477              re_node_set_free (&eps_via_nodes);
1478              if (prev_idx_match_malloced)
1479                re_free (prev_idx_match);
1480              return REG_NOERROR;
1481            }
1482        }
1483
1484      /* Proceed to next node.  */
1485      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1486                                    &eps_via_nodes, fs);
1487
1488      if (BE (cur_node < 0, 0))
1489        {
1490          if (BE (cur_node == -2, 0))
1491            {
1492              re_node_set_free (&eps_via_nodes);
1493              if (prev_idx_match_malloced)
1494                re_free (prev_idx_match);
1495              free_fail_stack_return (fs);
1496              return REG_ESPACE;
1497            }
1498          if (fs)
1499            cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500                                       &eps_via_nodes);
1501          else
1502            {
1503              re_node_set_free (&eps_via_nodes);
1504              if (prev_idx_match_malloced)
1505                re_free (prev_idx_match);
1506              return REG_NOMATCH;
1507            }
1508        }
1509    }
1510  re_node_set_free (&eps_via_nodes);
1511  if (prev_idx_match_malloced)
1512    re_free (prev_idx_match);
1513  return free_fail_stack_return (fs);
1514}
1515
1516static reg_errcode_t
1517internal_function
1518free_fail_stack_return (struct re_fail_stack_t *fs)
1519{
1520  if (fs)
1521    {
1522      int fs_idx;
1523      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1524        {
1525          re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1526          re_free (fs->stack[fs_idx].regs);
1527        }
1528      re_free (fs->stack);
1529    }
1530  return REG_NOERROR;
1531}
1532
1533static void
1534internal_function
1535update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1536             regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1537{
1538  int type = dfa->nodes[cur_node].type;
1539  if (type == OP_OPEN_SUBEXP)
1540    {
1541      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542
1543      /* We are at the first node of this sub expression.  */
1544      if (reg_num < nmatch)
1545        {
1546          pmatch[reg_num].rm_so = cur_idx;
1547          pmatch[reg_num].rm_eo = -1;
1548        }
1549    }
1550  else if (type == OP_CLOSE_SUBEXP)
1551    {
1552      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1553      if (reg_num < nmatch)
1554        {
1555          /* We are at the last node of this sub expression.  */
1556          if (pmatch[reg_num].rm_so < cur_idx)
1557            {
1558              pmatch[reg_num].rm_eo = cur_idx;
1559              /* This is a non-empty match or we are not inside an optional
1560                 subexpression.  Accept this right away.  */
1561              memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562            }
1563          else
1564            {
1565              if (dfa->nodes[cur_node].opt_subexp
1566                  && prev_idx_match[reg_num].rm_so != -1)
1567                /* We transited through an empty match for an optional
1568                   subexpression, like (a?)*, and this is not the subexp's
1569                   first match.  Copy back the old content of the registers
1570                   so that matches of an inner subexpression are undone as
1571                   well, like in ((a?))*.  */
1572                memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1573              else
1574                /* We completed a subexpression, but it may be part of
1575                   an optional one, so do not update PREV_IDX_MATCH.  */
1576                pmatch[reg_num].rm_eo = cur_idx;
1577            }
1578        }
1579    }
1580}
1581
1582/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1583   and sift the nodes in each states according to the following rules.
1584   Updated state_log will be wrote to STATE_LOG.
1585
1586   Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1587     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1588        If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1589        the LAST_NODE, we throw away the node `a'.
1590     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1591        string `s' and transit to `b':
1592        i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1593           away the node `a'.
1594        ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1595            thrown away, we throw away the node `a'.
1596     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1597        i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1598           node `a'.
1599        ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1600            we throw away the node `a'.  */
1601
1602#define STATE_NODE_CONTAINS(state,node) \
1603  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1604
1605static reg_errcode_t
1606internal_function
1607sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608{
1609  reg_errcode_t err;
1610  int null_cnt = 0;
1611  int str_idx = sctx->last_str_idx;
1612  re_node_set cur_dest;
1613
1614#ifdef DEBUG
1615  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1616#endif
1617
1618  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1619     transit to the last_node and the last_node itself.  */
1620  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1621  if (BE (err != REG_NOERROR, 0))
1622    return err;
1623  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1624  if (BE (err != REG_NOERROR, 0))
1625    goto free_return;
1626
1627  /* Then check each states in the state_log.  */
1628  while (str_idx > 0)
1629    {
1630      /* Update counters.  */
1631      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1632      if (null_cnt > mctx->max_mb_elem_len)
1633        {
1634          memset (sctx->sifted_states, '\0',
1635                  sizeof (re_dfastate_t *) * str_idx);
1636          re_node_set_free (&cur_dest);
1637          return REG_NOERROR;
1638        }
1639      re_node_set_empty (&cur_dest);
1640      --str_idx;
1641
1642      if (mctx->state_log[str_idx])
1643        {
1644          err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1645          if (BE (err != REG_NOERROR, 0))
1646            goto free_return;
1647        }
1648
1649      /* Add all the nodes which satisfy the following conditions:
1650         - It can epsilon transit to a node in CUR_DEST.
1651         - It is in CUR_SRC.
1652         And update state_log.  */
1653      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1654      if (BE (err != REG_NOERROR, 0))
1655        goto free_return;
1656    }
1657  err = REG_NOERROR;
1658 free_return:
1659  re_node_set_free (&cur_dest);
1660  return err;
1661}
1662
1663static reg_errcode_t
1664internal_function
1665build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1666                     int str_idx, re_node_set *cur_dest)
1667{
1668  const re_dfa_t *const dfa = mctx->dfa;
1669  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1670  int i;
1671
1672  /* Then build the next sifted state.
1673     We build the next sifted state on `cur_dest', and update
1674     `sifted_states[str_idx]' with `cur_dest'.
1675     Note:
1676     `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1677     `cur_src' points the node_set of the old `state_log[str_idx]'
1678     (with the epsilon nodes pre-filtered out).  */
1679  for (i = 0; i < cur_src->nelem; i++)
1680    {
1681      int prev_node = cur_src->elems[i];
1682      int naccepted = 0;
1683      int ret;
1684
1685#ifdef DEBUG
1686      re_token_type_t type = dfa->nodes[prev_node].type;
1687      assert (!IS_EPSILON_NODE (type));
1688#endif
1689#ifdef RE_ENABLE_I18N
1690      /* If the node may accept `multi byte'.  */
1691      if (dfa->nodes[prev_node].accept_mb)
1692        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1693                                         str_idx, sctx->last_str_idx);
1694#endif /* RE_ENABLE_I18N */
1695
1696      /* We don't check backreferences here.
1697         See update_cur_sifted_state().  */
1698      if (!naccepted
1699          && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1700          && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1701                                  dfa->nexts[prev_node]))
1702        naccepted = 1;
1703
1704      if (naccepted == 0)
1705        continue;
1706
1707      if (sctx->limits.nelem)
1708        {
1709          int to_idx = str_idx + naccepted;
1710          if (check_dst_limits (mctx, &sctx->limits,
1711                                dfa->nexts[prev_node], to_idx,
1712                                prev_node, str_idx))
1713            continue;
1714        }
1715      ret = re_node_set_insert (cur_dest, prev_node);
1716      if (BE (ret == -1, 0))
1717        return REG_ESPACE;
1718    }
1719
1720  return REG_NOERROR;
1721}
1722
1723/* Helper functions.  */
1724
1725static reg_errcode_t
1726internal_function
1727clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1728{
1729  int top = mctx->state_log_top;
1730
1731  if (next_state_log_idx >= mctx->input.bufs_len
1732      || (next_state_log_idx >= mctx->input.valid_len
1733          && mctx->input.valid_len < mctx->input.len))
1734    {
1735      reg_errcode_t err;
1736      err = extend_buffers (mctx);
1737      if (BE (err != REG_NOERROR, 0))
1738        return err;
1739    }
1740
1741  if (top < next_state_log_idx)
1742    {
1743      memset (mctx->state_log + top + 1, '\0',
1744              sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1745      mctx->state_log_top = next_state_log_idx;
1746    }
1747  return REG_NOERROR;
1748}
1749
1750static reg_errcode_t
1751internal_function
1752merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1753                   re_dfastate_t **src, int num)
1754{
1755  int st_idx;
1756  reg_errcode_t err;
1757  for (st_idx = 0; st_idx < num; ++st_idx)
1758    {
1759      if (dst[st_idx] == NULL)
1760        dst[st_idx] = src[st_idx];
1761      else if (src[st_idx] != NULL)
1762        {
1763          re_node_set merged_set;
1764          err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1765                                        &src[st_idx]->nodes);
1766          if (BE (err != REG_NOERROR, 0))
1767            return err;
1768          dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1769          re_node_set_free (&merged_set);
1770          if (BE (err != REG_NOERROR, 0))
1771            return err;
1772        }
1773    }
1774  return REG_NOERROR;
1775}
1776
1777static reg_errcode_t
1778internal_function
1779update_cur_sifted_state (const re_match_context_t *mctx,
1780                         re_sift_context_t *sctx, int str_idx,
1781                         re_node_set *dest_nodes)
1782{
1783  const re_dfa_t *const dfa = mctx->dfa;
1784  reg_errcode_t err = REG_NOERROR;
1785  const re_node_set *candidates;
1786  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1787                : &mctx->state_log[str_idx]->nodes);
1788
1789  if (dest_nodes->nelem == 0)
1790    sctx->sifted_states[str_idx] = NULL;
1791  else
1792    {
1793      if (candidates)
1794        {
1795          /* At first, add the nodes which can epsilon transit to a node in
1796             DEST_NODE.  */
1797          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1798          if (BE (err != REG_NOERROR, 0))
1799            return err;
1800
1801          /* Then, check the limitations in the current sift_context.  */
1802          if (sctx->limits.nelem)
1803            {
1804              err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1805                                         mctx->bkref_ents, str_idx);
1806              if (BE (err != REG_NOERROR, 0))
1807                return err;
1808            }
1809        }
1810
1811      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1812      if (BE (err != REG_NOERROR, 0))
1813        return err;
1814    }
1815
1816  if (candidates && mctx->state_log[str_idx]->has_backref)
1817    {
1818      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1819      if (BE (err != REG_NOERROR, 0))
1820        return err;
1821    }
1822  return REG_NOERROR;
1823}
1824
1825static reg_errcode_t
1826internal_function
1827add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1828                       const re_node_set *candidates)
1829{
1830  reg_errcode_t err = REG_NOERROR;
1831  int i;
1832
1833  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1834  if (BE (err != REG_NOERROR, 0))
1835    return err;
1836
1837  if (!state->inveclosure.alloc)
1838    {
1839      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1840      if (BE (err != REG_NOERROR, 0))
1841        return REG_ESPACE;
1842      for (i = 0; i < dest_nodes->nelem; i++)
1843        {
1844          err = re_node_set_merge (&state->inveclosure,
1845                                   dfa->inveclosures + dest_nodes->elems[i]);
1846          if (BE (err != REG_NOERROR, 0))
1847            return REG_ESPACE;
1848        }
1849    }
1850  return re_node_set_add_intersect (dest_nodes, candidates,
1851                                    &state->inveclosure);
1852}
1853
1854static reg_errcode_t
1855internal_function
1856sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1857                       const re_node_set *candidates)
1858{
1859    int ecl_idx;
1860    reg_errcode_t err;
1861    re_node_set *inv_eclosure = dfa->inveclosures + node;
1862    re_node_set except_nodes;
1863    re_node_set_init_empty (&except_nodes);
1864    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1865      {
1866        int cur_node = inv_eclosure->elems[ecl_idx];
1867        if (cur_node == node)
1868          continue;
1869        if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1870          {
1871            int edst1 = dfa->edests[cur_node].elems[0];
1872            int edst2 = ((dfa->edests[cur_node].nelem > 1)
1873                         ? dfa->edests[cur_node].elems[1] : -1);
1874            if ((!re_node_set_contains (inv_eclosure, edst1)
1875                 && re_node_set_contains (dest_nodes, edst1))
1876                || (edst2 > 0
1877                    && !re_node_set_contains (inv_eclosure, edst2)
1878                    && re_node_set_contains (dest_nodes, edst2)))
1879              {
1880                err = re_node_set_add_intersect (&except_nodes, candidates,
1881                                                 dfa->inveclosures + cur_node);
1882                if (BE (err != REG_NOERROR, 0))
1883                  {
1884                    re_node_set_free (&except_nodes);
1885                    return err;
1886                  }
1887              }
1888          }
1889      }
1890    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1891      {
1892        int cur_node = inv_eclosure->elems[ecl_idx];
1893        if (!re_node_set_contains (&except_nodes, cur_node))
1894          {
1895            int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1896            re_node_set_remove_at (dest_nodes, idx);
1897          }
1898      }
1899    re_node_set_free (&except_nodes);
1900    return REG_NOERROR;
1901}
1902
1903static int
1904internal_function
1905check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1906                  int dst_node, int dst_idx, int src_node, int src_idx)
1907{
1908  const re_dfa_t *const dfa = mctx->dfa;
1909  int lim_idx, src_pos, dst_pos;
1910
1911  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1912  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1913  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1914    {
1915      int subexp_idx;
1916      struct re_backref_cache_entry *ent;
1917      ent = mctx->bkref_ents + limits->elems[lim_idx];
1918      subexp_idx = dfa->nodes[ent->node].opr.idx;
1919
1920      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1921                                           subexp_idx, dst_node, dst_idx,
1922                                           dst_bkref_idx);
1923      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1924                                           subexp_idx, src_node, src_idx,
1925                                           src_bkref_idx);
1926
1927      /* In case of:
1928         <src> <dst> ( <subexp> )
1929         ( <subexp> ) <src> <dst>
1930         ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1931      if (src_pos == dst_pos)
1932        continue; /* This is unrelated limitation.  */
1933      else
1934        return 1;
1935    }
1936  return 0;
1937}
1938
1939static int
1940internal_function
1941check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1942                             int subexp_idx, int from_node, int bkref_idx)
1943{
1944  const re_dfa_t *const dfa = mctx->dfa;
1945  const re_node_set *eclosures = dfa->eclosures + from_node;
1946  int node_idx;
1947
1948  /* Else, we are on the boundary: examine the nodes on the epsilon
1949     closure.  */
1950  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1951    {
1952      int node = eclosures->elems[node_idx];
1953      switch (dfa->nodes[node].type)
1954        {
1955        case OP_BACK_REF:
1956          if (bkref_idx != -1)
1957            {
1958              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1959              do
1960                {
1961                  int dst, cpos;
1962
1963                  if (ent->node != node)
1964                    continue;
1965
1966                  if (subexp_idx < BITSET_WORD_BITS
1967                      && !(ent->eps_reachable_subexps_map
1968                           & ((bitset_word_t) 1 << subexp_idx)))
1969                    continue;
1970
1971                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1972                     OP_CLOSE_SUBEXP cases below.  But, if the
1973                     destination node is the same node as the source
1974                     node, don't recurse because it would cause an
1975                     infinite loop: a regex that exhibits this behavior
1976                     is ()\1*\1*  */
1977                  dst = dfa->edests[node].elems[0];
1978                  if (dst == from_node)
1979                    {
1980                      if (boundaries & 1)
1981                        return -1;
1982                      else /* if (boundaries & 2) */
1983                        return 0;
1984                    }
1985
1986                  cpos =
1987                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988                                                 dst, bkref_idx);
1989                  if (cpos == -1 /* && (boundaries & 1) */)
1990                    return -1;
1991                  if (cpos == 0 && (boundaries & 2))
1992                    return 0;
1993
1994                  if (subexp_idx < BITSET_WORD_BITS)
1995                    ent->eps_reachable_subexps_map
1996                      &= ~((bitset_word_t) 1 << subexp_idx);
1997                }
1998              while (ent++->more);
1999            }
2000          break;
2001
2002        case OP_OPEN_SUBEXP:
2003          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2004            return -1;
2005          break;
2006
2007        case OP_CLOSE_SUBEXP:
2008          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2009            return 0;
2010          break;
2011
2012        default:
2013            break;
2014        }
2015    }
2016
2017  return (boundaries & 2) ? 1 : 0;
2018}
2019
2020static int
2021internal_function
2022check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2023                           int subexp_idx, int from_node, int str_idx,
2024                           int bkref_idx)
2025{
2026  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2027  int boundaries;
2028
2029  /* If we are outside the range of the subexpression, return -1 or 1.  */
2030  if (str_idx < lim->subexp_from)
2031    return -1;
2032
2033  if (lim->subexp_to < str_idx)
2034    return 1;
2035
2036  /* If we are within the subexpression, return 0.  */
2037  boundaries = (str_idx == lim->subexp_from);
2038  boundaries |= (str_idx == lim->subexp_to) << 1;
2039  if (boundaries == 0)
2040    return 0;
2041
2042  /* Else, examine epsilon closure.  */
2043  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2044                                      from_node, bkref_idx);
2045}
2046
2047/* Check the limitations of sub expressions LIMITS, and remove the nodes
2048   which are against limitations from DEST_NODES. */
2049
2050static reg_errcode_t
2051internal_function
2052check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2053                     const re_node_set *candidates, re_node_set *limits,
2054                     struct re_backref_cache_entry *bkref_ents, int str_idx)
2055{
2056  reg_errcode_t err;
2057  int node_idx, lim_idx;
2058
2059  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2060    {
2061      int subexp_idx;
2062      struct re_backref_cache_entry *ent;
2063      ent = bkref_ents + limits->elems[lim_idx];
2064
2065      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2066        continue; /* This is unrelated limitation.  */
2067
2068      subexp_idx = dfa->nodes[ent->node].opr.idx;
2069      if (ent->subexp_to == str_idx)
2070        {
2071          int ops_node = -1;
2072          int cls_node = -1;
2073          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074            {
2075              int node = dest_nodes->elems[node_idx];
2076              re_token_type_t type = dfa->nodes[node].type;
2077              if (type == OP_OPEN_SUBEXP
2078                  && subexp_idx == dfa->nodes[node].opr.idx)
2079                ops_node = node;
2080              else if (type == OP_CLOSE_SUBEXP
2081                       && subexp_idx == dfa->nodes[node].opr.idx)
2082                cls_node = node;
2083            }
2084
2085          /* Check the limitation of the open subexpression.  */
2086          /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2087          if (ops_node >= 0)
2088            {
2089              err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2090                                           candidates);
2091              if (BE (err != REG_NOERROR, 0))
2092                return err;
2093            }
2094
2095          /* Check the limitation of the close subexpression.  */
2096          if (cls_node >= 0)
2097            for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098              {
2099                int node = dest_nodes->elems[node_idx];
2100                if (!re_node_set_contains (dfa->inveclosures + node,
2101                                           cls_node)
2102                    && !re_node_set_contains (dfa->eclosures + node,
2103                                              cls_node))
2104                  {
2105                    /* It is against this limitation.
2106                       Remove it form the current sifted state.  */
2107                    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2108                                                 candidates);
2109                    if (BE (err != REG_NOERROR, 0))
2110                      return err;
2111                    --node_idx;
2112                  }
2113              }
2114        }
2115      else /* (ent->subexp_to != str_idx)  */
2116        {
2117          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2118            {
2119              int node = dest_nodes->elems[node_idx];
2120              re_token_type_t type = dfa->nodes[node].type;
2121              if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2122                {
2123                  if (subexp_idx != dfa->nodes[node].opr.idx)
2124                    continue;
2125                  /* It is against this limitation.
2126                     Remove it form the current sifted state.  */
2127                  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2128                                               candidates);
2129                  if (BE (err != REG_NOERROR, 0))
2130                    return err;
2131                }
2132            }
2133        }
2134    }
2135  return REG_NOERROR;
2136}
2137
2138static reg_errcode_t
2139internal_function
2140sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2141                   int str_idx, const re_node_set *candidates)
2142{
2143  const re_dfa_t *const dfa = mctx->dfa;
2144  reg_errcode_t err;
2145  int node_idx, node;
2146  re_sift_context_t local_sctx;
2147  int first_idx = search_cur_bkref_entry (mctx, str_idx);
2148
2149  if (first_idx == -1)
2150    return REG_NOERROR;
2151
2152  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2153
2154  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2155    {
2156      int enabled_idx;
2157      re_token_type_t type;
2158      struct re_backref_cache_entry *entry;
2159      node = candidates->elems[node_idx];
2160      type = dfa->nodes[node].type;
2161      /* Avoid infinite loop for the REs like "()\1+".  */
2162      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2163        continue;
2164      if (type != OP_BACK_REF)
2165        continue;
2166
2167      entry = mctx->bkref_ents + first_idx;
2168      enabled_idx = first_idx;
2169      do
2170        {
2171          int subexp_len;
2172          int to_idx;
2173          int dst_node;
2174          int ret;
2175          re_dfastate_t *cur_state;
2176
2177          if (entry->node != node)
2178            continue;
2179          subexp_len = entry->subexp_to - entry->subexp_from;
2180          to_idx = str_idx + subexp_len;
2181          dst_node = (subexp_len ? dfa->nexts[node]
2182                      : dfa->edests[node].elems[0]);
2183
2184          if (to_idx > sctx->last_str_idx
2185              || sctx->sifted_states[to_idx] == NULL
2186              || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2187              || check_dst_limits (mctx, &sctx->limits, node,
2188                                   str_idx, dst_node, to_idx))
2189            continue;
2190
2191          if (local_sctx.sifted_states == NULL)
2192            {
2193              local_sctx = *sctx;
2194              err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2195              if (BE (err != REG_NOERROR, 0))
2196                goto free_return;
2197            }
2198          local_sctx.last_node = node;
2199          local_sctx.last_str_idx = str_idx;
2200          ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2201          if (BE (ret < 0, 0))
2202            {
2203              err = REG_ESPACE;
2204              goto free_return;
2205            }
2206          cur_state = local_sctx.sifted_states[str_idx];
2207          err = sift_states_backward (mctx, &local_sctx);
2208          if (BE (err != REG_NOERROR, 0))
2209            goto free_return;
2210          if (sctx->limited_states != NULL)
2211            {
2212              err = merge_state_array (dfa, sctx->limited_states,
2213                                       local_sctx.sifted_states,
2214                                       str_idx + 1);
2215              if (BE (err != REG_NOERROR, 0))
2216                goto free_return;
2217            }
2218          local_sctx.sifted_states[str_idx] = cur_state;
2219          re_node_set_remove (&local_sctx.limits, enabled_idx);
2220
2221          /* mctx->bkref_ents may have changed, reload the pointer.  */
2222          entry = mctx->bkref_ents + enabled_idx;
2223        }
2224      while (enabled_idx++, entry++->more);
2225    }
2226  err = REG_NOERROR;
2227 free_return:
2228  if (local_sctx.sifted_states != NULL)
2229    {
2230      re_node_set_free (&local_sctx.limits);
2231    }
2232
2233  return err;
2234}
2235
2236
2237#ifdef RE_ENABLE_I18N
2238static int
2239internal_function
2240sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2241                     int node_idx, int str_idx, int max_str_idx)
2242{
2243  const re_dfa_t *const dfa = mctx->dfa;
2244  int naccepted;
2245  /* Check the node can accept `multi byte'.  */
2246  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2247  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2248      !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2249                            dfa->nexts[node_idx]))
2250    /* The node can't accept the `multi byte', or the
2251       destination was already thrown away, then the node
2252       could't accept the current input `multi byte'.   */
2253    naccepted = 0;
2254  /* Otherwise, it is sure that the node could accept
2255     `naccepted' bytes input.  */
2256  return naccepted;
2257}
2258#endif /* RE_ENABLE_I18N */
2259
2260\f
2261/* Functions for state transition.  */
2262
2263/* Return the next state to which the current state STATE will transit by
2264   accepting the current input byte, and update STATE_LOG if necessary.
2265   If STATE can accept a multibyte char/collating element/back reference
2266   update the destination of STATE_LOG.  */
2267
2268static re_dfastate_t *
2269internal_function
2270transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2271               re_dfastate_t *state)
2272{
2273  re_dfastate_t **trtable;
2274  unsigned char ch;
2275
2276#ifdef RE_ENABLE_I18N
2277  /* If the current state can accept multibyte.  */
2278  if (BE (state->accept_mb, 0))
2279    {
2280      *err = transit_state_mb (mctx, state);
2281      if (BE (*err != REG_NOERROR, 0))
2282        return NULL;
2283    }
2284#endif /* RE_ENABLE_I18N */
2285
2286  /* Then decide the next state with the single byte.  */
2287#if 0
2288  if (0)
2289    /* don't use transition table  */
2290    return transit_state_sb (err, mctx, state);
2291#endif
2292
2293  /* Use transition table  */
2294  ch = re_string_fetch_byte (&mctx->input);
2295  for (;;)
2296    {
2297      trtable = state->trtable;
2298      if (BE (trtable != NULL, 1))
2299        return trtable[ch];
2300
2301      trtable = state->word_trtable;
2302      if (BE (trtable != NULL, 1))
2303        {
2304          unsigned int context;
2305          context
2306            = re_string_context_at (&mctx->input,
2307                                    re_string_cur_idx (&mctx->input) - 1,
2308                                    mctx->eflags);
2309          if (IS_WORD_CONTEXT (context))
2310            return trtable[ch + SBC_MAX];
2311          else
2312            return trtable[ch];
2313        }
2314
2315      if (!build_trtable (mctx->dfa, state))
2316        {
2317          *err = REG_ESPACE;
2318          return NULL;
2319        }
2320
2321      /* Retry, we now have a transition table.  */
2322    }
2323}
2324
2325/* Update the state_log if we need */
2326re_dfastate_t *
2327internal_function
2328merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2329                      re_dfastate_t *next_state)
2330{
2331  const re_dfa_t *const dfa = mctx->dfa;
2332  int cur_idx = re_string_cur_idx (&mctx->input);
2333
2334  if (cur_idx > mctx->state_log_top)
2335    {
2336      mctx->state_log[cur_idx] = next_state;
2337      mctx->state_log_top = cur_idx;
2338    }
2339  else if (mctx->state_log[cur_idx] == 0)
2340    {
2341      mctx->state_log[cur_idx] = next_state;
2342    }
2343  else
2344    {
2345      re_dfastate_t *pstate;
2346      unsigned int context;
2347      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2348      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2349         the destination of a multibyte char/collating element/
2350         back reference.  Then the next state is the union set of
2351         these destinations and the results of the transition table.  */
2352      pstate = mctx->state_log[cur_idx];
2353      log_nodes = pstate->entrance_nodes;
2354      if (next_state != NULL)
2355        {
2356          table_nodes = next_state->entrance_nodes;
2357          *err = re_node_set_init_union (&next_nodes, table_nodes,
2358                                             log_nodes);
2359          if (BE (*err != REG_NOERROR, 0))
2360            return NULL;
2361        }
2362      else
2363        next_nodes = *log_nodes;
2364      /* Note: We already add the nodes of the initial state,
2365         then we don't need to add them here.  */
2366
2367      context = re_string_context_at (&mctx->input,
2368                                      re_string_cur_idx (&mctx->input) - 1,
2369                                      mctx->eflags);
2370      next_state = mctx->state_log[cur_idx]
2371        = re_acquire_state_context (err, dfa, &next_nodes, context);
2372      /* We don't need to check errors here, since the return value of
2373         this function is next_state and ERR is already set.  */
2374
2375      if (table_nodes != NULL)
2376        re_node_set_free (&next_nodes);
2377    }
2378
2379  if (BE (dfa->nbackref, 0) && next_state != NULL)
2380    {
2381      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2382         later.  We must check them here, since the back references in the
2383         next state might use them.  */
2384      *err = check_subexp_matching_top (mctx, &next_state->nodes,
2385                                        cur_idx);
2386      if (BE (*err != REG_NOERROR, 0))
2387        return NULL;
2388
2389      /* If the next state has back references.  */
2390      if (next_state->has_backref)
2391        {
2392          *err = transit_state_bkref (mctx, &next_state->nodes);
2393          if (BE (*err != REG_NOERROR, 0))
2394            return NULL;
2395          next_state = mctx->state_log[cur_idx];
2396        }
2397    }
2398
2399  return next_state;
2400}
2401
2402/* Skip bytes in the input that correspond to part of a
2403   multi-byte match, then look in the log for a state
2404   from which to restart matching.  */
2405re_dfastate_t *
2406internal_function
2407find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2408{
2409  re_dfastate_t *cur_state;
2410  do
2411    {
2412      int max = mctx->state_log_top;
2413      int cur_str_idx = re_string_cur_idx (&mctx->input);
2414
2415      do
2416        {
2417          if (++cur_str_idx > max)
2418            return NULL;
2419          re_string_skip_bytes (&mctx->input, 1);
2420        }
2421      while (mctx->state_log[cur_str_idx] == NULL);
2422
2423      cur_state = merge_state_with_log (err, mctx, NULL);
2424    }
2425  while (*err == REG_NOERROR && cur_state == NULL);
2426  return cur_state;
2427}
2428
2429/* Helper functions for transit_state.  */
2430
2431/* From the node set CUR_NODES, pick up the nodes whose types are
2432   OP_OPEN_SUBEXP and which have corresponding back references in the regular
2433   expression. And register them to use them later for evaluating the
2434   correspoding back references.  */
2435
2436static reg_errcode_t
2437internal_function
2438check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2439                           int str_idx)
2440{
2441  const re_dfa_t *const dfa = mctx->dfa;
2442  int node_idx;
2443  reg_errcode_t err;
2444
2445  /* TODO: This isn't efficient.
2446           Because there might be more than one nodes whose types are
2447           OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2448           nodes.
2449           E.g. RE: (a){2}  */
2450  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2451    {
2452      int node = cur_nodes->elems[node_idx];
2453      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2454          && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2455          && (dfa->used_bkref_map
2456              & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2457        {
2458          err = match_ctx_add_subtop (mctx, node, str_idx);
2459          if (BE (err != REG_NOERROR, 0))
2460            return err;
2461        }
2462    }
2463  return REG_NOERROR;
2464}
2465
2466#if 0
2467/* Return the next state to which the current state STATE will transit by
2468   accepting the current input byte.  */
2469
2470static re_dfastate_t *
2471transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2472                  re_dfastate_t *state)
2473{
2474  const re_dfa_t *const dfa = mctx->dfa;
2475  re_node_set next_nodes;
2476  re_dfastate_t *next_state;
2477  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2478  unsigned int context;
2479
2480  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2481  if (BE (*err != REG_NOERROR, 0))
2482    return NULL;
2483  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2484    {
2485      int cur_node = state->nodes.elems[node_cnt];
2486      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2487        {
2488          *err = re_node_set_merge (&next_nodes,
2489                                    dfa->eclosures + dfa->nexts[cur_node]);
2490          if (BE (*err != REG_NOERROR, 0))
2491            {
2492              re_node_set_free (&next_nodes);
2493              return NULL;
2494            }
2495        }
2496    }
2497  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2498  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2499  /* We don't need to check errors here, since the return value of
2500     this function is next_state and ERR is already set.  */
2501
2502  re_node_set_free (&next_nodes);
2503  re_string_skip_bytes (&mctx->input, 1);
2504  return next_state;
2505}
2506#endif
2507
2508#ifdef RE_ENABLE_I18N
2509static reg_errcode_t
2510internal_function
2511transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2512{
2513  const re_dfa_t *const dfa = mctx->dfa;
2514  reg_errcode_t err;
2515  int i;
2516
2517  for (i = 0; i < pstate->nodes.nelem; ++i)
2518    {
2519      re_node_set dest_nodes, *new_nodes;
2520      int cur_node_idx = pstate->nodes.elems[i];
2521      int naccepted, dest_idx;
2522      unsigned int context;
2523      re_dfastate_t *dest_state;
2524
2525      if (!dfa->nodes[cur_node_idx].accept_mb)
2526        continue;
2527
2528      if (dfa->nodes[cur_node_idx].constraint)
2529        {
2530          context = re_string_context_at (&mctx->input,
2531                                          re_string_cur_idx (&mctx->input),
2532                                          mctx->eflags);
2533          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2534                                           context))
2535            continue;
2536        }
2537
2538      /* How many bytes the node can accept?  */
2539      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2540                                           re_string_cur_idx (&mctx->input));
2541      if (naccepted == 0)
2542        continue;
2543
2544      /* The node can accepts `naccepted' bytes.  */
2545      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2546      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2547                               : mctx->max_mb_elem_len);
2548      err = clean_state_log_if_needed (mctx, dest_idx);
2549      if (BE (err != REG_NOERROR, 0))
2550        return err;
2551#ifdef DEBUG
2552      assert (dfa->nexts[cur_node_idx] != -1);
2553#endif
2554      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2555
2556      dest_state = mctx->state_log[dest_idx];
2557      if (dest_state == NULL)
2558        dest_nodes = *new_nodes;
2559      else
2560        {
2561          err = re_node_set_init_union (&dest_nodes,
2562                                        dest_state->entrance_nodes, new_nodes);
2563          if (BE (err != REG_NOERROR, 0))
2564            return err;
2565        }
2566      context = re_string_context_at (&mctx->input, dest_idx - 1,
2567                                      mctx->eflags);
2568      mctx->state_log[dest_idx]
2569        = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2570      if (dest_state != NULL)
2571        re_node_set_free (&dest_nodes);
2572      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2573        return err;
2574    }
2575  return REG_NOERROR;
2576}
2577#endif /* RE_ENABLE_I18N */
2578
2579static reg_errcode_t
2580internal_function
2581transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2582{
2583  const re_dfa_t *const dfa = mctx->dfa;
2584  reg_errcode_t err;
2585  int i;
2586  int cur_str_idx = re_string_cur_idx (&mctx->input);
2587
2588  for (i = 0; i < nodes->nelem; ++i)
2589    {
2590      int dest_str_idx, prev_nelem, bkc_idx;
2591      int node_idx = nodes->elems[i];
2592      unsigned int context;
2593      const re_token_t *node = dfa->nodes + node_idx;
2594      re_node_set *new_dest_nodes;
2595
2596      /* Check whether `node' is a backreference or not.  */
2597      if (node->type != OP_BACK_REF)
2598        continue;
2599
2600      if (node->constraint)
2601        {
2602          context = re_string_context_at (&mctx->input, cur_str_idx,
2603                                          mctx->eflags);
2604          if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2605            continue;
2606        }
2607
2608      /* `node' is a backreference.
2609         Check the substring which the substring matched.  */
2610      bkc_idx = mctx->nbkref_ents;
2611      err = get_subexp (mctx, node_idx, cur_str_idx);
2612      if (BE (err != REG_NOERROR, 0))
2613        goto free_return;
2614
2615      /* And add the epsilon closures (which is `new_dest_nodes') of
2616         the backreference to appropriate state_log.  */
2617#ifdef DEBUG
2618      assert (dfa->nexts[node_idx] != -1);
2619#endif
2620      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2621        {
2622          int subexp_len;
2623          re_dfastate_t *dest_state;
2624          struct re_backref_cache_entry *bkref_ent;
2625          bkref_ent = mctx->bkref_ents + bkc_idx;
2626          if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2627            continue;
2628          subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2629          new_dest_nodes = (subexp_len == 0
2630                            ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2631                            : dfa->eclosures + dfa->nexts[node_idx]);
2632          dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2633                          - bkref_ent->subexp_from);
2634          context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2635                                          mctx->eflags);
2636          dest_state = mctx->state_log[dest_str_idx];
2637          prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2638                        : mctx->state_log[cur_str_idx]->nodes.nelem);
2639          /* Add `new_dest_node' to state_log.  */
2640          if (dest_state == NULL)
2641            {
2642              mctx->state_log[dest_str_idx]
2643                = re_acquire_state_context (&err, dfa, new_dest_nodes,
2644                                            context);
2645              if (BE (mctx->state_log[dest_str_idx] == NULL
2646                      && err != REG_NOERROR, 0))
2647                goto free_return;
2648            }
2649          else
2650            {
2651              re_node_set dest_nodes;
2652              err = re_node_set_init_union (&dest_nodes,
2653                                            dest_state->entrance_nodes,
2654                                            new_dest_nodes);
2655              if (BE (err != REG_NOERROR, 0))
2656                {
2657                  re_node_set_free (&dest_nodes);
2658                  goto free_return;
2659                }
2660              mctx->state_log[dest_str_idx]
2661                = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2662              re_node_set_free (&dest_nodes);
2663              if (BE (mctx->state_log[dest_str_idx] == NULL
2664                      && err != REG_NOERROR, 0))
2665                goto free_return;
2666            }
2667          /* We need to check recursively if the backreference can epsilon
2668             transit.  */
2669          if (subexp_len == 0
2670              && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2671            {
2672              err = check_subexp_matching_top (mctx, new_dest_nodes,
2673                                               cur_str_idx);
2674              if (BE (err != REG_NOERROR, 0))
2675                goto free_return;
2676              err = transit_state_bkref (mctx, new_dest_nodes);
2677              if (BE (err != REG_NOERROR, 0))
2678                goto free_return;
2679            }
2680        }
2681    }
2682  err = REG_NOERROR;
2683 free_return:
2684  return err;
2685}
2686
2687/* Enumerate all the candidates which the backreference BKREF_NODE can match
2688   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689   Note that we might collect inappropriate candidates here.
2690   However, the cost of checking them strictly here is too high, then we
2691   delay these checking for prune_impossible_nodes().  */
2692
2693static reg_errcode_t
2694internal_function
2695get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2696{
2697  const re_dfa_t *const dfa = mctx->dfa;
2698  int subexp_num, sub_top_idx;
2699  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2700  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2701  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2702  if (cache_idx != -1)
2703    {
2704      const struct re_backref_cache_entry *entry
2705        = mctx->bkref_ents + cache_idx;
2706      do
2707        if (entry->node == bkref_node)
2708          return REG_NOERROR; /* We already checked it.  */
2709      while (entry++->more);
2710    }
2711
2712  subexp_num = dfa->nodes[bkref_node].opr.idx;
2713
2714  /* For each sub expression  */
2715  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2716    {
2717      reg_errcode_t err;
2718      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2719      re_sub_match_last_t *sub_last;
2720      int sub_last_idx, sl_str, bkref_str_off;
2721
2722      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2723        continue; /* It isn't related.  */
2724
2725      sl_str = sub_top->str_idx;
2726      bkref_str_off = bkref_str_idx;
2727      /* At first, check the last node of sub expressions we already
2728         evaluated.  */
2729      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2730        {
2731          int sl_str_diff;
2732          sub_last = sub_top->lasts[sub_last_idx];
2733          sl_str_diff = sub_last->str_idx - sl_str;
2734          /* The matched string by the sub expression match with the substring
2735             at the back reference?  */
2736          if (sl_str_diff > 0)
2737            {
2738              if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2739                {
2740                  /* Not enough chars for a successful match.  */
2741                  if (bkref_str_off + sl_str_diff > mctx->input.len)
2742                    break;
2743
2744                  err = clean_state_log_if_needed (mctx,
2745                                                   bkref_str_off
2746                                                   + sl_str_diff);
2747                  if (BE (err != REG_NOERROR, 0))
2748                    return err;
2749                  buf = (const char *) re_string_get_buffer (&mctx->input);
2750                }
2751              if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2752                /* We don't need to search this sub expression any more.  */
2753                break;
2754            }
2755          bkref_str_off += sl_str_diff;
2756          sl_str += sl_str_diff;
2757          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2758                                bkref_str_idx);
2759
2760          /* Reload buf, since the preceding call might have reallocated
2761             the buffer.  */
2762          buf = (const char *) re_string_get_buffer (&mctx->input);
2763
2764          if (err == REG_NOMATCH)
2765            continue;
2766          if (BE (err != REG_NOERROR, 0))
2767            return err;
2768        }
2769
2770      if (sub_last_idx < sub_top->nlasts)
2771        continue;
2772      if (sub_last_idx > 0)
2773        ++sl_str;
2774      /* Then, search for the other last nodes of the sub expression.  */
2775      for (; sl_str <= bkref_str_idx; ++sl_str)
2776        {
2777          int cls_node, sl_str_off;
2778          const re_node_set *nodes;
2779          sl_str_off = sl_str - sub_top->str_idx;
2780          /* The matched string by the sub expression match with the substring
2781             at the back reference?  */
2782          if (sl_str_off > 0)
2783            {
2784              if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2785                {
2786                  /* If we are at the end of the input, we cannot match.  */
2787                  if (bkref_str_off >= mctx->input.len)
2788                    break;
2789
2790                  err = extend_buffers (mctx);
2791                  if (BE (err != REG_NOERROR, 0))
2792                    return err;
2793
2794                  buf = (const char *) re_string_get_buffer (&mctx->input);
2795                }
2796              if (buf [bkref_str_off++] != buf[sl_str - 1])
2797                break; /* We don't need to search this sub expression
2798                          any more.  */
2799            }
2800          if (mctx->state_log[sl_str] == NULL)
2801            continue;
2802          /* Does this state have a ')' of the sub expression?  */
2803          nodes = &mctx->state_log[sl_str]->nodes;
2804          cls_node = find_subexp_node (dfa, nodes, subexp_num,
2805                                       OP_CLOSE_SUBEXP);
2806          if (cls_node == -1)
2807            continue; /* No.  */
2808          if (sub_top->path == NULL)
2809            {
2810              sub_top->path = calloc (sizeof (state_array_t),
2811                                      sl_str - sub_top->str_idx + 1);
2812              if (sub_top->path == NULL)
2813                return REG_ESPACE;
2814            }
2815          /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2816             in the current context?  */
2817          err = check_arrival (mctx, sub_top->path, sub_top->node,
2818                               sub_top->str_idx, cls_node, sl_str,
2819                               OP_CLOSE_SUBEXP);
2820          if (err == REG_NOMATCH)
2821              continue;
2822          if (BE (err != REG_NOERROR, 0))
2823              return err;
2824          sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2825          if (BE (sub_last == NULL, 0))
2826            return REG_ESPACE;
2827          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2828                                bkref_str_idx);
2829          if (err == REG_NOMATCH)
2830            continue;
2831        }
2832    }
2833  return REG_NOERROR;
2834}
2835
2836/* Helper functions for get_subexp().  */
2837
2838/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2839   If it can arrive, register the sub expression expressed with SUB_TOP
2840   and SUB_LAST.  */
2841
2842static reg_errcode_t
2843internal_function
2844get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2845                re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2846{
2847  reg_errcode_t err;
2848  int to_idx;
2849  /* Can the subexpression arrive the back reference?  */
2850  err = check_arrival (mctx, &sub_last->path, sub_last->node,
2851                       sub_last->str_idx, bkref_node, bkref_str,
2852                       OP_OPEN_SUBEXP);
2853  if (err != REG_NOERROR)
2854    return err;
2855  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2856                             sub_last->str_idx);
2857  if (BE (err != REG_NOERROR, 0))
2858    return err;
2859  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2860  return clean_state_log_if_needed (mctx, to_idx);
2861}
2862
2863/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2864   Search '(' if FL_OPEN, or search ')' otherwise.
2865   TODO: This function isn't efficient...
2866         Because there might be more than one nodes whose types are
2867         OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2868         nodes.
2869         E.g. RE: (a){2}  */
2870
2871static int
2872internal_function
2873find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2874                  int subexp_idx, int type)
2875{
2876  int cls_idx;
2877  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2878    {
2879      int cls_node = nodes->elems[cls_idx];
2880      const re_token_t *node = dfa->nodes + cls_node;
2881      if (node->type == type
2882          && node->opr.idx == subexp_idx)
2883        return cls_node;
2884    }
2885  return -1;
2886}
2887
2888/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2889   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2890   heavily reused.
2891   Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2892
2893static reg_errcode_t
2894internal_function
2895check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2896               int top_str, int last_node, int last_str, int type)
2897{
2898  const re_dfa_t *const dfa = mctx->dfa;
2899  reg_errcode_t err = REG_NOERROR;
2900  int subexp_num, backup_cur_idx, str_idx, null_cnt;
2901  re_dfastate_t *cur_state = NULL;
2902  re_node_set *cur_nodes, next_nodes;
2903  re_dfastate_t **backup_state_log;
2904  unsigned int context;
2905
2906  subexp_num = dfa->nodes[top_node].opr.idx;
2907  /* Extend the buffer if we need.  */
2908  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2909    {
2910      re_dfastate_t **new_array;
2911      int old_alloc = path->alloc;
2912      path->alloc += last_str + mctx->max_mb_elem_len + 1;
2913      new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2914      if (BE (new_array == NULL, 0))
2915        {
2916          path->alloc = old_alloc;
2917          return REG_ESPACE;
2918        }
2919      path->array = new_array;
2920      memset (new_array + old_alloc, '\0',
2921              sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2922    }
2923
2924  str_idx = path->next_idx ? path->next_idx : top_str;
2925
2926  /* Temporary modify MCTX.  */
2927  backup_state_log = mctx->state_log;
2928  backup_cur_idx = mctx->input.cur_idx;
2929  mctx->state_log = path->array;
2930  mctx->input.cur_idx = str_idx;
2931
2932  /* Setup initial node set.  */
2933  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2934  if (str_idx == top_str)
2935    {
2936      err = re_node_set_init_1 (&next_nodes, top_node);
2937      if (BE (err != REG_NOERROR, 0))
2938        return err;
2939      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2940      if (BE (err != REG_NOERROR, 0))
2941        {
2942          re_node_set_free (&next_nodes);
2943          return err;
2944        }
2945    }
2946  else
2947    {
2948      cur_state = mctx->state_log[str_idx];
2949      if (cur_state && cur_state->has_backref)
2950        {
2951          err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2952          if (BE (err != REG_NOERROR, 0))
2953            return err;
2954        }
2955      else
2956        re_node_set_init_empty (&next_nodes);
2957    }
2958  if (str_idx == top_str || (cur_state && cur_state->has_backref))
2959    {
2960      if (next_nodes.nelem)
2961        {
2962          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2963                                    subexp_num, type);
2964          if (BE (err != REG_NOERROR, 0))
2965            {
2966              re_node_set_free (&next_nodes);
2967              return err;
2968            }
2969        }
2970      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2971      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2972        {
2973          re_node_set_free (&next_nodes);
2974          return err;
2975        }
2976      mctx->state_log[str_idx] = cur_state;
2977    }
2978
2979  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2980    {
2981      re_node_set_empty (&next_nodes);
2982      if (mctx->state_log[str_idx + 1])
2983        {
2984          err = re_node_set_merge (&next_nodes,
2985                                   &mctx->state_log[str_idx + 1]->nodes);
2986          if (BE (err != REG_NOERROR, 0))
2987            {
2988              re_node_set_free (&next_nodes);
2989              return err;
2990            }
2991        }
2992      if (cur_state)
2993        {
2994          err = check_arrival_add_next_nodes (mctx, str_idx,
2995                                              &cur_state->non_eps_nodes,
2996                                              &next_nodes);
2997          if (BE (err != REG_NOERROR, 0))
2998            {
2999              re_node_set_free (&next_nodes);
3000              return err;
3001            }
3002        }
3003      ++str_idx;
3004      if (next_nodes.nelem)
3005        {
3006          err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3007          if (BE (err != REG_NOERROR, 0))
3008            {
3009              re_node_set_free (&next_nodes);
3010              return err;
3011            }
3012          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3013                                    subexp_num, type);
3014          if (BE (err != REG_NOERROR, 0))
3015            {
3016              re_node_set_free (&next_nodes);
3017              return err;
3018            }
3019        }
3020      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3021      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3022      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3023        {
3024          re_node_set_free (&next_nodes);
3025          return err;
3026        }
3027      mctx->state_log[str_idx] = cur_state;
3028      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3029    }
3030  re_node_set_free (&next_nodes);
3031  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3032               : &mctx->state_log[last_str]->nodes);
3033  path->next_idx = str_idx;
3034
3035  /* Fix MCTX.  */
3036  mctx->state_log = backup_state_log;
3037  mctx->input.cur_idx = backup_cur_idx;
3038
3039  /* Then check the current node set has the node LAST_NODE.  */
3040  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3041    return REG_NOERROR;
3042
3043  return REG_NOMATCH;
3044}
3045
3046/* Helper functions for check_arrival.  */
3047
3048/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3049   to NEXT_NODES.
3050   TODO: This function is similar to the functions transit_state*(),
3051         however this function has many additional works.
3052         Can't we unify them?  */
3053
3054static reg_errcode_t
3055internal_function
3056check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3057                              re_node_set *cur_nodes, re_node_set *next_nodes)
3058{
3059  const re_dfa_t *const dfa = mctx->dfa;
3060  int result;
3061  int cur_idx;
3062  reg_errcode_t err = REG_NOERROR;
3063  re_node_set union_set;
3064  re_node_set_init_empty (&union_set);
3065  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3066    {
3067      int naccepted = 0;
3068      int cur_node = cur_nodes->elems[cur_idx];
3069#ifdef DEBUG
3070      re_token_type_t type = dfa->nodes[cur_node].type;
3071      assert (!IS_EPSILON_NODE (type));
3072#endif
3073#ifdef RE_ENABLE_I18N
3074      /* If the node may accept `multi byte'.  */
3075      if (dfa->nodes[cur_node].accept_mb)
3076        {
3077          naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3078                                               str_idx);
3079          if (naccepted > 1)
3080            {
3081              re_dfastate_t *dest_state;
3082              int next_node = dfa->nexts[cur_node];
3083              int next_idx = str_idx + naccepted;
3084              dest_state = mctx->state_log[next_idx];
3085              re_node_set_empty (&union_set);
3086              if (dest_state)
3087                {
3088                  err = re_node_set_merge (&union_set, &dest_state->nodes);
3089                  if (BE (err != REG_NOERROR, 0))
3090                    {
3091                      re_node_set_free (&union_set);
3092                      return err;
3093                    }
3094                }
3095              result = re_node_set_insert (&union_set, next_node);
3096              if (BE (result < 0, 0))
3097                {
3098                  re_node_set_free (&union_set);
3099                  return REG_ESPACE;
3100                }
3101              mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3102                                                            &union_set);
3103              if (BE (mctx->state_log[next_idx] == NULL
3104                      && err != REG_NOERROR, 0))
3105                {
3106                  re_node_set_free (&union_set);
3107                  return err;
3108                }
3109            }
3110        }
3111#endif /* RE_ENABLE_I18N */
3112      if (naccepted
3113          || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3114        {
3115          result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3116          if (BE (result < 0, 0))
3117            {
3118              re_node_set_free (&union_set);
3119              return REG_ESPACE;
3120            }
3121        }
3122    }
3123  re_node_set_free (&union_set);
3124  return REG_NOERROR;
3125}
3126
3127/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3128   CUR_NODES, however exclude the nodes which are:
3129    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3130    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3131*/
3132
3133static reg_errcode_t
3134internal_function
3135check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3136                          int ex_subexp, int type)
3137{
3138  reg_errcode_t err;
3139  int idx, outside_node;
3140  re_node_set new_nodes;
3141#ifdef DEBUG
3142  assert (cur_nodes->nelem);
3143#endif
3144  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3145  if (BE (err != REG_NOERROR, 0))
3146    return err;
3147  /* Create a new node set NEW_NODES with the nodes which are epsilon
3148     closures of the node in CUR_NODES.  */
3149
3150  for (idx = 0; idx < cur_nodes->nelem; ++idx)
3151    {
3152      int cur_node = cur_nodes->elems[idx];
3153      const re_node_set *eclosure = dfa->eclosures + cur_node;
3154      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3155      if (outside_node == -1)
3156        {
3157          /* There are no problematic nodes, just merge them.  */
3158          err = re_node_set_merge (&new_nodes, eclosure);
3159          if (BE (err != REG_NOERROR, 0))
3160            {
3161              re_node_set_free (&new_nodes);
3162              return err;
3163            }
3164        }
3165      else
3166        {
3167          /* There are problematic nodes, re-calculate incrementally.  */
3168          err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3169                                              ex_subexp, type);
3170          if (BE (err != REG_NOERROR, 0))
3171            {
3172              re_node_set_free (&new_nodes);
3173              return err;
3174            }
3175        }
3176    }
3177  re_node_set_free (cur_nodes);
3178  *cur_nodes = new_nodes;
3179  return REG_NOERROR;
3180}
3181
3182/* Helper function for check_arrival_expand_ecl.
3183   Check incrementally the epsilon closure of TARGET, and if it isn't
3184   problematic append it to DST_NODES.  */
3185
3186static reg_errcode_t
3187internal_function
3188check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3189                              int target, int ex_subexp, int type)
3190{
3191  int cur_node;
3192  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3193    {
3194      int err;
3195
3196      if (dfa->nodes[cur_node].type == type
3197          && dfa->nodes[cur_node].opr.idx == ex_subexp)
3198        {
3199          if (type == OP_CLOSE_SUBEXP)
3200            {
3201              err = re_node_set_insert (dst_nodes, cur_node);
3202              if (BE (err == -1, 0))
3203                return REG_ESPACE;
3204            }
3205          break;
3206        }
3207      err = re_node_set_insert (dst_nodes, cur_node);
3208      if (BE (err == -1, 0))
3209        return REG_ESPACE;
3210      if (dfa->edests[cur_node].nelem == 0)
3211        break;
3212      if (dfa->edests[cur_node].nelem == 2)
3213        {
3214          err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3215                                              dfa->edests[cur_node].elems[1],
3216                                              ex_subexp, type);
3217          if (BE (err != REG_NOERROR, 0))
3218            return err;
3219        }
3220      cur_node = dfa->edests[cur_node].elems[0];
3221    }
3222  return REG_NOERROR;
3223}
3224
3225
3226/* For all the back references in the current state, calculate the
3227   destination of the back references by the appropriate entry
3228   in MCTX->BKREF_ENTS.  */
3229
3230static reg_errcode_t
3231internal_function
3232expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3233                    int cur_str, int subexp_num, int type)
3234{
3235  const re_dfa_t *const dfa = mctx->dfa;
3236  reg_errcode_t err;
3237  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3238  struct re_backref_cache_entry *ent;
3239
3240  if (cache_idx_start == -1)
3241    return REG_NOERROR;
3242
3243 restart:
3244  ent = mctx->bkref_ents + cache_idx_start;
3245  do
3246    {
3247      int to_idx, next_node;
3248
3249      /* Is this entry ENT is appropriate?  */
3250      if (!re_node_set_contains (cur_nodes, ent->node))
3251        continue; /* No.  */
3252
3253      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3254      /* Calculate the destination of the back reference, and append it
3255         to MCTX->STATE_LOG.  */
3256      if (to_idx == cur_str)
3257        {
3258          /* The backreference did epsilon transit, we must re-check all the
3259             node in the current state.  */
3260          re_node_set new_dests;
3261          reg_errcode_t err2, err3;
3262          next_node = dfa->edests[ent->node].elems[0];
3263          if (re_node_set_contains (cur_nodes, next_node))
3264            continue;
3265          err = re_node_set_init_1 (&new_dests, next_node);
3266          err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3267          err3 = re_node_set_merge (cur_nodes, &new_dests);
3268          re_node_set_free (&new_dests);
3269          if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3270                  || err3 != REG_NOERROR, 0))
3271            {
3272              err = (err != REG_NOERROR ? err
3273                     : (err2 != REG_NOERROR ? err2 : err3));
3274              return err;
3275            }
3276          /* TODO: It is still inefficient...  */
3277          goto restart;
3278        }
3279      else
3280        {
3281          re_node_set union_set;
3282          next_node = dfa->nexts[ent->node];
3283          if (mctx->state_log[to_idx])
3284            {
3285              int ret;
3286              if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3287                                        next_node))
3288                continue;
3289              err = re_node_set_init_copy (&union_set,
3290                                           &mctx->state_log[to_idx]->nodes);
3291              ret = re_node_set_insert (&union_set, next_node);
3292              if (BE (err != REG_NOERROR || ret < 0, 0))
3293                {
3294                  re_node_set_free (&union_set);
3295                  err = err != REG_NOERROR ? err : REG_ESPACE;
3296                  return err;
3297                }
3298            }
3299          else
3300            {
3301              err = re_node_set_init_1 (&union_set, next_node);
3302              if (BE (err != REG_NOERROR, 0))
3303                return err;
3304            }
3305          mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3306          re_node_set_free (&union_set);
3307          if (BE (mctx->state_log[to_idx] == NULL
3308                  && err != REG_NOERROR, 0))
3309            return err;
3310        }
3311    }
3312  while (ent++->more);
3313  return REG_NOERROR;
3314}
3315
3316/* Build transition table for the state.
3317   Return 1 if succeeded, otherwise return NULL.  */
3318
3319static int
3320internal_function
3321build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3322{
3323  reg_errcode_t err;
3324  int i, j, ch, need_word_trtable = 0;
3325  bitset_word_t elem, mask;
3326  bool dests_node_malloced = false;
3327  bool dest_states_malloced = false;
3328  int ndests; /* Number of the destination states from `state'.  */
3329  re_dfastate_t **trtable;
3330  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3331  re_node_set follows, *dests_node;
3332  bitset_t *dests_ch;
3333  bitset_t acceptable;
3334
3335  struct dests_alloc
3336  {
3337    re_node_set dests_node[SBC_MAX];
3338    bitset_t dests_ch[SBC_MAX];
3339  } *dests_alloc;
3340
3341  /* We build DFA states which corresponds to the destination nodes
3342     from `state'.  `dests_node[i]' represents the nodes which i-th
3343     destination state contains, and `dests_ch[i]' represents the
3344     characters which i-th destination state accepts.  */
3345#ifdef HAVE_ALLOCA
3346  if (__libc_use_alloca (sizeof (struct dests_alloc)))
3347    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3348  else
3349#endif
3350    {
3351      dests_alloc = re_malloc (struct dests_alloc, 1);
3352      if (BE (dests_alloc == NULL, 0))
3353        return 0;
3354      dests_node_malloced = true;
3355    }
3356  dests_node = dests_alloc->dests_node;
3357  dests_ch = dests_alloc->dests_ch;
3358
3359  /* Initialize transiton table.  */
3360  state->word_trtable = state->trtable = NULL;
3361
3362  /* At first, group all nodes belonging to `state' into several
3363     destinations.  */
3364  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3365  if (BE (ndests <= 0, 0))
3366    {
3367      if (dests_node_malloced)
3368        free (dests_alloc);
3369      /* Return 0 in case of an error, 1 otherwise.  */
3370      if (ndests == 0)
3371        {
3372          state->trtable = (re_dfastate_t **)
3373            calloc (sizeof (re_dfastate_t *), SBC_MAX);
3374          return 1;
3375        }
3376      return 0;
3377    }
3378
3379  err = re_node_set_alloc (&follows, ndests + 1);
3380  if (BE (err != REG_NOERROR, 0))
3381    goto out_free;
3382
3383  /* Avoid arithmetic overflow in size calculation.  */
3384  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3385            / (3 * sizeof (re_dfastate_t *)))
3386           < ndests),
3387          0))
3388    goto out_free;
3389
3390#ifdef HAVE_ALLOCA
3391  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3392                         + ndests * 3 * sizeof (re_dfastate_t *)))
3393    dest_states = (re_dfastate_t **)
3394      alloca (ndests * 3 * sizeof (re_dfastate_t *));
3395  else
3396#endif
3397    {
3398      dest_states = (re_dfastate_t **)
3399        malloc (ndests * 3 * sizeof (re_dfastate_t *));
3400      if (BE (dest_states == NULL, 0))
3401        {
3402out_free:
3403          if (dest_states_malloced)
3404            free (dest_states);
3405          re_node_set_free (&follows);
3406          for (i = 0; i < ndests; ++i)
3407            re_node_set_free (dests_node + i);
3408          if (dests_node_malloced)
3409            free (dests_alloc);
3410          return 0;
3411        }
3412      dest_states_malloced = true;
3413    }
3414  dest_states_word = dest_states + ndests;
3415  dest_states_nl = dest_states_word + ndests;
3416  bitset_empty (acceptable);
3417
3418  /* Then build the states for all destinations.  */
3419  for (i = 0; i < ndests; ++i)
3420    {
3421      int next_node;
3422      re_node_set_empty (&follows);
3423      /* Merge the follows of this destination states.  */
3424      for (j = 0; j < dests_node[i].nelem; ++j)
3425        {
3426          next_node = dfa->nexts[dests_node[i].elems[j]];
3427          if (next_node != -1)
3428            {
3429              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3430              if (BE (err != REG_NOERROR, 0))
3431                goto out_free;
3432            }
3433        }
3434      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3435      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3436        goto out_free;
3437      /* If the new state has context constraint,
3438         build appropriate states for these contexts.  */
3439      if (dest_states[i]->has_constraint)
3440        {
3441          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3442                                                          CONTEXT_WORD);
3443          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3444            goto out_free;
3445
3446          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3447            need_word_trtable = 1;
3448
3449          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3450                                                        CONTEXT_NEWLINE);
3451          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3452            goto out_free;
3453        }
3454      else
3455        {
3456          dest_states_word[i] = dest_states[i];
3457          dest_states_nl[i] = dest_states[i];
3458        }
3459      bitset_merge (acceptable, dests_ch[i]);
3460    }
3461
3462  if (!BE (need_word_trtable, 0))
3463    {
3464      /* We don't care about whether the following character is a word
3465         character, or we are in a single-byte character set so we can
3466         discern by looking at the character code: allocate a
3467         256-entry transition table.  */
3468      trtable = state->trtable =
3469        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3470      if (BE (trtable == NULL, 0))
3471        goto out_free;
3472
3473      /* For all characters ch...:  */
3474      for (i = 0; i < BITSET_WORDS; ++i)
3475        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3476             elem;
3477             mask <<= 1, elem >>= 1, ++ch)
3478          if (BE (elem & 1, 0))
3479            {
3480              /* There must be exactly one destination which accepts
3481                 character ch.  See group_nodes_into_DFAstates.  */
3482              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3483                ;
3484
3485              /* j-th destination accepts the word character ch.  */
3486              if (dfa->word_char[i] & mask)
3487                trtable[ch] = dest_states_word[j];
3488              else
3489                trtable[ch] = dest_states[j];
3490            }
3491    }
3492  else
3493    {
3494      /* We care about whether the following character is a word
3495         character, and we are in a multi-byte character set: discern
3496         by looking at the character code: build two 256-entry
3497         transition tables, one starting at trtable[0] and one
3498         starting at trtable[SBC_MAX].  */
3499      trtable = state->word_trtable =
3500        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3501      if (BE (trtable == NULL, 0))
3502        goto out_free;
3503
3504      /* For all characters ch...:  */
3505      for (i = 0; i < BITSET_WORDS; ++i)
3506        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3507             elem;
3508             mask <<= 1, elem >>= 1, ++ch)
3509          if (BE (elem & 1, 0))
3510            {
3511              /* There must be exactly one destination which accepts
3512                 character ch.  See group_nodes_into_DFAstates.  */
3513              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3514                ;
3515
3516              /* j-th destination accepts the word character ch.  */
3517              trtable[ch] = dest_states[j];
3518              trtable[ch + SBC_MAX] = dest_states_word[j];
3519            }
3520    }
3521
3522  /* new line */
3523  if (bitset_contain (acceptable, NEWLINE_CHAR))
3524    {
3525      /* The current state accepts newline character.  */
3526      for (j = 0; j < ndests; ++j)
3527        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3528          {
3529            /* k-th destination accepts newline character.  */
3530            trtable[NEWLINE_CHAR] = dest_states_nl[j];
3531            if (need_word_trtable)
3532              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3533            /* There must be only one destination which accepts
3534               newline.  See group_nodes_into_DFAstates.  */
3535            break;
3536          }
3537    }
3538
3539  if (dest_states_malloced)
3540    free (dest_states);
3541
3542  re_node_set_free (&follows);
3543  for (i = 0; i < ndests; ++i)
3544    re_node_set_free (dests_node + i);
3545
3546  if (dests_node_malloced)
3547    free (dests_alloc);
3548
3549  return 1;
3550}
3551
3552/* Group all nodes belonging to STATE into several destinations.
3553   Then for all destinations, set the nodes belonging to the destination
3554   to DESTS_NODE[i] and set the characters accepted by the destination
3555   to DEST_CH[i].  This function return the number of destinations.  */
3556
3557static int
3558internal_function
3559group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3560                            re_node_set *dests_node, bitset_t *dests_ch)
3561{
3562  reg_errcode_t err;
3563  int result;
3564  int i, j, k;
3565  int ndests; /* Number of the destinations from `state'.  */
3566  bitset_t accepts; /* Characters a node can accept.  */
3567  const re_node_set *cur_nodes = &state->nodes;
3568  bitset_empty (accepts);
3569  ndests = 0;
3570
3571  /* For all the nodes belonging to `state',  */
3572  for (i = 0; i < cur_nodes->nelem; ++i)
3573    {
3574      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3575      re_token_type_t type = node->type;
3576      unsigned int constraint = node->constraint;
3577
3578      /* Enumerate all single byte character this node can accept.  */
3579      if (type == CHARACTER)
3580        bitset_set (accepts, node->opr.c);
3581      else if (type == SIMPLE_BRACKET)
3582        {
3583          bitset_merge (accepts, node->opr.sbcset);
3584        }
3585      else if (type == OP_PERIOD)
3586        {
3587#ifdef RE_ENABLE_I18N
3588          if (dfa->mb_cur_max > 1)
3589            bitset_merge (accepts, dfa->sb_char);
3590          else
3591#endif
3592            bitset_set_all (accepts);
3593          if (!(dfa->syntax & RE_DOT_NEWLINE))
3594            bitset_clear (accepts, '\n');
3595          if (dfa->syntax & RE_DOT_NOT_NULL)
3596            bitset_clear (accepts, '\0');
3597        }
3598#ifdef RE_ENABLE_I18N
3599      else if (type == OP_UTF8_PERIOD)
3600        {
3601          memset (accepts, '\xff', sizeof (bitset_t) / 2);
3602          if (!(dfa->syntax & RE_DOT_NEWLINE))
3603            bitset_clear (accepts, '\n');
3604          if (dfa->syntax & RE_DOT_NOT_NULL)
3605            bitset_clear (accepts, '\0');
3606        }
3607#endif
3608      else
3609        continue;
3610
3611      /* Check the `accepts' and sift the characters which are not
3612         match it the context.  */
3613      if (constraint)
3614        {
3615          if (constraint & NEXT_NEWLINE_CONSTRAINT)
3616            {
3617              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3618              bitset_empty (accepts);
3619              if (accepts_newline)
3620                bitset_set (accepts, NEWLINE_CHAR);
3621              else
3622                continue;
3623            }
3624          if (constraint & NEXT_ENDBUF_CONSTRAINT)
3625            {
3626              bitset_empty (accepts);
3627              continue;
3628            }
3629
3630          if (constraint & NEXT_WORD_CONSTRAINT)
3631            {
3632              bitset_word_t any_set = 0;
3633              if (type == CHARACTER && !node->word_char)
3634                {
3635                  bitset_empty (accepts);
3636                  continue;
3637                }
3638#ifdef RE_ENABLE_I18N
3639              if (dfa->mb_cur_max > 1)
3640                for (j = 0; j < BITSET_WORDS; ++j)
3641                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3642              else
3643#endif
3644                for (j = 0; j < BITSET_WORDS; ++j)
3645                  any_set |= (accepts[j] &= dfa->word_char[j]);
3646              if (!any_set)
3647                continue;
3648            }
3649          if (constraint & NEXT_NOTWORD_CONSTRAINT)
3650            {
3651              bitset_word_t any_set = 0;
3652              if (type == CHARACTER && node->word_char)
3653                {
3654                  bitset_empty (accepts);
3655                  continue;
3656                }
3657#ifdef RE_ENABLE_I18N
3658              if (dfa->mb_cur_max > 1)
3659                for (j = 0; j < BITSET_WORDS; ++j)
3660                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3661              else
3662#endif
3663                for (j = 0; j < BITSET_WORDS; ++j)
3664                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3665              if (!any_set)
3666                continue;
3667            }
3668        }
3669
3670      /* Then divide `accepts' into DFA states, or create a new
3671         state.  Above, we make sure that accepts is not empty.  */
3672      for (j = 0; j < ndests; ++j)
3673        {
3674          bitset_t intersec; /* Intersection sets, see below.  */
3675          bitset_t remains;
3676          /* Flags, see below.  */
3677          bitset_word_t has_intersec, not_subset, not_consumed;
3678
3679          /* Optimization, skip if this state doesn't accept the character.  */
3680          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3681            continue;
3682
3683          /* Enumerate the intersection set of this state and `accepts'.  */
3684          has_intersec = 0;
3685          for (k = 0; k < BITSET_WORDS; ++k)
3686            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3687          /* And skip if the intersection set is empty.  */
3688          if (!has_intersec)
3689            continue;
3690
3691          /* Then check if this state is a subset of `accepts'.  */
3692          not_subset = not_consumed = 0;
3693          for (k = 0; k < BITSET_WORDS; ++k)
3694            {
3695              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3696              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3697            }
3698
3699          /* If this state isn't a subset of `accepts', create a
3700             new group state, which has the `remains'. */
3701          if (not_subset)
3702            {
3703              bitset_copy (dests_ch[ndests], remains);
3704              bitset_copy (dests_ch[j], intersec);
3705              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3706              if (BE (err != REG_NOERROR, 0))
3707                goto error_return;
3708              ++ndests;
3709            }
3710
3711          /* Put the position in the current group. */
3712          result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3713          if (BE (result < 0, 0))
3714            goto error_return;
3715
3716          /* If all characters are consumed, go to next node. */
3717          if (!not_consumed)
3718            break;
3719        }
3720      /* Some characters remain, create a new group. */
3721      if (j == ndests)
3722        {
3723          bitset_copy (dests_ch[ndests], accepts);
3724          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3725          if (BE (err != REG_NOERROR, 0))
3726            goto error_return;
3727          ++ndests;
3728          bitset_empty (accepts);
3729        }
3730    }
3731  return ndests;
3732 error_return:
3733  for (j = 0; j < ndests; ++j)
3734    re_node_set_free (dests_node + j);
3735  return -1;
3736}
3737
3738#ifdef RE_ENABLE_I18N
3739/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3740   Return the number of the bytes the node accepts.
3741   STR_IDX is the current index of the input string.
3742
3743   This function handles the nodes which can accept one character, or
3744   one collating element like '.', '[a-z]', opposite to the other nodes
3745   can only accept one byte.  */
3746
3747static int
3748internal_function
3749check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3750                         const re_string_t *input, int str_idx)
3751{
3752  const re_token_t *node = dfa->nodes + node_idx;
3753  int char_len, elem_len;
3754  int i;
3755  wint_t wc;
3756
3757  if (BE (node->type == OP_UTF8_PERIOD, 0))
3758    {
3759      unsigned char c = re_string_byte_at (input, str_idx), d;
3760      if (BE (c < 0xc2, 1))
3761        return 0;
3762
3763      if (str_idx + 2 > input->len)
3764        return 0;
3765
3766      d = re_string_byte_at (input, str_idx + 1);
3767      if (c < 0xe0)
3768        return (d < 0x80 || d > 0xbf) ? 0 : 2;
3769      else if (c < 0xf0)
3770        {
3771          char_len = 3;
3772          if (c == 0xe0 && d < 0xa0)
3773            return 0;
3774        }
3775      else if (c < 0xf8)
3776        {
3777          char_len = 4;
3778          if (c == 0xf0 && d < 0x90)
3779            return 0;
3780        }
3781      else if (c < 0xfc)
3782        {
3783          char_len = 5;
3784          if (c == 0xf8 && d < 0x88)
3785            return 0;
3786        }
3787      else if (c < 0xfe)
3788        {
3789          char_len = 6;
3790          if (c == 0xfc && d < 0x84)
3791            return 0;
3792        }
3793      else
3794        return 0;
3795
3796      if (str_idx + char_len > input->len)
3797        return 0;
3798
3799      for (i = 1; i < char_len; ++i)
3800        {
3801          d = re_string_byte_at (input, str_idx + i);
3802          if (d < 0x80 || d > 0xbf)
3803            return 0;
3804        }
3805      return char_len;
3806    }
3807
3808  char_len = re_string_char_size_at (input, str_idx);
3809  if (node->type == OP_PERIOD)
3810    {
3811      if (char_len <= 1)
3812        return 0;
3813      /* FIXME: I don't think this if is needed, as both '\n'
3814         and '\0' are char_len == 1.  */
3815      /* '.' accepts any one character except the following two cases.  */
3816      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3817           re_string_byte_at (input, str_idx) == '\n') ||
3818          ((dfa->syntax & RE_DOT_NOT_NULL) &&
3819           re_string_byte_at (input, str_idx) == '\0'))
3820        return 0;
3821      return char_len;
3822    }
3823
3824  elem_len = re_string_elem_size_at (input, str_idx);
3825  wc = __btowc(*(input->mbs+str_idx));
3826  if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3827    return 0;
3828
3829  if (node->type == COMPLEX_BRACKET)
3830    {
3831      const re_charset_t *cset = node->opr.mbcset;
3832# ifdef _LIBC
3833      const unsigned char *pin
3834        = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3835      int j;
3836      uint32_t nrules;
3837# endif /* _LIBC */
3838      int match_len = 0;
3839      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3840                    ? re_string_wchar_at (input, str_idx) : 0);
3841
3842      /* match with multibyte character?  */
3843      for (i = 0; i < cset->nmbchars; ++i)
3844        if (wc == cset->mbchars[i])
3845          {
3846            match_len = char_len;
3847            goto check_node_accept_bytes_match;
3848          }
3849      /* match with character_class?  */
3850      for (i = 0; i < cset->nchar_classes; ++i)
3851        {
3852          wctype_t wt = cset->char_classes[i];
3853          if (__iswctype (wc, wt))
3854            {
3855              match_len = char_len;
3856              goto check_node_accept_bytes_match;
3857            }
3858        }
3859
3860# ifdef _LIBC
3861      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3862      if (nrules != 0)
3863        {
3864          unsigned int in_collseq = 0;
3865          const int32_t *table, *indirect;
3866          const unsigned char *weights, *extra;
3867          const char *collseqwc;
3868          /* This #include defines a local function!  */
3869#  include <locale/weight.h>
3870
3871          /* match with collating_symbol?  */
3872          if (cset->ncoll_syms)
3873            extra = (const unsigned char *)
3874              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3875          for (i = 0; i < cset->ncoll_syms; ++i)
3876            {
3877              const unsigned char *coll_sym = extra + cset->coll_syms[i];
3878              /* Compare the length of input collating element and
3879                 the length of current collating element.  */
3880              if (*coll_sym != elem_len)
3881                continue;
3882              /* Compare each bytes.  */
3883              for (j = 0; j < *coll_sym; j++)
3884                if (pin[j] != coll_sym[1 + j])
3885                  break;
3886              if (j == *coll_sym)
3887                {
3888                  /* Match if every bytes is equal.  */
3889                  match_len = j;
3890                  goto check_node_accept_bytes_match;
3891                }
3892            }
3893
3894          if (cset->nranges)
3895            {
3896              if (elem_len <= char_len)
3897                {
3898                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3899                  in_collseq = __collseq_table_lookup (collseqwc, wc);
3900                }
3901              else
3902                in_collseq = find_collation_sequence_value (pin, elem_len);
3903            }
3904          /* match with range expression?  */
3905          for (i = 0; i < cset->nranges; ++i)
3906            if (cset->range_starts[i] <= in_collseq
3907                && in_collseq <= cset->range_ends[i])
3908              {
3909                match_len = elem_len;
3910                goto check_node_accept_bytes_match;
3911              }
3912
3913          /* match with equivalence_class?  */
3914          if (cset->nequiv_classes)
3915            {
3916              const unsigned char *cp = pin;
3917              table = (const int32_t *)
3918                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3919              weights = (const unsigned char *)
3920                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3921              extra = (const unsigned char *)
3922                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3923              indirect = (const int32_t *)
3924                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3925              int32_t idx = findidx (&cp);
3926              if (idx > 0)
3927                for (i = 0; i < cset->nequiv_classes; ++i)
3928                  {
3929                    int32_t equiv_class_idx = cset->equiv_classes[i];
3930                    size_t weight_len = weights[idx & 0xffffff];
3931                    if (weight_len == weights[equiv_class_idx & 0xffffff]
3932                        && (idx >> 24) == (equiv_class_idx >> 24))
3933                      {
3934                        int cnt = 0;
3935
3936                        idx &= 0xffffff;
3937                        equiv_class_idx &= 0xffffff;
3938
3939                        while (cnt <= weight_len
3940                               && (weights[equiv_class_idx + 1 + cnt]
3941                                   == weights[idx + 1 + cnt]))
3942                          ++cnt;
3943                        if (cnt > weight_len)
3944                          {
3945                            match_len = elem_len;
3946                            goto check_node_accept_bytes_match;
3947                          }
3948                      }
3949                  }
3950            }
3951        }
3952      else
3953# endif /* _LIBC */
3954        {
3955          /* match with range expression?  */
3956#if __GNUC__ >= 2
3957          wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3958#else
3959          wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3960          cmp_buf[2] = wc;
3961#endif
3962          for (i = 0; i < cset->nranges; ++i)
3963            {
3964              cmp_buf[0] = cset->range_starts[i];
3965              cmp_buf[4] = cset->range_ends[i];
3966              if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3967                  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3968                {
3969                  match_len = char_len;
3970                  goto check_node_accept_bytes_match;
3971                }
3972            }
3973        }
3974    check_node_accept_bytes_match:
3975      if (!cset->non_match)
3976        return match_len;
3977      else
3978        {
3979          if (match_len > 0)
3980            return 0;
3981          else
3982            return (elem_len > char_len) ? elem_len : char_len;
3983        }
3984    }
3985  return 0;
3986}
3987
3988# ifdef _LIBC
3989static unsigned int
3990internal_function
3991find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3992{
3993  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3994  if (nrules == 0)
3995    {
3996      if (mbs_len == 1)
3997        {
3998          /* No valid character.  Match it as a single byte character.  */
3999          const unsigned char *collseq = (const unsigned char *)
4000            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4001          return collseq[mbs[0]];
4002        }
4003      return UINT_MAX;
4004    }
4005  else
4006    {
4007      int32_t idx;
4008      const unsigned char *extra = (const unsigned char *)
4009        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4010      int32_t extrasize = (const unsigned char *)
4011        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4012
4013      for (idx = 0; idx < extrasize;)
4014        {
4015          int mbs_cnt, found = 0;
4016          int32_t elem_mbs_len;
4017          /* Skip the name of collating element name.  */
4018          idx = idx + extra[idx] + 1;
4019          elem_mbs_len = extra[idx++];
4020          if (mbs_len == elem_mbs_len)
4021            {
4022              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4023                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4024                  break;
4025              if (mbs_cnt == elem_mbs_len)
4026                /* Found the entry.  */
4027                found = 1;
4028            }
4029          /* Skip the byte sequence of the collating element.  */
4030          idx += elem_mbs_len;
4031          /* Adjust for the alignment.  */
4032          idx = (idx + 3) & ~3;
4033          /* Skip the collation sequence value.  */
4034          idx += sizeof (uint32_t);
4035          /* Skip the wide char sequence of the collating element.  */
4036          idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4037          /* If we found the entry, return the sequence value.  */
4038          if (found)
4039            return *(uint32_t *) (extra + idx);
4040          /* Skip the collation sequence value.  */
4041          idx += sizeof (uint32_t);
4042        }
4043      return UINT_MAX;
4044    }
4045}
4046# endif /* _LIBC */
4047#endif /* RE_ENABLE_I18N */
4048
4049/* Check whether the node accepts the byte which is IDX-th
4050   byte of the INPUT.  */
4051
4052static int
4053internal_function
4054check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4055                   int idx)
4056{
4057  unsigned char ch;
4058  ch = re_string_byte_at (&mctx->input, idx);
4059  switch (node->type)
4060    {
4061    case CHARACTER:
4062      if (node->opr.c != ch)
4063        return 0;
4064      break;
4065
4066    case SIMPLE_BRACKET:
4067      if (!bitset_contain (node->opr.sbcset, ch))
4068        return 0;
4069      break;
4070
4071#ifdef RE_ENABLE_I18N
4072    case OP_UTF8_PERIOD:
4073      if (ch >= 0x80)
4074        return 0;
4075      /* FALLTHROUGH */
4076#endif
4077    case OP_PERIOD:
4078      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4079          || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4080        return 0;
4081      break;
4082
4083    default:
4084      return 0;
4085    }
4086
4087  if (node->constraint)
4088    {
4089      /* The node has constraints.  Check whether the current context
4090         satisfies the constraints.  */
4091      unsigned int context = re_string_context_at (&mctx->input, idx,
4092                                                   mctx->eflags);
4093      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4094        return 0;
4095    }
4096
4097  return 1;
4098}
4099
4100/* Extend the buffers, if the buffers have run out.  */
4101
4102static reg_errcode_t
4103internal_function
4104extend_buffers (re_match_context_t *mctx)
4105{
4106  reg_errcode_t ret;
4107  re_string_t *pstr = &mctx->input;
4108
4109  /* Avoid overflow.  */
4110  if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4111    return REG_ESPACE;
4112
4113  /* Double the lengthes of the buffers.  */
4114  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4115  if (BE (ret != REG_NOERROR, 0))
4116    return ret;
4117
4118  if (mctx->state_log != NULL)
4119    {
4120      /* And double the length of state_log.  */
4121      /* XXX We have no indication of the size of this buffer.  If this
4122         allocation fail we have no indication that the state_log array
4123         does not have the right size.  */
4124      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4125                                              pstr->bufs_len + 1);
4126      if (BE (new_array == NULL, 0))
4127        return REG_ESPACE;
4128      mctx->state_log = new_array;
4129    }
4130
4131  /* Then reconstruct the buffers.  */
4132  if (pstr->icase)
4133    {
4134#ifdef RE_ENABLE_I18N
4135      if (pstr->mb_cur_max > 1)
4136        {
4137          ret = build_wcs_upper_buffer (pstr);
4138          if (BE (ret != REG_NOERROR, 0))
4139            return ret;
4140        }
4141      else
4142#endif /* RE_ENABLE_I18N  */
4143        build_upper_buffer (pstr);
4144    }
4145  else
4146    {
4147#ifdef RE_ENABLE_I18N
4148      if (pstr->mb_cur_max > 1)
4149        build_wcs_buffer (pstr);
4150      else
4151#endif /* RE_ENABLE_I18N  */
4152        {
4153          if (pstr->trans != NULL)
4154            re_string_translate_buffer (pstr);
4155        }
4156    }
4157  return REG_NOERROR;
4158}
4159
4160\f
4161/* Functions for matching context.  */
4162
4163/* Initialize MCTX.  */
4164
4165static reg_errcode_t
4166internal_function
4167match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4168{
4169  mctx->eflags = eflags;
4170  mctx->match_last = -1;
4171  if (n > 0)
4172    {
4173      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4174      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4175      if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4176        return REG_ESPACE;
4177    }
4178  /* Already zero-ed by the caller.
4179     else
4180       mctx->bkref_ents = NULL;
4181     mctx->nbkref_ents = 0;
4182     mctx->nsub_tops = 0;  */
4183  mctx->abkref_ents = n;
4184  mctx->max_mb_elem_len = 1;
4185  mctx->asub_tops = n;
4186  return REG_NOERROR;
4187}
4188
4189/* Clean the entries which depend on the current input in MCTX.
4190   This function must be invoked when the matcher changes the start index
4191   of the input, or changes the input string.  */
4192
4193static void
4194internal_function
4195match_ctx_clean (re_match_context_t *mctx)
4196{
4197  int st_idx;
4198  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4199    {
4200      int sl_idx;
4201      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4202      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4203        {
4204          re_sub_match_last_t *last = top->lasts[sl_idx];
4205          re_free (last->path.array);
4206          re_free (last);
4207        }
4208      re_free (top->lasts);
4209      if (top->path)
4210        {
4211          re_free (top->path->array);
4212          re_free (top->path);
4213        }
4214      free (top);
4215    }
4216
4217  mctx->nsub_tops = 0;
4218  mctx->nbkref_ents = 0;
4219}
4220
4221/* Free all the memory associated with MCTX.  */
4222
4223static void
4224internal_function
4225match_ctx_free (re_match_context_t *mctx)
4226{
4227  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4228  match_ctx_clean (mctx);
4229  re_free (mctx->sub_tops);
4230  re_free (mctx->bkref_ents);
4231}
4232
4233/* Add a new backreference entry to MCTX.
4234   Note that we assume that caller never call this function with duplicate
4235   entry, and call with STR_IDX which isn't smaller than any existing entry.
4236*/
4237
4238static reg_errcode_t
4239internal_function
4240match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4241                     int to)
4242{
4243  if (mctx->nbkref_ents >= mctx->abkref_ents)
4244    {
4245      struct re_backref_cache_entry* new_entry;
4246      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4247                              mctx->abkref_ents * 2);
4248      if (BE (new_entry == NULL, 0))
4249        {
4250          re_free (mctx->bkref_ents);
4251          return REG_ESPACE;
4252        }
4253      mctx->bkref_ents = new_entry;
4254      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4255              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4256      mctx->abkref_ents *= 2;
4257    }
4258  if (mctx->nbkref_ents > 0
4259      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4260    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4261
4262  mctx->bkref_ents[mctx->nbkref_ents].node = node;
4263  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4264  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4265  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4266
4267  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4268     If bit N is clear, means that this entry won't epsilon-transition to
4269     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4270     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4271     such node.
4272
4273     A backreference does not epsilon-transition unless it is empty, so set
4274     to all zeros if FROM != TO.  */
4275  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4276    = (from == to ? ~0 : 0);
4277
4278  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4279  if (mctx->max_mb_elem_len < to - from)
4280    mctx->max_mb_elem_len = to - from;
4281  return REG_NOERROR;
4282}
4283
4284/* Search for the first entry which has the same str_idx, or -1 if none is
4285   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4286
4287static int
4288internal_function
4289search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4290{
4291  int left, right, mid, last;
4292  last = right = mctx->nbkref_ents;
4293  for (left = 0; left < right;)
4294    {
4295      mid = (left + right) / 2;
4296      if (mctx->bkref_ents[mid].str_idx < str_idx)
4297        left = mid + 1;
4298      else
4299        right = mid;
4300    }
4301  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4302    return left;
4303  else
4304    return -1;
4305}
4306
4307/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4308   at STR_IDX.  */
4309
4310static reg_errcode_t
4311internal_function
4312match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4313{
4314#ifdef DEBUG
4315  assert (mctx->sub_tops != NULL);
4316  assert (mctx->asub_tops > 0);
4317#endif
4318  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4319    {
4320      int new_asub_tops = mctx->asub_tops * 2;
4321      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4322                                                   re_sub_match_top_t *,
4323                                                   new_asub_tops);
4324      if (BE (new_array == NULL, 0))
4325        return REG_ESPACE;
4326      mctx->sub_tops = new_array;
4327      mctx->asub_tops = new_asub_tops;
4328    }
4329  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4330  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4331    return REG_ESPACE;
4332  mctx->sub_tops[mctx->nsub_tops]->node = node;
4333  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4334  return REG_NOERROR;
4335}
4336
4337/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4338   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4339
4340static re_sub_match_last_t *
4341internal_function
4342match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4343{
4344  re_sub_match_last_t *new_entry;
4345  if (BE (subtop->nlasts == subtop->alasts, 0))
4346    {
4347      int new_alasts = 2 * subtop->alasts + 1;
4348      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4349                                                    re_sub_match_last_t *,
4350                                                    new_alasts);
4351      if (BE (new_array == NULL, 0))
4352        return NULL;
4353      subtop->lasts = new_array;
4354      subtop->alasts = new_alasts;
4355    }
4356  new_entry = calloc (1, sizeof (re_sub_match_last_t));
4357  if (BE (new_entry != NULL, 1))
4358    {
4359      subtop->lasts[subtop->nlasts] = new_entry;
4360      new_entry->node = node;
4361      new_entry->str_idx = str_idx;
4362      ++subtop->nlasts;
4363    }
4364  return new_entry;
4365}
4366
4367static void
4368internal_function
4369sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4370               re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4371{
4372  sctx->sifted_states = sifted_sts;
4373  sctx->limited_states = limited_sts;
4374  sctx->last_node = last_node;
4375  sctx->last_str_idx = last_str_idx;
4376  re_node_set_init_empty (&sctx->limits);
4377}