Andrew's git - gitweb.git/blob - compat/regex/regexec.c

   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                     int n) internal_function;
  23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24static void match_ctx_free (re_match_context_t *cache) internal_function;
  25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                          int str_idx, int from, int to)
  27     internal_function;
  28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29     internal_function;
  30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                           int str_idx) internal_function;
  32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                   int node, int str_idx)
  34     internal_function;
  35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                           re_dfastate_t **limited_sts, int last_node,
  37                           int last_str_idx)
  38     internal_function;
  39static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                         const char *string, int length,
  41                                         int start, int range, int stop,
  42                                         size_t nmatch, regmatch_t pmatch[],
  43                                         int eflags) internal_function;
  44static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                             const char *string1, int length1,
  46                             const char *string2, int length2,
  47                             int start, int range, struct re_registers *regs,
  48                             int stop, int ret_len) internal_function;
  49static int re_search_stub (struct re_pattern_buffer *bufp,
  50                           const char *string, int length, int start,
  51                           int range, int stop, struct re_registers *regs,
  52                           int ret_len) internal_function;
  53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                              int nregs, int regs_allocated) internal_function;
  55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56     internal_function;
  57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                           int *p_match_first) internal_function;
  59static int check_halt_state_context (const re_match_context_t *mctx,
  60                                     const re_dfastate_t *state, int idx)
  61     internal_function;
  62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                         regmatch_t *prev_idx_match, int cur_node,
  64                         int cur_idx, int nmatch) internal_function;
  65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                      int str_idx, int dest_node, int nregs,
  67                                      regmatch_t *regs,
  68                                      re_node_set *eps_via_nodes)
  69     internal_function;
  70static reg_errcode_t set_regs (const regex_t *preg,
  71                               const re_match_context_t *mctx,
  72                               size_t nmatch, regmatch_t *pmatch,
  73                               int fl_backtrack) internal_function;
  74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75     internal_function;
  76
  77#ifdef RE_ENABLE_I18N
  78static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                re_sift_context_t *sctx,
  80                                int node_idx, int str_idx, int max_str_idx)
  81     internal_function;
  82#endif /* RE_ENABLE_I18N */
  83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                           re_sift_context_t *sctx)
  85     internal_function;
  86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                          re_sift_context_t *sctx, int str_idx,
  88                                          re_node_set *cur_dest)
  89     internal_function;
  90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                              re_sift_context_t *sctx,
  92                                              int str_idx,
  93                                              re_node_set *dest_nodes)
  94     internal_function;
  95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                            re_node_set *dest_nodes,
  97                                            const re_node_set *candidates)
  98     internal_function;
  99static int check_dst_limits (const re_match_context_t *mctx,
 100                             re_node_set *limits,
 101                             int dst_node, int dst_idx, int src_node,
 102                             int src_idx) internal_function;
 103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                        int boundaries, int subexp_idx,
 105                                        int from_node, int bkref_idx)
 106     internal_function;
 107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                      int limit, int subexp_idx,
 109                                      int node, int str_idx,
 110                                      int bkref_idx) internal_function;
 111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                          re_node_set *dest_nodes,
 113                                          const re_node_set *candidates,
 114                                          re_node_set *limits,
 115                                          struct re_backref_cache_entry *bkref_ents,
 116                                          int str_idx) internal_function;
 117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                        re_sift_context_t *sctx,
 119                                        int str_idx, const re_node_set *candidates)
 120     internal_function;
 121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                        re_dfastate_t **dst,
 123                                        re_dfastate_t **src, int num)
 124     internal_function;
 125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                         re_match_context_t *mctx) internal_function;
 127static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                     re_match_context_t *mctx,
 129                                     re_dfastate_t *state) internal_function;
 130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                            re_match_context_t *mctx,
 132                                            re_dfastate_t *next_state)
 133     internal_function;
 134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                re_node_set *cur_nodes,
 136                                                int str_idx) internal_function;
 137#if 0
 138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                        re_match_context_t *mctx,
 140                                        re_dfastate_t *pstate)
 141     internal_function;
 142#endif
 143#ifdef RE_ENABLE_I18N
 144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                       re_dfastate_t *pstate)
 146     internal_function;
 147#endif /* RE_ENABLE_I18N */
 148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                          const re_node_set *nodes)
 150     internal_function;
 151static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                 int bkref_node, int bkref_str_idx)
 153     internal_function;
 154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                     const re_sub_match_top_t *sub_top,
 156                                     re_sub_match_last_t *sub_last,
 157                                     int bkref_node, int bkref_str)
 158     internal_function;
 159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                             int subexp_idx, int type) internal_function;
 161static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                    state_array_t *path, int top_node,
 163                                    int top_str, int last_node, int last_str,
 164                                    int type) internal_function;
 165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                   int str_idx,
 167                                                   re_node_set *cur_nodes,
 168                                                   re_node_set *next_nodes)
 169     internal_function;
 170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                               re_node_set *cur_nodes,
 172                                               int ex_subexp, int type)
 173     internal_function;
 174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                   re_node_set *dst_nodes,
 176                                                   int target, int ex_subexp,
 177                                                   int type) internal_function;
 178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                         re_node_set *cur_nodes, int cur_str,
 180                                         int subexp_num, int type)
 181     internal_function;
 182static int build_trtable (const re_dfa_t *dfa,
 183                          re_dfastate_t *state) internal_function;
 184#ifdef RE_ENABLE_I18N
 185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                    const re_string_t *input, int idx)
 187     internal_function;
 188# ifdef _LIBC
 189static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                   size_t name_len)
 191     internal_function;
 192# endif /* _LIBC */
 193#endif /* RE_ENABLE_I18N */
 194static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                       const re_dfastate_t *state,
 196                                       re_node_set *states_node,
 197                                       bitset_t *states_ch) internal_function;
 198static int check_node_accept (const re_match_context_t *mctx,
 199                              const re_token_t *node, int idx)
 200     internal_function;
 201static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202     internal_function;
 203\f
 204/* Entry point for POSIX code.  */
 205
 206/* regexec searches for a given pattern, specified by PREG, in the
 207   string STRING.
 208
 209   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211   least NMATCH elements, and we set them to the offsets of the
 212   corresponding matched substrings.
 213
 214   EFLAGS specifies `execution flags' which affect matching: if
 215   REG_NOTBOL is set, then ^ does not match at the beginning of the
 216   string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218   We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220int
 221regexec (preg, string, nmatch, pmatch, eflags)
 222    const regex_t *__restrict preg;
 223    const char *__restrict string;
 224    size_t nmatch;
 225    regmatch_t pmatch[];
 226    int eflags;
 227{
 228  reg_errcode_t err;
 229  int start, length;
 230
 231  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 232    return REG_BADPAT;
 233
 234  if (eflags & REG_STARTEND)
 235    {
 236      start = pmatch[0].rm_so;
 237      length = pmatch[0].rm_eo;
 238    }
 239  else
 240    {
 241      start = 0;
 242      length = strlen (string);
 243    }
 244
 245  __libc_lock_lock (dfa->lock);
 246  if (preg->no_sub)
 247    err = re_search_internal (preg, string, length, start, length - start,
 248                              length, 0, NULL, eflags);
 249  else
 250    err = re_search_internal (preg, string, length, start, length - start,
 251                              length, nmatch, pmatch, eflags);
 252  __libc_lock_unlock (dfa->lock);
 253  return err != REG_NOERROR;
 254}
 255
 256#ifdef _LIBC
 257# include <shlib-compat.h>
 258versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 259
 260# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 261__typeof__ (__regexec) __compat_regexec;
 262
 263int
 264attribute_compat_text_section
 265__compat_regexec (const regex_t *__restrict preg,
 266                  const char *__restrict string, size_t nmatch,
 267                  regmatch_t pmatch[], int eflags)
 268{
 269  return regexec (preg, string, nmatch, pmatch,
 270                  eflags & (REG_NOTBOL | REG_NOTEOL));
 271}
 272compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 273# endif
 274#endif
 275
 276/* Entry points for GNU code.  */
 277
 278/* re_match, re_search, re_match_2, re_search_2
 279
 280   The former two functions operate on STRING with length LENGTH,
 281   while the later two operate on concatenation of STRING1 and STRING2
 282   with lengths LENGTH1 and LENGTH2, respectively.
 283
 284   re_match() matches the compiled pattern in BUFP against the string,
 285   starting at index START.
 286
 287   re_search() first tries matching at index START, then it tries to match
 288   starting from index START + 1, and so on.  The last start position tried
 289   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 290   way as re_match().)
 291
 292   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 293   the first STOP characters of the concatenation of the strings should be
 294   concerned.
 295
 296   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 297   and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 298   computed relative to the concatenation, not relative to the individual
 299   strings.)
 300
 301   On success, re_match* functions return the length of the match, re_search*
 302   return the position of the start of the match.  Return value -1 means no
 303   match was found and -2 indicates an internal error.  */
 304
 305int
 306re_match (bufp, string, length, start, regs)
 307    struct re_pattern_buffer *bufp;
 308    const char *string;
 309    int length, start;
 310    struct re_registers *regs;
 311{
 312  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 313}
 314#ifdef _LIBC
 315weak_alias (__re_match, re_match)
 316#endif
 317
 318int
 319re_search (bufp, string, length, start, range, regs)
 320    struct re_pattern_buffer *bufp;
 321    const char *string;
 322    int length, start, range;
 323    struct re_registers *regs;
 324{
 325  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 326}
 327#ifdef _LIBC
 328weak_alias (__re_search, re_search)
 329#endif
 330
 331int
 332re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 333    struct re_pattern_buffer *bufp;
 334    const char *string1, *string2;
 335    int length1, length2, start, stop;
 336    struct re_registers *regs;
 337{
 338  return re_search_2_stub (bufp, string1, length1, string2, length2,
 339                           start, 0, regs, stop, 1);
 340}
 341#ifdef _LIBC
 342weak_alias (__re_match_2, re_match_2)
 343#endif
 344
 345int
 346re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 347    struct re_pattern_buffer *bufp;
 348    const char *string1, *string2;
 349    int length1, length2, start, range, stop;
 350    struct re_registers *regs;
 351{
 352  return re_search_2_stub (bufp, string1, length1, string2, length2,
 353                           start, range, regs, stop, 0);
 354}
 355#ifdef _LIBC
 356weak_alias (__re_search_2, re_search_2)
 357#endif
 358
 359static int
 360re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 361                  stop, ret_len)
 362    struct re_pattern_buffer *bufp;
 363    const char *string1, *string2;
 364    int length1, length2, start, range, stop, ret_len;
 365    struct re_registers *regs;
 366{
 367  const char *str;
 368  int rval;
 369  int len = length1 + length2;
 370  int free_str = 0;
 371
 372  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 373    return -2;
 374
 375  /* Concatenate the strings.  */
 376  if (length2 > 0)
 377    if (length1 > 0)
 378      {
 379        char *s = re_malloc (char, len);
 380
 381        if (BE (s == NULL, 0))
 382          return -2;
 383        memcpy (s, string1, length1);
 384        memcpy (s + length1, string2, length2);
 385        str = s;
 386        free_str = 1;
 387      }
 388    else
 389      str = string2;
 390  else
 391    str = string1;
 392
 393  rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 394  if (free_str)
 395    re_free ((char *) str);
 396  return rval;
 397}
 398
 399/* The parameters have the same meaning as those of re_search.
 400   Additional parameters:
 401   If RET_LEN is nonzero the length of the match is returned (re_match style);
 402   otherwise the position of the match is returned.  */
 403
 404static int
 405re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 406    struct re_pattern_buffer *bufp;
 407    const char *string;
 408    int length, start, range, stop, ret_len;
 409    struct re_registers *regs;
 410{
 411  reg_errcode_t result;
 412  regmatch_t *pmatch;
 413  int nregs, rval;
 414  int eflags = 0;
 415
 416  /* Check for out-of-range.  */
 417  if (BE (start < 0 || start > length, 0))
 418    return -1;
 419  if (BE (start + range > length, 0))
 420    range = length - start;
 421  else if (BE (start + range < 0, 0))
 422    range = -start;
 423
 424  __libc_lock_lock (dfa->lock);
 425
 426  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 427  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 428
 429  /* Compile fastmap if we haven't yet.  */
 430  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 431    re_compile_fastmap (bufp);
 432
 433  if (BE (bufp->no_sub, 0))
 434    regs = NULL;
 435
 436  /* We need at least 1 register.  */
 437  if (regs == NULL)
 438    nregs = 1;
 439  else if (BE (bufp->regs_allocated == REGS_FIXED &&
 440               regs->num_regs < bufp->re_nsub + 1, 0))
 441    {
 442      nregs = regs->num_regs;
 443      if (BE (nregs < 1, 0))
 444        {
 445          /* Nothing can be copied to regs.  */
 446          regs = NULL;
 447          nregs = 1;
 448        }
 449    }
 450  else
 451    nregs = bufp->re_nsub + 1;
 452  pmatch = re_malloc (regmatch_t, nregs);
 453  if (BE (pmatch == NULL, 0))
 454    {
 455      rval = -2;
 456      goto out;
 457    }
 458
 459  result = re_search_internal (bufp, string, length, start, range, stop,
 460                               nregs, pmatch, eflags);
 461
 462  rval = 0;
 463
 464  /* I hope we needn't fill ther regs with -1's when no match was found.  */
 465  if (result != REG_NOERROR)
 466    rval = -1;
 467  else if (regs != NULL)
 468    {
 469      /* If caller wants register contents data back, copy them.  */
 470      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 471                                           bufp->regs_allocated);
 472      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 473        rval = -2;
 474    }
 475
 476  if (BE (rval == 0, 1))
 477    {
 478      if (ret_len)
 479        {
 480          assert (pmatch[0].rm_so == start);
 481          rval = pmatch[0].rm_eo - start;
 482        }
 483      else
 484        rval = pmatch[0].rm_so;
 485    }
 486  re_free (pmatch);
 487 out:
 488  __libc_lock_unlock (dfa->lock);
 489  return rval;
 490}
 491
 492static unsigned
 493re_copy_regs (regs, pmatch, nregs, regs_allocated)
 494    struct re_registers *regs;
 495    regmatch_t *pmatch;
 496    int nregs, regs_allocated;
 497{
 498  int rval = REGS_REALLOCATE;
 499  int i;
 500  int need_regs = nregs + 1;
 501  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 502     uses.  */
 503
 504  /* Have the register data arrays been allocated?  */
 505  if (regs_allocated == REGS_UNALLOCATED)
 506    { /* No.  So allocate them with malloc.  */
 507      regs->start = re_malloc (regoff_t, need_regs);
 508      if (BE (regs->start == NULL, 0))
 509        return REGS_UNALLOCATED;
 510      regs->end = re_malloc (regoff_t, need_regs);
 511      if (BE (regs->end == NULL, 0))
 512        {
 513          re_free (regs->start);
 514          return REGS_UNALLOCATED;
 515        }
 516      regs->num_regs = need_regs;
 517    }
 518  else if (regs_allocated == REGS_REALLOCATE)
 519    { /* Yes.  If we need more elements than were already
 520         allocated, reallocate them.  If we need fewer, just
 521         leave it alone.  */
 522      if (BE (need_regs > regs->num_regs, 0))
 523        {
 524          regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 525          regoff_t *new_end;
 526          if (BE (new_start == NULL, 0))
 527            return REGS_UNALLOCATED;
 528          new_end = re_realloc (regs->end, regoff_t, need_regs);
 529          if (BE (new_end == NULL, 0))
 530            {
 531              re_free (new_start);
 532              return REGS_UNALLOCATED;
 533            }
 534          regs->start = new_start;
 535          regs->end = new_end;
 536          regs->num_regs = need_regs;
 537        }
 538    }
 539  else
 540    {
 541      assert (regs_allocated == REGS_FIXED);
 542      /* This function may not be called with REGS_FIXED and nregs too big.  */
 543      assert (regs->num_regs >= nregs);
 544      rval = REGS_FIXED;
 545    }
 546
 547  /* Copy the regs.  */
 548  for (i = 0; i < nregs; ++i)
 549    {
 550      regs->start[i] = pmatch[i].rm_so;
 551      regs->end[i] = pmatch[i].rm_eo;
 552    }
 553  for ( ; i < regs->num_regs; ++i)
 554    regs->start[i] = regs->end[i] = -1;
 555
 556  return rval;
 557}
 558
 559/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 560   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 561   this memory for recording register information.  STARTS and ENDS
 562   must be allocated using the malloc library routine, and must each
 563   be at least NUM_REGS * sizeof (regoff_t) bytes long.
 564
 565   If NUM_REGS == 0, then subsequent matches should allocate their own
 566   register data.
 567
 568   Unless this function is called, the first search or match using
 569   PATTERN_BUFFER will allocate its own register data, without
 570   freeing the old data.  */
 571
 572void
 573re_set_registers (bufp, regs, num_regs, starts, ends)
 574    struct re_pattern_buffer *bufp;
 575    struct re_registers *regs;
 576    unsigned num_regs;
 577    regoff_t *starts, *ends;
 578{
 579  if (num_regs)
 580    {
 581      bufp->regs_allocated = REGS_REALLOCATE;
 582      regs->num_regs = num_regs;
 583      regs->start = starts;
 584      regs->end = ends;
 585    }
 586  else
 587    {
 588      bufp->regs_allocated = REGS_UNALLOCATED;
 589      regs->num_regs = 0;
 590      regs->start = regs->end = (regoff_t *) 0;
 591    }
 592}
 593#ifdef _LIBC
 594weak_alias (__re_set_registers, re_set_registers)
 595#endif
 596\f
 597/* Entry points compatible with 4.2 BSD regex library.  We don't define
 598   them unless specifically requested.  */
 599
 600#if defined _REGEX_RE_COMP || defined _LIBC
 601int
 602# ifdef _LIBC
 603weak_function
 604# endif
 605re_exec (s)
 606     const char *s;
 607{
 608  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 609}
 610#endif /* _REGEX_RE_COMP */
 611\f
 612/* Internal entry point.  */
 613
 614/* Searches for a compiled pattern PREG in the string STRING, whose
 615   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 616   mingings with regexec.  START, and RANGE have the same meanings
 617   with re_search.
 618   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 619   otherwise return the error code.
 620   Note: We assume front end functions already check ranges.
 621   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 622
 623static reg_errcode_t
 624re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 625                    eflags)
 626    const regex_t *preg;
 627    const char *string;
 628    int length, start, range, stop, eflags;
 629    size_t nmatch;
 630    regmatch_t pmatch[];
 631{
 632  reg_errcode_t err;
 633  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 634  int left_lim, right_lim, incr;
 635  int fl_longest_match, match_first, match_kind, match_last = -1;
 636  int extra_nmatch;
 637  int sb, ch;
 638#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 639  re_match_context_t mctx = { .dfa = dfa };
 640#else
 641  re_match_context_t mctx;
 642#endif
 643  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 644                   && range && !preg->can_be_null) ? preg->fastmap : NULL;
 645  RE_TRANSLATE_TYPE t = preg->translate;
 646
 647#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 648  memset (&mctx, '\0', sizeof (re_match_context_t));
 649  mctx.dfa = dfa;
 650#endif
 651
 652  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 653  nmatch -= extra_nmatch;
 654
 655  /* Check if the DFA haven't been compiled.  */
 656  if (BE (preg->used == 0 || dfa->init_state == NULL
 657          || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 658          || dfa->init_state_begbuf == NULL, 0))
 659    return REG_NOMATCH;
 660
 661#ifdef DEBUG
 662  /* We assume front-end functions already check them.  */
 663  assert (start + range >= 0 && start + range <= length);
 664#endif
 665
 666  /* If initial states with non-begbuf contexts have no elements,
 667     the regex must be anchored.  If preg->newline_anchor is set,
 668     we'll never use init_state_nl, so do not check it.  */
 669  if (dfa->init_state->nodes.nelem == 0
 670      && dfa->init_state_word->nodes.nelem == 0
 671      && (dfa->init_state_nl->nodes.nelem == 0
 672          || !preg->newline_anchor))
 673    {
 674      if (start != 0 && start + range != 0)
 675        return REG_NOMATCH;
 676      start = range = 0;
 677    }
 678
 679  /* We must check the longest matching, if nmatch > 0.  */
 680  fl_longest_match = (nmatch != 0 || dfa->nbackref);
 681
 682  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 683                            preg->translate, preg->syntax & RE_ICASE, dfa);
 684  if (BE (err != REG_NOERROR, 0))
 685    goto free_return;
 686  mctx.input.stop = stop;
 687  mctx.input.raw_stop = stop;
 688  mctx.input.newline_anchor = preg->newline_anchor;
 689
 690  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 691  if (BE (err != REG_NOERROR, 0))
 692    goto free_return;
 693
 694  /* We will log all the DFA states through which the dfa pass,
 695     if nmatch > 1, or this dfa has "multibyte node", which is a
 696     back-reference or a node which can accept multibyte character or
 697     multi character collating element.  */
 698  if (nmatch > 1 || dfa->has_mb_node)
 699    {
 700      /* Avoid overflow.  */
 701      if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 702        {
 703          err = REG_ESPACE;
 704          goto free_return;
 705        }
 706
 707      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 708      if (BE (mctx.state_log == NULL, 0))
 709        {
 710          err = REG_ESPACE;
 711          goto free_return;
 712        }
 713    }
 714  else
 715    mctx.state_log = NULL;
 716
 717  match_first = start;
 718  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 719                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 720
 721  /* Check incrementally whether of not the input string match.  */
 722  incr = (range < 0) ? -1 : 1;
 723  left_lim = (range < 0) ? start + range : start;
 724  right_lim = (range < 0) ? start : start + range;
 725  sb = dfa->mb_cur_max == 1;
 726  match_kind =
 727    (fastmap
 728     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 729        | (range >= 0 ? 2 : 0)
 730        | (t != NULL ? 1 : 0))
 731     : 8);
 732
 733  for (;; match_first += incr)
 734    {
 735      err = REG_NOMATCH;
 736      if (match_first < left_lim || right_lim < match_first)
 737        goto free_return;
 738
 739      /* Advance as rapidly as possible through the string, until we
 740         find a plausible place to start matching.  This may be done
 741         with varying efficiency, so there are various possibilities:
 742         only the most common of them are specialized, in order to
 743         save on code size.  We use a switch statement for speed.  */
 744      switch (match_kind)
 745        {
 746        case 8:
 747          /* No fastmap.  */
 748          break;
 749
 750        case 7:
 751          /* Fastmap with single-byte translation, match forward.  */
 752          while (BE (match_first < right_lim, 1)
 753                 && !fastmap[t[(unsigned char) string[match_first]]])
 754            ++match_first;
 755          goto forward_match_found_start_or_reached_end;
 756
 757        case 6:
 758          /* Fastmap without translation, match forward.  */
 759          while (BE (match_first < right_lim, 1)
 760                 && !fastmap[(unsigned char) string[match_first]])
 761            ++match_first;
 762
 763        forward_match_found_start_or_reached_end:
 764          if (BE (match_first == right_lim, 0))
 765            {
 766              ch = match_first >= length
 767                       ? 0 : (unsigned char) string[match_first];
 768              if (!fastmap[t ? t[ch] : ch])
 769                goto free_return;
 770            }
 771          break;
 772
 773        case 4:
 774        case 5:
 775          /* Fastmap without multi-byte translation, match backwards.  */
 776          while (match_first >= left_lim)
 777            {
 778              ch = match_first >= length
 779                       ? 0 : (unsigned char) string[match_first];
 780              if (fastmap[t ? t[ch] : ch])
 781                break;
 782              --match_first;
 783            }
 784          if (match_first < left_lim)
 785            goto free_return;
 786          break;
 787
 788        default:
 789          /* In this case, we can't determine easily the current byte,
 790             since it might be a component byte of a multibyte
 791             character.  Then we use the constructed buffer instead.  */
 792          for (;;)
 793            {
 794              /* If MATCH_FIRST is out of the valid range, reconstruct the
 795                 buffers.  */
 796              unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 797              if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 798                {
 799                  err = re_string_reconstruct (&mctx.input, match_first,
 800                                               eflags);
 801                  if (BE (err != REG_NOERROR, 0))
 802                    goto free_return;
 803
 804                  offset = match_first - mctx.input.raw_mbs_idx;
 805                }
 806              /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 807                 Note that MATCH_FIRST must not be smaller than 0.  */
 808              ch = (match_first >= length
 809                    ? 0 : re_string_byte_at (&mctx.input, offset));
 810              if (fastmap[ch])
 811                break;
 812              match_first += incr;
 813              if (match_first < left_lim || match_first > right_lim)
 814                {
 815                  err = REG_NOMATCH;
 816                  goto free_return;
 817                }
 818            }
 819          break;
 820        }
 821
 822      /* Reconstruct the buffers so that the matcher can assume that
 823         the matching starts from the beginning of the buffer.  */
 824      err = re_string_reconstruct (&mctx.input, match_first, eflags);
 825      if (BE (err != REG_NOERROR, 0))
 826        goto free_return;
 827
 828#ifdef RE_ENABLE_I18N
 829     /* Don't consider this char as a possible match start if it part,
 830        yet isn't the head, of a multibyte character.  */
 831      if (!sb && !re_string_first_byte (&mctx.input, 0))
 832        continue;
 833#endif
 834
 835      /* It seems to be appropriate one, then use the matcher.  */
 836      /* We assume that the matching starts from 0.  */
 837      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 838      match_last = check_matching (&mctx, fl_longest_match,
 839                                   range >= 0 ? &match_first : NULL);
 840      if (match_last != -1)
 841        {
 842          if (BE (match_last == -2, 0))
 843            {
 844              err = REG_ESPACE;
 845              goto free_return;
 846            }
 847          else
 848            {
 849              mctx.match_last = match_last;
 850              if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 851                {
 852                  re_dfastate_t *pstate = mctx.state_log[match_last];
 853                  mctx.last_node = check_halt_state_context (&mctx, pstate,
 854                                                             match_last);
 855                }
 856              if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 857                  || dfa->nbackref)
 858                {
 859                  err = prune_impossible_nodes (&mctx);
 860                  if (err == REG_NOERROR)
 861                    break;
 862                  if (BE (err != REG_NOMATCH, 0))
 863                    goto free_return;
 864                  match_last = -1;
 865                }
 866              else
 867                break; /* We found a match.  */
 868            }
 869        }
 870
 871      match_ctx_clean (&mctx);
 872    }
 873
 874#ifdef DEBUG
 875  assert (match_last != -1);
 876  assert (err == REG_NOERROR);
 877#endif
 878
 879  /* Set pmatch[] if we need.  */
 880  if (nmatch > 0)
 881    {
 882      int reg_idx;
 883
 884      /* Initialize registers.  */
 885      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 886        pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 887
 888      /* Set the points where matching start/end.  */
 889      pmatch[0].rm_so = 0;
 890      pmatch[0].rm_eo = mctx.match_last;
 891
 892      if (!preg->no_sub && nmatch > 1)
 893        {
 894          err = set_regs (preg, &mctx, nmatch, pmatch,
 895                          dfa->has_plural_match && dfa->nbackref > 0);
 896          if (BE (err != REG_NOERROR, 0))
 897            goto free_return;
 898        }
 899
 900      /* At last, add the offset to the each registers, since we slided
 901         the buffers so that we could assume that the matching starts
 902         from 0.  */
 903      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 904        if (pmatch[reg_idx].rm_so != -1)
 905          {
 906#ifdef RE_ENABLE_I18N
 907            if (BE (mctx.input.offsets_needed != 0, 0))
 908              {
 909                pmatch[reg_idx].rm_so =
 910                  (pmatch[reg_idx].rm_so == mctx.input.valid_len
 911                   ? mctx.input.valid_raw_len
 912                   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 913                pmatch[reg_idx].rm_eo =
 914                  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 915                   ? mctx.input.valid_raw_len
 916                   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 917              }
 918#else
 919            assert (mctx.input.offsets_needed == 0);
 920#endif
 921            pmatch[reg_idx].rm_so += match_first;
 922            pmatch[reg_idx].rm_eo += match_first;
 923          }
 924      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 925        {
 926          pmatch[nmatch + reg_idx].rm_so = -1;
 927          pmatch[nmatch + reg_idx].rm_eo = -1;
 928        }
 929
 930      if (dfa->subexp_map)
 931        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 932          if (dfa->subexp_map[reg_idx] != reg_idx)
 933            {
 934              pmatch[reg_idx + 1].rm_so
 935                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 936              pmatch[reg_idx + 1].rm_eo
 937                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 938            }
 939    }
 940
 941 free_return:
 942  re_free (mctx.state_log);
 943  if (dfa->nbackref)
 944    match_ctx_free (&mctx);
 945  re_string_destruct (&mctx.input);
 946  return err;
 947}
 948
 949static reg_errcode_t
 950prune_impossible_nodes (mctx)
 951     re_match_context_t *mctx;
 952{
 953  const re_dfa_t *const dfa = mctx->dfa;
 954  int halt_node, match_last;
 955  reg_errcode_t ret;
 956  re_dfastate_t **sifted_states;
 957  re_dfastate_t **lim_states = NULL;
 958  re_sift_context_t sctx;
 959#ifdef DEBUG
 960  assert (mctx->state_log != NULL);
 961#endif
 962  match_last = mctx->match_last;
 963  halt_node = mctx->last_node;
 964
 965  /* Avoid overflow.  */
 966  if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 967    return REG_ESPACE;
 968
 969  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 970  if (BE (sifted_states == NULL, 0))
 971    {
 972      ret = REG_ESPACE;
 973      goto free_return;
 974    }
 975  if (dfa->nbackref)
 976    {
 977      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 978      if (BE (lim_states == NULL, 0))
 979        {
 980          ret = REG_ESPACE;
 981          goto free_return;
 982        }
 983      while (1)
 984        {
 985          memset (lim_states, '\0',
 986                  sizeof (re_dfastate_t *) * (match_last + 1));
 987          sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 988                         match_last);
 989          ret = sift_states_backward (mctx, &sctx);
 990          re_node_set_free (&sctx.limits);
 991          if (BE (ret != REG_NOERROR, 0))
 992              goto free_return;
 993          if (sifted_states[0] != NULL || lim_states[0] != NULL)
 994            break;
 995          do
 996            {
 997              --match_last;
 998              if (match_last < 0)
 999                {
1000                  ret = REG_NOMATCH;
1001                  goto free_return;
1002                }
1003            } while (mctx->state_log[match_last] == NULL
1004                     || !mctx->state_log[match_last]->halt);
1005          halt_node = check_halt_state_context (mctx,
1006                                                mctx->state_log[match_last],
1007                                                match_last);
1008        }
1009      ret = merge_state_array (dfa, sifted_states, lim_states,
1010                               match_last + 1);
1011      re_free (lim_states);
1012      lim_states = NULL;
1013      if (BE (ret != REG_NOERROR, 0))
1014        goto free_return;
1015    }
1016  else
1017    {
1018      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019      ret = sift_states_backward (mctx, &sctx);
1020      re_node_set_free (&sctx.limits);
1021      if (BE (ret != REG_NOERROR, 0))
1022        goto free_return;
1023      if (sifted_states[0] == NULL)
1024        {
1025          ret = REG_NOMATCH;
1026          goto free_return;
1027        }
1028    }
1029  re_free (mctx->state_log);
1030  mctx->state_log = sifted_states;
1031  sifted_states = NULL;
1032  mctx->last_node = halt_node;
1033  mctx->match_last = match_last;
1034  ret = REG_NOERROR;
1035 free_return:
1036  re_free (sifted_states);
1037  re_free (lim_states);
1038  return ret;
1039}
1040
1041/* Acquire an initial state and return it.
1042   We must select appropriate initial state depending on the context,
1043   since initial states may have constraints like "\<", "^", etc..  */
1044
1045static inline re_dfastate_t *
1046__attribute ((always_inline)) internal_function
1047acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1048                            int idx)
1049{
1050  const re_dfa_t *const dfa = mctx->dfa;
1051  if (dfa->init_state->has_constraint)
1052    {
1053      unsigned int context;
1054      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1055      if (IS_WORD_CONTEXT (context))
1056        return dfa->init_state_word;
1057      else if (IS_ORDINARY_CONTEXT (context))
1058        return dfa->init_state;
1059      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1060        return dfa->init_state_begbuf;
1061      else if (IS_NEWLINE_CONTEXT (context))
1062        return dfa->init_state_nl;
1063      else if (IS_BEGBUF_CONTEXT (context))
1064        {
1065          /* It is relatively rare case, then calculate on demand.  */
1066          return re_acquire_state_context (err, dfa,
1067                                           dfa->init_state->entrance_nodes,
1068                                           context);
1069        }
1070      else
1071        /* Must not happen?  */
1072        return dfa->init_state;
1073    }
1074  else
1075    return dfa->init_state;
1076}
1077
1078/* Check whether the regular expression match input string INPUT or not,
1079   and return the index where the matching end, return -1 if not match,
1080   or return -2 in case of an error.
1081   FL_LONGEST_MATCH means we want the POSIX longest matching.
1082   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1083   next place where we may want to try matching.
1084   Note that the matcher assume that the maching starts from the current
1085   index of the buffer.  */
1086
1087static int
1088internal_function
1089check_matching (re_match_context_t *mctx, int fl_longest_match,
1090                int *p_match_first)
1091{
1092  const re_dfa_t *const dfa = mctx->dfa;
1093  reg_errcode_t err;
1094  int match = 0;
1095  int match_last = -1;
1096  int cur_str_idx = re_string_cur_idx (&mctx->input);
1097  re_dfastate_t *cur_state;
1098  int at_init_state = p_match_first != NULL;
1099  int next_start_idx = cur_str_idx;
1100
1101  err = REG_NOERROR;
1102  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1103  /* An initial state must not be NULL (invalid).  */
1104  if (BE (cur_state == NULL, 0))
1105    {
1106      assert (err == REG_ESPACE);
1107      return -2;
1108    }
1109
1110  if (mctx->state_log != NULL)
1111    {
1112      mctx->state_log[cur_str_idx] = cur_state;
1113
1114      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115         later.  E.g. Processing back references.  */
1116      if (BE (dfa->nbackref, 0))
1117        {
1118          at_init_state = 0;
1119          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1120          if (BE (err != REG_NOERROR, 0))
1121            return err;
1122
1123          if (cur_state->has_backref)
1124            {
1125              err = transit_state_bkref (mctx, &cur_state->nodes);
1126              if (BE (err != REG_NOERROR, 0))
1127                return err;
1128            }
1129        }
1130    }
1131
1132  /* If the RE accepts NULL string.  */
1133  if (BE (cur_state->halt, 0))
1134    {
1135      if (!cur_state->has_constraint
1136          || check_halt_state_context (mctx, cur_state, cur_str_idx))
1137        {
1138          if (!fl_longest_match)
1139            return cur_str_idx;
1140          else
1141            {
1142              match_last = cur_str_idx;
1143              match = 1;
1144            }
1145        }
1146    }
1147
1148  while (!re_string_eoi (&mctx->input))
1149    {
1150      re_dfastate_t *old_state = cur_state;
1151      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1152
1153      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1154          || (BE (next_char_idx >= mctx->input.valid_len, 0)
1155              && mctx->input.valid_len < mctx->input.len))
1156        {
1157          err = extend_buffers (mctx);
1158          if (BE (err != REG_NOERROR, 0))
1159            {
1160              assert (err == REG_ESPACE);
1161              return -2;
1162            }
1163        }
1164
1165      cur_state = transit_state (&err, mctx, cur_state);
1166      if (mctx->state_log != NULL)
1167        cur_state = merge_state_with_log (&err, mctx, cur_state);
1168
1169      if (cur_state == NULL)
1170        {
1171          /* Reached the invalid state or an error.  Try to recover a valid
1172             state using the state log, if available and if we have not
1173             already found a valid (even if not the longest) match.  */
1174          if (BE (err != REG_NOERROR, 0))
1175            return -2;
1176
1177          if (mctx->state_log == NULL
1178              || (match && !fl_longest_match)
1179              || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180            break;
1181        }
1182
1183      if (BE (at_init_state, 0))
1184        {
1185          if (old_state == cur_state)
1186            next_start_idx = next_char_idx;
1187          else
1188            at_init_state = 0;
1189        }
1190
1191      if (cur_state->halt)
1192        {
1193          /* Reached a halt state.
1194             Check the halt state can satisfy the current context.  */
1195          if (!cur_state->has_constraint
1196              || check_halt_state_context (mctx, cur_state,
1197                                           re_string_cur_idx (&mctx->input)))
1198            {
1199              /* We found an appropriate halt state.  */
1200              match_last = re_string_cur_idx (&mctx->input);
1201              match = 1;
1202
1203              /* We found a match, do not modify match_first below.  */
1204              p_match_first = NULL;
1205              if (!fl_longest_match)
1206                break;
1207            }
1208        }
1209    }
1210
1211  if (p_match_first)
1212    *p_match_first += next_start_idx;
1213
1214  return match_last;
1215}
1216
1217/* Check NODE match the current context.  */
1218
1219static int
1220internal_function
1221check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1222{
1223  re_token_type_t type = dfa->nodes[node].type;
1224  unsigned int constraint = dfa->nodes[node].constraint;
1225  if (type != END_OF_RE)
1226    return 0;
1227  if (!constraint)
1228    return 1;
1229  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1230    return 0;
1231  return 1;
1232}
1233
1234/* Check the halt state STATE match the current context.
1235   Return 0 if not match, if the node, STATE has, is a halt node and
1236   match the context, return the node.  */
1237
1238static int
1239internal_function
1240check_halt_state_context (const re_match_context_t *mctx,
1241                          const re_dfastate_t *state, int idx)
1242{
1243  int i;
1244  unsigned int context;
1245#ifdef DEBUG
1246  assert (state->halt);
1247#endif
1248  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1249  for (i = 0; i < state->nodes.nelem; ++i)
1250    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1251      return state->nodes.elems[i];
1252  return 0;
1253}
1254
1255/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1256   corresponding to the DFA).
1257   Return the destination node, and update EPS_VIA_NODES, return -1 in case
1258   of errors.  */
1259
1260static int
1261internal_function
1262proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1263                   int *pidx, int node, re_node_set *eps_via_nodes,
1264                   struct re_fail_stack_t *fs)
1265{
1266  const re_dfa_t *const dfa = mctx->dfa;
1267  int i, err;
1268  if (IS_EPSILON_NODE (dfa->nodes[node].type))
1269    {
1270      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1271      re_node_set *edests = &dfa->edests[node];
1272      int dest_node;
1273      err = re_node_set_insert (eps_via_nodes, node);
1274      if (BE (err < 0, 0))
1275        return -2;
1276      /* Pick up a valid destination, or return -1 if none is found.  */
1277      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1278        {
1279          int candidate = edests->elems[i];
1280          if (!re_node_set_contains (cur_nodes, candidate))
1281            continue;
1282          if (dest_node == -1)
1283            dest_node = candidate;
1284
1285          else
1286            {
1287              /* In order to avoid infinite loop like "(a*)*", return the second
1288                 epsilon-transition if the first was already considered.  */
1289              if (re_node_set_contains (eps_via_nodes, dest_node))
1290                return candidate;
1291
1292              /* Otherwise, push the second epsilon-transition on the fail stack.  */
1293              else if (fs != NULL
1294                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1295                                           eps_via_nodes))
1296                return -2;
1297
1298              /* We know we are going to exit.  */
1299              break;
1300            }
1301        }
1302      return dest_node;
1303    }
1304  else
1305    {
1306      int naccepted = 0;
1307      re_token_type_t type = dfa->nodes[node].type;
1308
1309#ifdef RE_ENABLE_I18N
1310      if (dfa->nodes[node].accept_mb)
1311        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1312      else
1313#endif /* RE_ENABLE_I18N */
1314      if (type == OP_BACK_REF)
1315        {
1316          int subexp_idx = dfa->nodes[node].opr.idx + 1;
1317          naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1318          if (fs != NULL)
1319            {
1320              if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1321                return -1;
1322              else if (naccepted)
1323                {
1324                  char *buf = (char *) re_string_get_buffer (&mctx->input);
1325                  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1326                              naccepted) != 0)
1327                    return -1;
1328                }
1329            }
1330
1331          if (naccepted == 0)
1332            {
1333              int dest_node;
1334              err = re_node_set_insert (eps_via_nodes, node);
1335              if (BE (err < 0, 0))
1336                return -2;
1337              dest_node = dfa->edests[node].elems[0];
1338              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1339                                        dest_node))
1340                return dest_node;
1341            }
1342        }
1343
1344      if (naccepted != 0
1345          || check_node_accept (mctx, dfa->nodes + node, *pidx))
1346        {
1347          int dest_node = dfa->nexts[node];
1348          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1349          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1350                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1351                                               dest_node)))
1352            return -1;
1353          re_node_set_empty (eps_via_nodes);
1354          return dest_node;
1355        }
1356    }
1357  return -1;
1358}
1359
1360static reg_errcode_t
1361internal_function
1362push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1363                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1364{
1365  reg_errcode_t err;
1366  int num = fs->num++;
1367  if (fs->num == fs->alloc)
1368    {
1369      struct re_fail_stack_ent_t *new_array;
1370      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1371                                       * fs->alloc * 2));
1372      if (new_array == NULL)
1373        return REG_ESPACE;
1374      fs->alloc *= 2;
1375      fs->stack = new_array;
1376    }
1377  fs->stack[num].idx = str_idx;
1378  fs->stack[num].node = dest_node;
1379  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1380  if (fs->stack[num].regs == NULL)
1381    return REG_ESPACE;
1382  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1383  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1384  return err;
1385}
1386
1387static int
1388internal_function
1389pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1390                regmatch_t *regs, re_node_set *eps_via_nodes)
1391{
1392  int num = --fs->num;
1393  assert (num >= 0);
1394  *pidx = fs->stack[num].idx;
1395  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1396  re_node_set_free (eps_via_nodes);
1397  re_free (fs->stack[num].regs);
1398  *eps_via_nodes = fs->stack[num].eps_via_nodes;
1399  return fs->stack[num].node;
1400}
1401
1402/* Set the positions where the subexpressions are starts/ends to registers
1403   PMATCH.
1404   Note: We assume that pmatch[0] is already set, and
1405   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1406
1407static reg_errcode_t
1408internal_function
1409set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1410          regmatch_t *pmatch, int fl_backtrack)
1411{
1412  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1413  int idx, cur_node;
1414  re_node_set eps_via_nodes;
1415  struct re_fail_stack_t *fs;
1416  struct re_fail_stack_t fs_body = { 0, 2, NULL };
1417  regmatch_t *prev_idx_match;
1418  int prev_idx_match_malloced = 0;
1419
1420#ifdef DEBUG
1421  assert (nmatch > 1);
1422  assert (mctx->state_log != NULL);
1423#endif
1424  if (fl_backtrack)
1425    {
1426      fs = &fs_body;
1427      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1428      if (fs->stack == NULL)
1429        return REG_ESPACE;
1430    }
1431  else
1432    fs = NULL;
1433
1434  cur_node = dfa->init_node;
1435  re_node_set_init_empty (&eps_via_nodes);
1436
1437#ifdef HAVE_ALLOCA
1438  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1439    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440  else
1441#endif
1442    {
1443      prev_idx_match = re_malloc (regmatch_t, nmatch);
1444      if (prev_idx_match == NULL)
1445        {
1446          free_fail_stack_return (fs);
1447          return REG_ESPACE;
1448        }
1449      prev_idx_match_malloced = 1;
1450    }
1451  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1452
1453  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1454    {
1455      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1456
1457      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1458        {
1459          int reg_idx;
1460          if (fs)
1461            {
1462              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1463                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1464                  break;
1465              if (reg_idx == nmatch)
1466                {
1467                  re_node_set_free (&eps_via_nodes);
1468                  if (prev_idx_match_malloced)
1469                    re_free (prev_idx_match);
1470                  return free_fail_stack_return (fs);
1471                }
1472              cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1473                                         &eps_via_nodes);
1474            }
1475          else
1476            {
1477              re_node_set_free (&eps_via_nodes);
1478              if (prev_idx_match_malloced)
1479                re_free (prev_idx_match);
1480              return REG_NOERROR;
1481            }
1482        }
1483
1484      /* Proceed to next node.  */
1485      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1486                                    &eps_via_nodes, fs);
1487
1488      if (BE (cur_node < 0, 0))
1489        {
1490          if (BE (cur_node == -2, 0))
1491            {
1492              re_node_set_free (&eps_via_nodes);
1493              if (prev_idx_match_malloced)
1494                re_free (prev_idx_match);
1495              free_fail_stack_return (fs);
1496              return REG_ESPACE;
1497            }
1498          if (fs)
1499            cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500                                       &eps_via_nodes);
1501          else
1502            {
1503              re_node_set_free (&eps_via_nodes);
1504              if (prev_idx_match_malloced)
1505                re_free (prev_idx_match);
1506              return REG_NOMATCH;
1507            }
1508        }
1509    }
1510  re_node_set_free (&eps_via_nodes);
1511  if (prev_idx_match_malloced)
1512    re_free (prev_idx_match);
1513  return free_fail_stack_return (fs);
1514}
1515
1516static reg_errcode_t
1517internal_function
1518free_fail_stack_return (struct re_fail_stack_t *fs)
1519{
1520  if (fs)
1521    {
1522      int fs_idx;
1523      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1524        {
1525          re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1526          re_free (fs->stack[fs_idx].regs);
1527        }
1528      re_free (fs->stack);
1529    }
1530  return REG_NOERROR;
1531}
1532
1533static void
1534internal_function
1535update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1536             regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1537{
1538  int type = dfa->nodes[cur_node].type;
1539  if (type == OP_OPEN_SUBEXP)
1540    {
1541      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542
1543      /* We are at the first node of this sub expression.  */
1544      if (reg_num < nmatch)
1545        {
1546          pmatch[reg_num].rm_so = cur_idx;
1547          pmatch[reg_num].rm_eo = -1;
1548        }
1549    }
1550  else if (type == OP_CLOSE_SUBEXP)
1551    {
1552      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1553      if (reg_num < nmatch)
1554        {
1555          /* We are at the last node of this sub expression.  */
1556          if (pmatch[reg_num].rm_so < cur_idx)
1557            {
1558              pmatch[reg_num].rm_eo = cur_idx;
1559              /* This is a non-empty match or we are not inside an optional
1560                 subexpression.  Accept this right away.  */
1561              memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562            }
1563          else
1564            {
1565              if (dfa->nodes[cur_node].opt_subexp
1566                  && prev_idx_match[reg_num].rm_so != -1)
1567                /* We transited through an empty match for an optional
1568                   subexpression, like (a?)*, and this is not the subexp's
1569                   first match.  Copy back the old content of the registers
1570                   so that matches of an inner subexpression are undone as
1571                   well, like in ((a?))*.  */
1572                memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1573              else
1574                /* We completed a subexpression, but it may be part of
1575                   an optional one, so do not update PREV_IDX_MATCH.  */
1576                pmatch[reg_num].rm_eo = cur_idx;
1577            }
1578        }
1579    }
1580}
1581
1582/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1583   and sift the nodes in each states according to the following rules.
1584   Updated state_log will be wrote to STATE_LOG.
1585
1586   Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1587     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1588        If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1589        the LAST_NODE, we throw away the node `a'.
1590     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1591        string `s' and transit to `b':
1592        i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1593           away the node `a'.
1594        ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1595            thrown away, we throw away the node `a'.
1596     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1597        i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1598           node `a'.
1599        ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1600            we throw away the node `a'.  */
1601
1602#define STATE_NODE_CONTAINS(state,node) \
1603  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1604
1605static reg_errcode_t
1606internal_function
1607sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608{
1609  reg_errcode_t err;
1610  int null_cnt = 0;
1611  int str_idx = sctx->last_str_idx;
1612  re_node_set cur_dest;
1613
1614#ifdef DEBUG
1615  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1616#endif
1617
1618  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1619     transit to the last_node and the last_node itself.  */
1620  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1621  if (BE (err != REG_NOERROR, 0))
1622    return err;
1623  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1624  if (BE (err != REG_NOERROR, 0))
1625    goto free_return;
1626
1627  /* Then check each states in the state_log.  */
1628  while (str_idx > 0)
1629    {
1630      /* Update counters.  */
1631      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1632      if (null_cnt > mctx->max_mb_elem_len)
1633        {
1634          memset (sctx->sifted_states, '\0',
1635                  sizeof (re_dfastate_t *) * str_idx);
1636          re_node_set_free (&cur_dest);
1637          return REG_NOERROR;
1638        }
1639      re_node_set_empty (&cur_dest);
1640      --str_idx;
1641
1642      if (mctx->state_log[str_idx])
1643        {
1644          err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1645          if (BE (err != REG_NOERROR, 0))
1646            goto free_return;
1647        }
1648
1649      /* Add all the nodes which satisfy the following conditions:
1650         - It can epsilon transit to a node in CUR_DEST.
1651         - It is in CUR_SRC.
1652         And update state_log.  */
1653      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1654      if (BE (err != REG_NOERROR, 0))
1655        goto free_return;
1656    }
1657  err = REG_NOERROR;
1658 free_return:
1659  re_node_set_free (&cur_dest);
1660  return err;
1661}
1662
1663static reg_errcode_t
1664internal_function
1665build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1666                     int str_idx, re_node_set *cur_dest)
1667{
1668  const re_dfa_t *const dfa = mctx->dfa;
1669  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1670  int i;
1671
1672  /* Then build the next sifted state.
1673     We build the next sifted state on `cur_dest', and update
1674     `sifted_states[str_idx]' with `cur_dest'.
1675     Note:
1676     `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1677     `cur_src' points the node_set of the old `state_log[str_idx]'
1678     (with the epsilon nodes pre-filtered out).  */
1679  for (i = 0; i < cur_src->nelem; i++)
1680    {
1681      int prev_node = cur_src->elems[i];
1682      int naccepted = 0;
1683      int ret;
1684
1685#ifdef DEBUG
1686      re_token_type_t type = dfa->nodes[prev_node].type;
1687      assert (!IS_EPSILON_NODE (type));
1688#endif
1689#ifdef RE_ENABLE_I18N
1690      /* If the node may accept `multi byte'.  */
1691      if (dfa->nodes[prev_node].accept_mb)
1692        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1693                                         str_idx, sctx->last_str_idx);
1694#endif /* RE_ENABLE_I18N */
1695
1696      /* We don't check backreferences here.
1697         See update_cur_sifted_state().  */
1698      if (!naccepted
1699          && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1700          && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1701                                  dfa->nexts[prev_node]))
1702        naccepted = 1;
1703
1704      if (naccepted == 0)
1705        continue;
1706
1707      if (sctx->limits.nelem)
1708        {
1709          int to_idx = str_idx + naccepted;
1710          if (check_dst_limits (mctx, &sctx->limits,
1711                                dfa->nexts[prev_node], to_idx,
1712                                prev_node, str_idx))
1713            continue;
1714        }
1715      ret = re_node_set_insert (cur_dest, prev_node);
1716      if (BE (ret == -1, 0))
1717        return REG_ESPACE;
1718    }
1719
1720  return REG_NOERROR;
1721}
1722
1723/* Helper functions.  */
1724
1725static reg_errcode_t
1726internal_function
1727clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1728{
1729  int top = mctx->state_log_top;
1730
1731  if (next_state_log_idx >= mctx->input.bufs_len
1732      || (next_state_log_idx >= mctx->input.valid_len
1733          && mctx->input.valid_len < mctx->input.len))
1734    {
1735      reg_errcode_t err;
1736      err = extend_buffers (mctx);
1737      if (BE (err != REG_NOERROR, 0))
1738        return err;
1739    }
1740
1741  if (top < next_state_log_idx)
1742    {
1743      memset (mctx->state_log + top + 1, '\0',
1744              sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1745      mctx->state_log_top = next_state_log_idx;
1746    }
1747  return REG_NOERROR;
1748}
1749
1750static reg_errcode_t
1751internal_function
1752merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1753                   re_dfastate_t **src, int num)
1754{
1755  int st_idx;
1756  reg_errcode_t err;
1757  for (st_idx = 0; st_idx < num; ++st_idx)
1758    {
1759      if (dst[st_idx] == NULL)
1760        dst[st_idx] = src[st_idx];
1761      else if (src[st_idx] != NULL)
1762        {
1763          re_node_set merged_set;
1764          err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1765                                        &src[st_idx]->nodes);
1766          if (BE (err != REG_NOERROR, 0))
1767            return err;
1768          dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1769          re_node_set_free (&merged_set);
1770          if (BE (err != REG_NOERROR, 0))
1771            return err;
1772        }
1773    }
1774  return REG_NOERROR;
1775}
1776
1777static reg_errcode_t
1778internal_function
1779update_cur_sifted_state (const re_match_context_t *mctx,
1780                         re_sift_context_t *sctx, int str_idx,
1781                         re_node_set *dest_nodes)
1782{
1783  const re_dfa_t *const dfa = mctx->dfa;
1784  reg_errcode_t err = REG_NOERROR;
1785  const re_node_set *candidates;
1786  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1787                : &mctx->state_log[str_idx]->nodes);
1788
1789  if (dest_nodes->nelem == 0)
1790    sctx->sifted_states[str_idx] = NULL;
1791  else
1792    {
1793      if (candidates)
1794        {
1795          /* At first, add the nodes which can epsilon transit to a node in
1796             DEST_NODE.  */
1797          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1798          if (BE (err != REG_NOERROR, 0))
1799            return err;
1800
1801          /* Then, check the limitations in the current sift_context.  */
1802          if (sctx->limits.nelem)
1803            {
1804              err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1805                                         mctx->bkref_ents, str_idx);
1806              if (BE (err != REG_NOERROR, 0))
1807                return err;
1808            }
1809        }
1810
1811      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1812      if (BE (err != REG_NOERROR, 0))
1813        return err;
1814    }
1815
1816  if (candidates && mctx->state_log[str_idx]->has_backref)
1817    {
1818      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1819      if (BE (err != REG_NOERROR, 0))
1820        return err;
1821    }
1822  return REG_NOERROR;
1823}
1824
1825static reg_errcode_t
1826internal_function
1827add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1828                       const re_node_set *candidates)
1829{
1830  reg_errcode_t err = REG_NOERROR;
1831  int i;
1832
1833  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1834  if (BE (err != REG_NOERROR, 0))
1835    return err;
1836
1837  if (!state->inveclosure.alloc)
1838    {
1839      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1840      if (BE (err != REG_NOERROR, 0))
1841        return REG_ESPACE;
1842      for (i = 0; i < dest_nodes->nelem; i++)
1843        {
1844          err = re_node_set_merge (&state->inveclosure,
1845                                   dfa->inveclosures + dest_nodes->elems[i]);
1846          if (BE (err != REG_NOERROR, 0))
1847            return REG_ESPACE;
1848        }
1849    }
1850  return re_node_set_add_intersect (dest_nodes, candidates,
1851                                    &state->inveclosure);
1852}
1853
1854static reg_errcode_t
1855internal_function
1856sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1857                       const re_node_set *candidates)
1858{
1859    int ecl_idx;
1860    reg_errcode_t err;
1861    re_node_set *inv_eclosure = dfa->inveclosures + node;
1862    re_node_set except_nodes;
1863    re_node_set_init_empty (&except_nodes);
1864    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1865      {
1866        int cur_node = inv_eclosure->elems[ecl_idx];
1867        if (cur_node == node)
1868          continue;
1869        if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1870          {
1871            int edst1 = dfa->edests[cur_node].elems[0];
1872            int edst2 = ((dfa->edests[cur_node].nelem > 1)
1873                         ? dfa->edests[cur_node].elems[1] : -1);
1874            if ((!re_node_set_contains (inv_eclosure, edst1)
1875                 && re_node_set_contains (dest_nodes, edst1))
1876                || (edst2 > 0
1877                    && !re_node_set_contains (inv_eclosure, edst2)
1878                    && re_node_set_contains (dest_nodes, edst2)))
1879              {
1880                err = re_node_set_add_intersect (&except_nodes, candidates,
1881                                                 dfa->inveclosures + cur_node);
1882                if (BE (err != REG_NOERROR, 0))
1883                  {
1884                    re_node_set_free (&except_nodes);
1885                    return err;
1886                  }
1887              }
1888          }
1889      }
1890    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1891      {
1892        int cur_node = inv_eclosure->elems[ecl_idx];
1893        if (!re_node_set_contains (&except_nodes, cur_node))
1894          {
1895            int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1896            re_node_set_remove_at (dest_nodes, idx);
1897          }
1898      }
1899    re_node_set_free (&except_nodes);
1900    return REG_NOERROR;
1901}
1902
1903static int
1904internal_function
1905check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1906                  int dst_node, int dst_idx, int src_node, int src_idx)
1907{
1908  const re_dfa_t *const dfa = mctx->dfa;
1909  int lim_idx, src_pos, dst_pos;
1910
1911  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1912  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1913  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1914    {
1915      int subexp_idx;
1916      struct re_backref_cache_entry *ent;
1917      ent = mctx->bkref_ents + limits->elems[lim_idx];
1918      subexp_idx = dfa->nodes[ent->node].opr.idx;
1919
1920      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1921                                           subexp_idx, dst_node, dst_idx,
1922                                           dst_bkref_idx);
1923      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1924                                           subexp_idx, src_node, src_idx,
1925                                           src_bkref_idx);
1926
1927      /* In case of:
1928         <src> <dst> ( <subexp> )
1929         ( <subexp> ) <src> <dst>
1930         ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1931      if (src_pos == dst_pos)
1932        continue; /* This is unrelated limitation.  */
1933      else
1934        return 1;
1935    }
1936  return 0;
1937}
1938
1939static int
1940internal_function
1941check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1942                             int subexp_idx, int from_node, int bkref_idx)
1943{
1944  const re_dfa_t *const dfa = mctx->dfa;
1945  const re_node_set *eclosures = dfa->eclosures + from_node;
1946  int node_idx;
1947
1948  /* Else, we are on the boundary: examine the nodes on the epsilon
1949     closure.  */
1950  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1951    {
1952      int node = eclosures->elems[node_idx];
1953      switch (dfa->nodes[node].type)
1954        {
1955        case OP_BACK_REF:
1956          if (bkref_idx != -1)
1957            {
1958              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1959              do
1960                {
1961                  int dst, cpos;
1962
1963                  if (ent->node != node)
1964                    continue;
1965
1966                  if (subexp_idx < BITSET_WORD_BITS
1967                      && !(ent->eps_reachable_subexps_map
1968                           & ((bitset_word_t) 1 << subexp_idx)))
1969                    continue;
1970
1971                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1972                     OP_CLOSE_SUBEXP cases below.  But, if the
1973                     destination node is the same node as the source
1974                     node, don't recurse because it would cause an
1975                     infinite loop: a regex that exhibits this behavior
1976                     is ()\1*\1*  */
1977                  dst = dfa->edests[node].elems[0];
1978                  if (dst == from_node)
1979                    {
1980                      if (boundaries & 1)
1981                        return -1;
1982                      else /* if (boundaries & 2) */
1983                        return 0;
1984                    }
1985
1986                  cpos =
1987                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988                                                 dst, bkref_idx);
1989                  if (cpos == -1 /* && (boundaries & 1) */)
1990                    return -1;
1991                  if (cpos == 0 && (boundaries & 2))
1992                    return 0;
1993
1994                  if (subexp_idx < BITSET_WORD_BITS)
1995                    ent->eps_reachable_subexps_map
1996                      &= ~((bitset_word_t) 1 << subexp_idx);
1997                }
1998              while (ent++->more);
1999            }
2000          break;
2001
2002        case OP_OPEN_SUBEXP:
2003          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2004            return -1;
2005          break;
2006
2007        case OP_CLOSE_SUBEXP:
2008          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2009            return 0;
2010          break;
2011
2012        default:
2013            break;
2014        }
2015    }
2016
2017  return (boundaries & 2) ? 1 : 0;
2018}
2019
2020static int
2021internal_function
2022check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2023                           int subexp_idx, int from_node, int str_idx,
2024                           int bkref_idx)
2025{
2026  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2027  int boundaries;
2028
2029  /* If we are outside the range of the subexpression, return -1 or 1.  */
2030  if (str_idx < lim->subexp_from)
2031    return -1;
2032
2033  if (lim->subexp_to < str_idx)
2034    return 1;
2035
2036  /* If we are within the subexpression, return 0.  */
2037  boundaries = (str_idx == lim->subexp_from);
2038  boundaries |= (str_idx == lim->subexp_to) << 1;
2039  if (boundaries == 0)
2040    return 0;
2041
2042  /* Else, examine epsilon closure.  */
2043  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2044                                      from_node, bkref_idx);
2045}
2046
2047/* Check the limitations of sub expressions LIMITS, and remove the nodes
2048   which are against limitations from DEST_NODES. */
2049
2050static reg_errcode_t
2051internal_function
2052check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2053                     const re_node_set *candidates, re_node_set *limits,
2054                     struct re_backref_cache_entry *bkref_ents, int str_idx)
2055{
2056  reg_errcode_t err;
2057  int node_idx, lim_idx;
2058
2059  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2060    {
2061      int subexp_idx;
2062      struct re_backref_cache_entry *ent;
2063      ent = bkref_ents + limits->elems[lim_idx];
2064
2065      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2066        continue; /* This is unrelated limitation.  */
2067
2068      subexp_idx = dfa->nodes[ent->node].opr.idx;
2069      if (ent->subexp_to == str_idx)
2070        {
2071          int ops_node = -1;
2072          int cls_node = -1;
2073          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074            {
2075              int node = dest_nodes->elems[node_idx];
2076              re_token_type_t type = dfa->nodes[node].type;
2077              if (type == OP_OPEN_SUBEXP
2078                  && subexp_idx == dfa->nodes[node].opr.idx)
2079                ops_node = node;
2080              else if (type == OP_CLOSE_SUBEXP
2081                       && subexp_idx == dfa->nodes[node].opr.idx)
2082                cls_node = node;
2083            }
2084
2085          /* Check the limitation of the open subexpression.  */
2086          /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2087          if (ops_node >= 0)
2088            {
2089              err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2090                                           candidates);
2091              if (BE (err != REG_NOERROR, 0))
2092                return err;
2093            }
2094
2095          /* Check the limitation of the close subexpression.  */
2096          if (cls_node >= 0)
2097            for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098              {
2099                int node = dest_nodes->elems[node_idx];
2100                if (!re_node_set_contains (dfa->inveclosures + node,
2101                                           cls_node)
2102                    && !re_node_set_contains (dfa->eclosures + node,
2103                                              cls_node))
2104                  {
2105                    /* It is against this limitation.
2106                       Remove it form the current sifted state.  */
2107                    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2108                                                 candidates);
2109                    if (BE (err != REG_NOERROR, 0))
2110                      return err;
2111                    --node_idx;
2112                  }
2113              }
2114        }
2115      else /* (ent->subexp_to != str_idx)  */
2116        {
2117          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2118            {
2119              int node = dest_nodes->elems[node_idx];
2120              re_token_type_t type = dfa->nodes[node].type;
2121              if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2122                {
2123                  if (subexp_idx != dfa->nodes[node].opr.idx)
2124                    continue;
2125                  /* It is against this limitation.
2126                     Remove it form the current sifted state.  */
2127                  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2128                                               candidates);
2129                  if (BE (err != REG_NOERROR, 0))
2130                    return err;
2131                }
2132            }
2133        }
2134    }
2135  return REG_NOERROR;
2136}
2137
2138static reg_errcode_t
2139internal_function
2140sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2141                   int str_idx, const re_node_set *candidates)
2142{
2143  const re_dfa_t *const dfa = mctx->dfa;
2144  reg_errcode_t err;
2145  int node_idx, node;
2146  re_sift_context_t local_sctx;
2147  int first_idx = search_cur_bkref_entry (mctx, str_idx);
2148
2149  if (first_idx == -1)
2150    return REG_NOERROR;
2151
2152  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2153
2154  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2155    {
2156      int enabled_idx;
2157      re_token_type_t type;
2158      struct re_backref_cache_entry *entry;
2159      node = candidates->elems[node_idx];
2160      type = dfa->nodes[node].type;
2161      /* Avoid infinite loop for the REs like "()\1+".  */
2162      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2163        continue;
2164      if (type != OP_BACK_REF)
2165        continue;
2166
2167      entry = mctx->bkref_ents + first_idx;
2168      enabled_idx = first_idx;
2169      do
2170        {
2171          int subexp_len;
2172          int to_idx;
2173          int dst_node;
2174          int ret;
2175          re_dfastate_t *cur_state;
2176
2177          if (entry->node != node)
2178            continue;
2179          subexp_len = entry->subexp_to - entry->subexp_from;
2180          to_idx = str_idx + subexp_len;
2181          dst_node = (subexp_len ? dfa->nexts[node]
2182                      : dfa->edests[node].elems[0]);
2183
2184          if (to_idx > sctx->last_str_idx
2185              || sctx->sifted_states[to_idx] == NULL
2186              || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2187              || check_dst_limits (mctx, &sctx->limits, node,
2188                                   str_idx, dst_node, to_idx))
2189            continue;
2190
2191          if (local_sctx.sifted_states == NULL)
2192            {
2193              local_sctx = *sctx;
2194              err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2195              if (BE (err != REG_NOERROR, 0))
2196                goto free_return;
2197            }
2198          local_sctx.last_node = node;
2199          local_sctx.last_str_idx = str_idx;
2200          ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2201          if (BE (ret < 0, 0))
2202            {
2203              err = REG_ESPACE;
2204              goto free_return;
2205            }
2206          cur_state = local_sctx.sifted_states[str_idx];
2207          err = sift_states_backward (mctx, &local_sctx);
2208          if (BE (err != REG_NOERROR, 0))
2209            goto free_return;
2210          if (sctx->limited_states != NULL)
2211            {
2212              err = merge_state_array (dfa, sctx->limited_states,
2213                                       local_sctx.sifted_states,
2214                                       str_idx + 1);
2215              if (BE (err != REG_NOERROR, 0))
2216                goto free_return;
2217            }
2218          local_sctx.sifted_states[str_idx] = cur_state;
2219          re_node_set_remove (&local_sctx.limits, enabled_idx);
2220
2221          /* mctx->bkref_ents may have changed, reload the pointer.  */
2222          entry = mctx->bkref_ents + enabled_idx;
2223        }
2224      while (enabled_idx++, entry++->more);
2225    }
2226  err = REG_NOERROR;
2227 free_return:
2228  if (local_sctx.sifted_states != NULL)
2229    {
2230      re_node_set_free (&local_sctx.limits);
2231    }
2232
2233  return err;
2234}
2235
2236
2237#ifdef RE_ENABLE_I18N
2238static int
2239internal_function
2240sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2241                     int node_idx, int str_idx, int max_str_idx)
2242{
2243  const re_dfa_t *const dfa = mctx->dfa;
2244  int naccepted;
2245  /* Check the node can accept `multi byte'.  */
2246  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2247  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2248      !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2249                            dfa->nexts[node_idx]))
2250    /* The node can't accept the `multi byte', or the
2251       destination was already thrown away, then the node
2252       could't accept the current input `multi byte'.   */
2253    naccepted = 0;
2254  /* Otherwise, it is sure that the node could accept
2255     `naccepted' bytes input.  */
2256  return naccepted;
2257}
2258#endif /* RE_ENABLE_I18N */
2259
2260\f
2261/* Functions for state transition.  */
2262
2263/* Return the next state to which the current state STATE will transit by
2264   accepting the current input byte, and update STATE_LOG if necessary.
2265   If STATE can accept a multibyte char/collating element/back reference
2266   update the destination of STATE_LOG.  */
2267
2268static re_dfastate_t *
2269internal_function
2270transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2271               re_dfastate_t *state)
2272{
2273  re_dfastate_t **trtable;
2274  unsigned char ch;
2275
2276#ifdef RE_ENABLE_I18N
2277  /* If the current state can accept multibyte.  */
2278  if (BE (state->accept_mb, 0))
2279    {
2280      *err = transit_state_mb (mctx, state);
2281      if (BE (*err != REG_NOERROR, 0))
2282        return NULL;
2283    }
2284#endif /* RE_ENABLE_I18N */
2285
2286  /* Then decide the next state with the single byte.  */
2287#if 0
2288  if (0)
2289    /* don't use transition table  */
2290    return transit_state_sb (err, mctx, state);
2291#endif
2292
2293  /* Use transition table  */
2294  ch = re_string_fetch_byte (&mctx->input);
2295  for (;;)
2296    {
2297      trtable = state->trtable;
2298      if (BE (trtable != NULL, 1))
2299        return trtable[ch];
2300
2301      trtable = state->word_trtable;
2302      if (BE (trtable != NULL, 1))
2303        {
2304          unsigned int context;
2305          context
2306            = re_string_context_at (&mctx->input,
2307                                    re_string_cur_idx (&mctx->input) - 1,
2308                                    mctx->eflags);
2309          if (IS_WORD_CONTEXT (context))
2310            return trtable[ch + SBC_MAX];
2311          else
2312            return trtable[ch];
2313        }
2314
2315      if (!build_trtable (mctx->dfa, state))
2316        {
2317          *err = REG_ESPACE;
2318          return NULL;
2319        }
2320
2321      /* Retry, we now have a transition table.  */
2322    }
2323}
2324
2325/* Update the state_log if we need */
2326re_dfastate_t *
2327internal_function
2328merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2329                      re_dfastate_t *next_state)
2330{
2331  const re_dfa_t *const dfa = mctx->dfa;
2332  int cur_idx = re_string_cur_idx (&mctx->input);
2333
2334  if (cur_idx > mctx->state_log_top)
2335    {
2336      mctx->state_log[cur_idx] = next_state;
2337      mctx->state_log_top = cur_idx;
2338    }
2339  else if (mctx->state_log[cur_idx] == 0)
2340    {
2341      mctx->state_log[cur_idx] = next_state;
2342    }
2343  else
2344    {
2345      re_dfastate_t *pstate;
2346      unsigned int context;
2347      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2348      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2349         the destination of a multibyte char/collating element/
2350         back reference.  Then the next state is the union set of
2351         these destinations and the results of the transition table.  */
2352      pstate = mctx->state_log[cur_idx];
2353      log_nodes = pstate->entrance_nodes;
2354      if (next_state != NULL)
2355        {
2356          table_nodes = next_state->entrance_nodes;
2357          *err = re_node_set_init_union (&next_nodes, table_nodes,
2358                                             log_nodes);
2359          if (BE (*err != REG_NOERROR, 0))
2360            return NULL;
2361        }
2362      else
2363        next_nodes = *log_nodes;
2364      /* Note: We already add the nodes of the initial state,
2365         then we don't need to add them here.  */
2366
2367      context = re_string_context_at (&mctx->input,
2368                                      re_string_cur_idx (&mctx->input) - 1,
2369                                      mctx->eflags);
2370      next_state = mctx->state_log[cur_idx]
2371        = re_acquire_state_context (err, dfa, &next_nodes, context);
2372      /* We don't need to check errors here, since the return value of
2373         this function is next_state and ERR is already set.  */
2374
2375      if (table_nodes != NULL)
2376        re_node_set_free (&next_nodes);
2377    }
2378
2379  if (BE (dfa->nbackref, 0) && next_state != NULL)
2380    {
2381      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2382         later.  We must check them here, since the back references in the
2383         next state might use them.  */
2384      *err = check_subexp_matching_top (mctx, &next_state->nodes,
2385                                        cur_idx);
2386      if (BE (*err != REG_NOERROR, 0))
2387        return NULL;
2388
2389      /* If the next state has back references.  */
2390      if (next_state->has_backref)
2391        {
2392          *err = transit_state_bkref (mctx, &next_state->nodes);
2393          if (BE (*err != REG_NOERROR, 0))
2394            return NULL;
2395          next_state = mctx->state_log[cur_idx];
2396        }
2397    }
2398
2399  return next_state;
2400}
2401
2402/* Skip bytes in the input that correspond to part of a
2403   multi-byte match, then look in the log for a state
2404   from which to restart matching.  */
2405re_dfastate_t *
2406internal_function
2407find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2408{
2409  re_dfastate_t *cur_state;
2410  do
2411    {
2412      int max = mctx->state_log_top;
2413      int cur_str_idx = re_string_cur_idx (&mctx->input);
2414
2415      do
2416        {
2417          if (++cur_str_idx > max)
2418            return NULL;
2419          re_string_skip_bytes (&mctx->input, 1);
2420        }
2421      while (mctx->state_log[cur_str_idx] == NULL);
2422
2423      cur_state = merge_state_with_log (err, mctx, NULL);
2424    }
2425  while (*err == REG_NOERROR && cur_state == NULL);
2426  return cur_state;
2427}
2428
2429/* Helper functions for transit_state.  */
2430
2431/* From the node set CUR_NODES, pick up the nodes whose types are
2432   OP_OPEN_SUBEXP and which have corresponding back references in the regular
2433   expression. And register them to use them later for evaluating the
2434   correspoding back references.  */
2435
2436static reg_errcode_t
2437internal_function
2438check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2439                           int str_idx)
2440{
2441  const re_dfa_t *const dfa = mctx->dfa;
2442  int node_idx;
2443  reg_errcode_t err;
2444
2445  /* TODO: This isn't efficient.
2446           Because there might be more than one nodes whose types are
2447           OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2448           nodes.
2449           E.g. RE: (a){2}  */
2450  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2451    {
2452      int node = cur_nodes->elems[node_idx];
2453      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2454          && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2455          && (dfa->used_bkref_map
2456              & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2457        {
2458          err = match_ctx_add_subtop (mctx, node, str_idx);
2459          if (BE (err != REG_NOERROR, 0))
2460            return err;
2461        }
2462    }
2463  return REG_NOERROR;
2464}
2465
2466#if 0
2467/* Return the next state to which the current state STATE will transit by
2468   accepting the current input byte.  */
2469
2470static re_dfastate_t *
2471transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2472                  re_dfastate_t *state)
2473{
2474  const re_dfa_t *const dfa = mctx->dfa;
2475  re_node_set next_nodes;
2476  re_dfastate_t *next_state;
2477  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2478  unsigned int context;
2479
2480  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2481  if (BE (*err != REG_NOERROR, 0))
2482    return NULL;
2483  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2484    {
2485      int cur_node = state->nodes.elems[node_cnt];
2486      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2487        {
2488          *err = re_node_set_merge (&next_nodes,
2489                                    dfa->eclosures + dfa->nexts[cur_node]);
2490          if (BE (*err != REG_NOERROR, 0))
2491            {
2492              re_node_set_free (&next_nodes);
2493              return NULL;
2494            }
2495        }
2496    }
2497  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2498  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2499  /* We don't need to check errors here, since the return value of
2500     this function is next_state and ERR is already set.  */
2501
2502  re_node_set_free (&next_nodes);
2503  re_string_skip_bytes (&mctx->input, 1);
2504  return next_state;
2505}
2506#endif
2507
2508#ifdef RE_ENABLE_I18N
2509static reg_errcode_t
2510internal_function
2511transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2512{
2513  const re_dfa_t *const dfa = mctx->dfa;
2514  reg_errcode_t err;
2515  int i;
2516
2517  for (i = 0; i < pstate->nodes.nelem; ++i)
2518    {
2519      re_node_set dest_nodes, *new_nodes;
2520      int cur_node_idx = pstate->nodes.elems[i];
2521      int naccepted, dest_idx;
2522      unsigned int context;
2523      re_dfastate_t *dest_state;
2524
2525      if (!dfa->nodes[cur_node_idx].accept_mb)
2526        continue;
2527
2528      if (dfa->nodes[cur_node_idx].constraint)
2529        {
2530          context = re_string_context_at (&mctx->input,
2531                                          re_string_cur_idx (&mctx->input),
2532                                          mctx->eflags);
2533          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2534                                           context))
2535            continue;
2536        }
2537
2538      /* How many bytes the node can accept?  */
2539      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2540                                           re_string_cur_idx (&mctx->input));
2541      if (naccepted == 0)
2542        continue;
2543
2544      /* The node can accepts `naccepted' bytes.  */
2545      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2546      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2547                               : mctx->max_mb_elem_len);
2548      err = clean_state_log_if_needed (mctx, dest_idx);
2549      if (BE (err != REG_NOERROR, 0))
2550        return err;
2551#ifdef DEBUG
2552      assert (dfa->nexts[cur_node_idx] != -1);
2553#endif
2554      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2555
2556      dest_state = mctx->state_log[dest_idx];
2557      if (dest_state == NULL)
2558        dest_nodes = *new_nodes;
2559      else
2560        {
2561          err = re_node_set_init_union (&dest_nodes,
2562                                        dest_state->entrance_nodes, new_nodes);
2563          if (BE (err != REG_NOERROR, 0))
2564            return err;
2565        }
2566      context = re_string_context_at (&mctx->input, dest_idx - 1,
2567                                      mctx->eflags);
2568      mctx->state_log[dest_idx]
2569        = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2570      if (dest_state != NULL)
2571        re_node_set_free (&dest_nodes);
2572      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2573        return err;
2574    }
2575  return REG_NOERROR;
2576}
2577#endif /* RE_ENABLE_I18N */
2578
2579static reg_errcode_t
2580internal_function
2581transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2582{
2583  const re_dfa_t *const dfa = mctx->dfa;
2584  reg_errcode_t err;
2585  int i;
2586  int cur_str_idx = re_string_cur_idx (&mctx->input);
2587
2588  for (i = 0; i < nodes->nelem; ++i)
2589    {
2590      int dest_str_idx, prev_nelem, bkc_idx;
2591      int node_idx = nodes->elems[i];
2592      unsigned int context;
2593      const re_token_t *node = dfa->nodes + node_idx;
2594      re_node_set *new_dest_nodes;
2595
2596      /* Check whether `node' is a backreference or not.  */
2597      if (node->type != OP_BACK_REF)
2598        continue;
2599
2600      if (node->constraint)
2601        {
2602          context = re_string_context_at (&mctx->input, cur_str_idx,
2603                                          mctx->eflags);
2604          if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2605            continue;
2606        }
2607
2608      /* `node' is a backreference.
2609         Check the substring which the substring matched.  */
2610      bkc_idx = mctx->nbkref_ents;
2611      err = get_subexp (mctx, node_idx, cur_str_idx);
2612      if (BE (err != REG_NOERROR, 0))
2613        goto free_return;
2614
2615      /* And add the epsilon closures (which is `new_dest_nodes') of
2616         the backreference to appropriate state_log.  */
2617#ifdef DEBUG
2618      assert (dfa->nexts[node_idx] != -1);
2619#endif
2620      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2621        {
2622          int subexp_len;
2623          re_dfastate_t *dest_state;
2624          struct re_backref_cache_entry *bkref_ent;
2625          bkref_ent = mctx->bkref_ents + bkc_idx;
2626          if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2627            continue;
2628          subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2629          new_dest_nodes = (subexp_len == 0
2630                            ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2631                            : dfa->eclosures + dfa->nexts[node_idx]);
2632          dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2633                          - bkref_ent->subexp_from);
2634          context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2635                                          mctx->eflags);
2636          dest_state = mctx->state_log[dest_str_idx];
2637          prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2638                        : mctx->state_log[cur_str_idx]->nodes.nelem);
2639          /* Add `new_dest_node' to state_log.  */
2640          if (dest_state == NULL)
2641            {
2642              mctx->state_log[dest_str_idx]
2643                = re_acquire_state_context (&err, dfa, new_dest_nodes,
2644                                            context);
2645              if (BE (mctx->state_log[dest_str_idx] == NULL
2646                      && err != REG_NOERROR, 0))
2647                goto free_return;
2648            }
2649          else
2650            {
2651              re_node_set dest_nodes;
2652              err = re_node_set_init_union (&dest_nodes,
2653                                            dest_state->entrance_nodes,
2654                                            new_dest_nodes);
2655              if (BE (err != REG_NOERROR, 0))
2656                {
2657                  re_node_set_free (&dest_nodes);
2658                  goto free_return;
2659                }
2660              mctx->state_log[dest_str_idx]
2661                = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2662              re_node_set_free (&dest_nodes);
2663              if (BE (mctx->state_log[dest_str_idx] == NULL
2664                      && err != REG_NOERROR, 0))
2665                goto free_return;
2666            }
2667          /* We need to check recursively if the backreference can epsilon
2668             transit.  */
2669          if (subexp_len == 0
2670              && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2671            {
2672              err = check_subexp_matching_top (mctx, new_dest_nodes,
2673                                               cur_str_idx);
2674              if (BE (err != REG_NOERROR, 0))
2675                goto free_return;
2676              err = transit_state_bkref (mctx, new_dest_nodes);
2677              if (BE (err != REG_NOERROR, 0))
2678                goto free_return;
2679            }
2680        }
2681    }
2682  err = REG_NOERROR;
2683 free_return:
2684  return err;
2685}
2686
2687/* Enumerate all the candidates which the backreference BKREF_NODE can match
2688   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689   Note that we might collect inappropriate candidates here.
2690   However, the cost of checking them strictly here is too high, then we
2691   delay these checking for prune_impossible_nodes().  */
2692
2693static reg_errcode_t
2694internal_function
2695get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2696{
2697  const re_dfa_t *const dfa = mctx->dfa;
2698  int subexp_num, sub_top_idx;
2699  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2700  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2701  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2702  if (cache_idx != -1)
2703    {
2704      const struct re_backref_cache_entry *entry
2705        = mctx->bkref_ents + cache_idx;
2706      do
2707        if (entry->node == bkref_node)
2708          return REG_NOERROR; /* We already checked it.  */
2709      while (entry++->more);
2710    }
2711
2712  subexp_num = dfa->nodes[bkref_node].opr.idx;
2713
2714  /* For each sub expression  */
2715  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2716    {
2717      reg_errcode_t err;
2718      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2719      re_sub_match_last_t *sub_last;
2720      int sub_last_idx, sl_str, bkref_str_off;
2721
2722      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2723        continue; /* It isn't related.  */
2724
2725      sl_str = sub_top->str_idx;
2726      bkref_str_off = bkref_str_idx;
2727      /* At first, check the last node of sub expressions we already
2728         evaluated.  */
2729      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2730        {
2731          int sl_str_diff;
2732          sub_last = sub_top->lasts[sub_last_idx];
2733          sl_str_diff = sub_last->str_idx - sl_str;
2734          /* The matched string by the sub expression match with the substring
2735             at the back reference?  */
2736          if (sl_str_diff > 0)
2737            {
2738              if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2739                {
2740                  /* Not enough chars for a successful match.  */
2741                  if (bkref_str_off + sl_str_diff > mctx->input.len)
2742                    break;
2743
2744                  err = clean_state_log_if_needed (mctx,
2745                                                   bkref_str_off
2746                                                   + sl_str_diff);
2747                  if (BE (err != REG_NOERROR, 0))
2748                    return err;
2749                  buf = (const char *) re_string_get_buffer (&mctx->input);
2750                }
2751              if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2752                /* We don't need to search this sub expression any more.  */
2753                break;
2754            }
2755          bkref_str_off += sl_str_diff;
2756          sl_str += sl_str_diff;
2757          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2758                                bkref_str_idx);
2759
2760          /* Reload buf, since the preceding call might have reallocated
2761             the buffer.  */
2762          buf = (const char *) re_string_get_buffer (&mctx->input);
2763
2764          if (err == REG_NOMATCH)
2765            continue;
2766          if (BE (err != REG_NOERROR, 0))
2767            return err;
2768        }
2769
2770      if (sub_last_idx < sub_top->nlasts)
2771        continue;
2772      if (sub_last_idx > 0)
2773        ++sl_str;
2774      /* Then, search for the other last nodes of the sub expression.  */
2775      for (; sl_str <= bkref_str_idx; ++sl_str)
2776        {
2777          int cls_node, sl_str_off;
2778          const re_node_set *nodes;
2779          sl_str_off = sl_str - sub_top->str_idx;
2780          /* The matched string by the sub expression match with the substring
2781             at the back reference?  */
2782          if (sl_str_off > 0)
2783            {
2784              if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2785                {
2786                  /* If we are at the end of the input, we cannot match.  */
2787                  if (bkref_str_off >= mctx->input.len)
2788                    break;
2789
2790                  err = extend_buffers (mctx);
2791                  if (BE (err != REG_NOERROR, 0))
2792                    return err;
2793
2794                  buf = (const char *) re_string_get_buffer (&mctx->input);
2795                }
2796              if (buf [bkref_str_off++] != buf[sl_str - 1])
2797                break; /* We don't need to search this sub expression
2798                          any more.  */
2799            }
2800          if (mctx->state_log[sl_str] == NULL)
2801            continue;
2802          /* Does this state have a ')' of the sub expression?  */
2803          nodes = &mctx->state_log[sl_str]->nodes;
2804          cls_node = find_subexp_node (dfa, nodes, subexp_num,
2805                                       OP_CLOSE_SUBEXP);
2806          if (cls_node == -1)
2807            continue; /* No.  */
2808          if (sub_top->path == NULL)
2809            {
2810              sub_top->path = calloc (sizeof (state_array_t),
2811                                      sl_str - sub_top->str_idx + 1);
2812              if (sub_top->path == NULL)
2813                return REG_ESPACE;
2814            }
2815          /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2816             in the current context?  */
2817          err = check_arrival (mctx, sub_top->path, sub_top->node,
2818                               sub_top->str_idx, cls_node, sl_str,
2819                               OP_CLOSE_SUBEXP);
2820          if (err == REG_NOMATCH)
2821              continue;
2822          if (BE (err != REG_NOERROR, 0))
2823              return err;
2824          sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2825          if (BE (sub_last == NULL, 0))
2826            return REG_ESPACE;
2827          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2828                                bkref_str_idx);
2829          if (err == REG_NOMATCH)
2830            continue;
2831        }
2832    }
2833  return REG_NOERROR;
2834}
2835
2836/* Helper functions for get_subexp().  */
2837
2838/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2839   If it can arrive, register the sub expression expressed with SUB_TOP
2840   and SUB_LAST.  */
2841
2842static reg_errcode_t
2843internal_function
2844get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2845                re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2846{
2847  reg_errcode_t err;
2848  int to_idx;
2849  /* Can the subexpression arrive the back reference?  */
2850  err = check_arrival (mctx, &sub_last->path, sub_last->node,
2851                       sub_last->str_idx, bkref_node, bkref_str,
2852                       OP_OPEN_SUBEXP);
2853  if (err != REG_NOERROR)
2854    return err;
2855  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2856                             sub_last->str_idx);
2857  if (BE (err != REG_NOERROR, 0))
2858    return err;
2859  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2860  return clean_state_log_if_needed (mctx, to_idx);
2861}
2862
2863/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2864   Search '(' if FL_OPEN, or search ')' otherwise.
2865   TODO: This function isn't efficient...
2866         Because there might be more than one nodes whose types are
2867         OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2868         nodes.
2869         E.g. RE: (a){2}  */
2870
2871static int
2872internal_function
2873find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2874                  int subexp_idx, int type)
2875{
2876  int cls_idx;
2877  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2878    {
2879      int cls_node = nodes->elems[cls_idx];
2880      const re_token_t *node = dfa->nodes + cls_node;
2881      if (node->type == type
2882          && node->opr.idx == subexp_idx)
2883        return cls_node;
2884    }
2885  return -1;
2886}
2887
2888/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2889   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2890   heavily reused.
2891   Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2892
2893static reg_errcode_t
2894internal_function
2895check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2896               int top_str, int last_node, int last_str, int type)
2897{
2898  const re_dfa_t *const dfa = mctx->dfa;
2899  reg_errcode_t err = REG_NOERROR;
2900  int subexp_num, backup_cur_idx, str_idx, null_cnt;
2901  re_dfastate_t *cur_state = NULL;
2902  re_node_set *cur_nodes, next_nodes;
2903  re_dfastate_t **backup_state_log;
2904  unsigned int context;
2905
2906  subexp_num = dfa->nodes[top_node].opr.idx;
2907  /* Extend the buffer if we need.  */
2908  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2909    {
2910      re_dfastate_t **new_array;
2911      int old_alloc = path->alloc;
2912      path->alloc += last_str + mctx->max_mb_elem_len + 1;
2913      new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2914      if (BE (new_array == NULL, 0))
2915        {
2916          path->alloc = old_alloc;
2917          return REG_ESPACE;
2918        }
2919      path->array = new_array;
2920      memset (new_array + old_alloc, '\0',
2921              sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2922    }
2923
2924  str_idx = path->next_idx ? path->next_idx : top_str;
2925
2926  /* Temporary modify MCTX.  */
2927  backup_state_log = mctx->state_log;
2928  backup_cur_idx = mctx->input.cur_idx;
2929  mctx->state_log = path->array;
2930  mctx->input.cur_idx = str_idx;
2931
2932  /* Setup initial node set.  */
2933  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2934  if (str_idx == top_str)
2935    {
2936      err = re_node_set_init_1 (&next_nodes, top_node);
2937      if (BE (err != REG_NOERROR, 0))
2938        return err;
2939      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2940      if (BE (err != REG_NOERROR, 0))
2941        {
2942          re_node_set_free (&next_nodes);
2943          return err;
2944        }
2945    }
2946  else
2947    {
2948      cur_state = mctx->state_log[str_idx];
2949      if (cur_state && cur_state->has_backref)
2950        {
2951          err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2952          if (BE (err != REG_NOERROR, 0))
2953            return err;
2954        }
2955      else
2956        re_node_set_init_empty (&next_nodes);
2957    }
2958  if (str_idx == top_str || (cur_state && cur_state->has_backref))
2959    {
2960      if (next_nodes.nelem)
2961        {
2962          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2963                                    subexp_num, type);
2964          if (BE (err != REG_NOERROR, 0))
2965            {
2966              re_node_set_free (&next_nodes);
2967              return err;
2968            }
2969        }
2970      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2971      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2972        {
2973          re_node_set_free (&next_nodes);
2974          return err;
2975        }
2976      mctx->state_log[str_idx] = cur_state;
2977    }
2978
2979  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2980    {
2981      re_node_set_empty (&next_nodes);
2982      if (mctx->state_log[str_idx + 1])
2983        {
2984          err = re_node_set_merge (&next_nodes,
2985                                   &mctx->state_log[str_idx + 1]->nodes);
2986          if (BE (err != REG_NOERROR, 0))
2987            {
2988              re_node_set_free (&next_nodes);
2989              return err;
2990            }
2991        }
2992      if (cur_state)
2993        {
2994          err = check_arrival_add_next_nodes (mctx, str_idx,
2995                                              &cur_state->non_eps_nodes,
2996                                              &next_nodes);
2997          if (BE (err != REG_NOERROR, 0))
2998            {
2999              re_node_set_free (&next_nodes);
3000              return err;
3001            }
3002        }
3003      ++str_idx;
3004      if (next_nodes.nelem)
3005        {
3006          err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3007          if (BE (err != REG_NOERROR, 0))
3008            {
3009              re_node_set_free (&next_nodes);
3010              return err;
3011            }
3012          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3013                                    subexp_num, type);
3014          if (BE (err != REG_NOERROR, 0))
3015            {
3016              re_node_set_free (&next_nodes);
3017              return err;
3018            }
3019        }
3020      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3021      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3022      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3023        {
3024          re_node_set_free (&next_nodes);
3025          return err;
3026        }
3027      mctx->state_log[str_idx] = cur_state;
3028      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3029    }
3030  re_node_set_free (&next_nodes);
3031  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3032               : &mctx->state_log[last_str]->nodes);
3033  path->next_idx = str_idx;
3034
3035  /* Fix MCTX.  */
3036  mctx->state_log = backup_state_log;
3037  mctx->input.cur_idx = backup_cur_idx;
3038
3039  /* Then check the current node set has the node LAST_NODE.  */
3040  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3041    return REG_NOERROR;
3042
3043  return REG_NOMATCH;
3044}
3045
3046/* Helper functions for check_arrival.  */
3047
3048/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3049   to NEXT_NODES.
3050   TODO: This function is similar to the functions transit_state*(),
3051         however this function has many additional works.
3052         Can't we unify them?  */
3053
3054static reg_errcode_t
3055internal_function
3056check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3057                              re_node_set *cur_nodes, re_node_set *next_nodes)
3058{
3059  const re_dfa_t *const dfa = mctx->dfa;
3060  int result;
3061  int cur_idx;
3062#ifdef RE_ENABLE_I18N
3063  reg_errcode_t err = REG_NOERROR;
3064#endif
3065  re_node_set union_set;
3066  re_node_set_init_empty (&union_set);
3067  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3068    {
3069      int naccepted = 0;
3070      int cur_node = cur_nodes->elems[cur_idx];
3071#ifdef DEBUG
3072      re_token_type_t type = dfa->nodes[cur_node].type;
3073      assert (!IS_EPSILON_NODE (type));
3074#endif
3075#ifdef RE_ENABLE_I18N
3076      /* If the node may accept `multi byte'.  */
3077      if (dfa->nodes[cur_node].accept_mb)
3078        {
3079          naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3080                                               str_idx);
3081          if (naccepted > 1)
3082            {
3083              re_dfastate_t *dest_state;
3084              int next_node = dfa->nexts[cur_node];
3085              int next_idx = str_idx + naccepted;
3086              dest_state = mctx->state_log[next_idx];
3087              re_node_set_empty (&union_set);
3088              if (dest_state)
3089                {
3090                  err = re_node_set_merge (&union_set, &dest_state->nodes);
3091                  if (BE (err != REG_NOERROR, 0))
3092                    {
3093                      re_node_set_free (&union_set);
3094                      return err;
3095                    }
3096                }
3097              result = re_node_set_insert (&union_set, next_node);
3098              if (BE (result < 0, 0))
3099                {
3100                  re_node_set_free (&union_set);
3101                  return REG_ESPACE;
3102                }
3103              mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3104                                                            &union_set);
3105              if (BE (mctx->state_log[next_idx] == NULL
3106                      && err != REG_NOERROR, 0))
3107                {
3108                  re_node_set_free (&union_set);
3109                  return err;
3110                }
3111            }
3112        }
3113#endif /* RE_ENABLE_I18N */
3114      if (naccepted
3115          || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3116        {
3117          result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3118          if (BE (result < 0, 0))
3119            {
3120              re_node_set_free (&union_set);
3121              return REG_ESPACE;
3122            }
3123        }
3124    }
3125  re_node_set_free (&union_set);
3126  return REG_NOERROR;
3127}
3128
3129/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3130   CUR_NODES, however exclude the nodes which are:
3131    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3132    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3133*/
3134
3135static reg_errcode_t
3136internal_function
3137check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3138                          int ex_subexp, int type)
3139{
3140  reg_errcode_t err;
3141  int idx, outside_node;
3142  re_node_set new_nodes;
3143#ifdef DEBUG
3144  assert (cur_nodes->nelem);
3145#endif
3146  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3147  if (BE (err != REG_NOERROR, 0))
3148    return err;
3149  /* Create a new node set NEW_NODES with the nodes which are epsilon
3150     closures of the node in CUR_NODES.  */
3151
3152  for (idx = 0; idx < cur_nodes->nelem; ++idx)
3153    {
3154      int cur_node = cur_nodes->elems[idx];
3155      const re_node_set *eclosure = dfa->eclosures + cur_node;
3156      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3157      if (outside_node == -1)
3158        {
3159          /* There are no problematic nodes, just merge them.  */
3160          err = re_node_set_merge (&new_nodes, eclosure);
3161          if (BE (err != REG_NOERROR, 0))
3162            {
3163              re_node_set_free (&new_nodes);
3164              return err;
3165            }
3166        }
3167      else
3168        {
3169          /* There are problematic nodes, re-calculate incrementally.  */
3170          err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3171                                              ex_subexp, type);
3172          if (BE (err != REG_NOERROR, 0))
3173            {
3174              re_node_set_free (&new_nodes);
3175              return err;
3176            }
3177        }
3178    }
3179  re_node_set_free (cur_nodes);
3180  *cur_nodes = new_nodes;
3181  return REG_NOERROR;
3182}
3183
3184/* Helper function for check_arrival_expand_ecl.
3185   Check incrementally the epsilon closure of TARGET, and if it isn't
3186   problematic append it to DST_NODES.  */
3187
3188static reg_errcode_t
3189internal_function
3190check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3191                              int target, int ex_subexp, int type)
3192{
3193  int cur_node;
3194  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3195    {
3196      int err;
3197
3198      if (dfa->nodes[cur_node].type == type
3199          && dfa->nodes[cur_node].opr.idx == ex_subexp)
3200        {
3201          if (type == OP_CLOSE_SUBEXP)
3202            {
3203              err = re_node_set_insert (dst_nodes, cur_node);
3204              if (BE (err == -1, 0))
3205                return REG_ESPACE;
3206            }
3207          break;
3208        }
3209      err = re_node_set_insert (dst_nodes, cur_node);
3210      if (BE (err == -1, 0))
3211        return REG_ESPACE;
3212      if (dfa->edests[cur_node].nelem == 0)
3213        break;
3214      if (dfa->edests[cur_node].nelem == 2)
3215        {
3216          err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3217                                              dfa->edests[cur_node].elems[1],
3218                                              ex_subexp, type);
3219          if (BE (err != REG_NOERROR, 0))
3220            return err;
3221        }
3222      cur_node = dfa->edests[cur_node].elems[0];
3223    }
3224  return REG_NOERROR;
3225}
3226
3227
3228/* For all the back references in the current state, calculate the
3229   destination of the back references by the appropriate entry
3230   in MCTX->BKREF_ENTS.  */
3231
3232static reg_errcode_t
3233internal_function
3234expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3235                    int cur_str, int subexp_num, int type)
3236{
3237  const re_dfa_t *const dfa = mctx->dfa;
3238  reg_errcode_t err;
3239  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3240  struct re_backref_cache_entry *ent;
3241
3242  if (cache_idx_start == -1)
3243    return REG_NOERROR;
3244
3245 restart:
3246  ent = mctx->bkref_ents + cache_idx_start;
3247  do
3248    {
3249      int to_idx, next_node;
3250
3251      /* Is this entry ENT is appropriate?  */
3252      if (!re_node_set_contains (cur_nodes, ent->node))
3253        continue; /* No.  */
3254
3255      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3256      /* Calculate the destination of the back reference, and append it
3257         to MCTX->STATE_LOG.  */
3258      if (to_idx == cur_str)
3259        {
3260          /* The backreference did epsilon transit, we must re-check all the
3261             node in the current state.  */
3262          re_node_set new_dests;
3263          reg_errcode_t err2, err3;
3264          next_node = dfa->edests[ent->node].elems[0];
3265          if (re_node_set_contains (cur_nodes, next_node))
3266            continue;
3267          err = re_node_set_init_1 (&new_dests, next_node);
3268          err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3269          err3 = re_node_set_merge (cur_nodes, &new_dests);
3270          re_node_set_free (&new_dests);
3271          if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3272                  || err3 != REG_NOERROR, 0))
3273            {
3274              err = (err != REG_NOERROR ? err
3275                     : (err2 != REG_NOERROR ? err2 : err3));
3276              return err;
3277            }
3278          /* TODO: It is still inefficient...  */
3279          goto restart;
3280        }
3281      else
3282        {
3283          re_node_set union_set;
3284          next_node = dfa->nexts[ent->node];
3285          if (mctx->state_log[to_idx])
3286            {
3287              int ret;
3288              if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3289                                        next_node))
3290                continue;
3291              err = re_node_set_init_copy (&union_set,
3292                                           &mctx->state_log[to_idx]->nodes);
3293              ret = re_node_set_insert (&union_set, next_node);
3294              if (BE (err != REG_NOERROR || ret < 0, 0))
3295                {
3296                  re_node_set_free (&union_set);
3297                  err = err != REG_NOERROR ? err : REG_ESPACE;
3298                  return err;
3299                }
3300            }
3301          else
3302            {
3303              err = re_node_set_init_1 (&union_set, next_node);
3304              if (BE (err != REG_NOERROR, 0))
3305                return err;
3306            }
3307          mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3308          re_node_set_free (&union_set);
3309          if (BE (mctx->state_log[to_idx] == NULL
3310                  && err != REG_NOERROR, 0))
3311            return err;
3312        }
3313    }
3314  while (ent++->more);
3315  return REG_NOERROR;
3316}
3317
3318/* Build transition table for the state.
3319   Return 1 if succeeded, otherwise return NULL.  */
3320
3321static int
3322internal_function
3323build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3324{
3325  reg_errcode_t err;
3326  int i, j, ch, need_word_trtable = 0;
3327  bitset_word_t elem, mask;
3328  bool dests_node_malloced = false;
3329  bool dest_states_malloced = false;
3330  int ndests; /* Number of the destination states from `state'.  */
3331  re_dfastate_t **trtable;
3332  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3333  re_node_set follows, *dests_node;
3334  bitset_t *dests_ch;
3335  bitset_t acceptable;
3336
3337  struct dests_alloc
3338  {
3339    re_node_set dests_node[SBC_MAX];
3340    bitset_t dests_ch[SBC_MAX];
3341  } *dests_alloc;
3342
3343  /* We build DFA states which corresponds to the destination nodes
3344     from `state'.  `dests_node[i]' represents the nodes which i-th
3345     destination state contains, and `dests_ch[i]' represents the
3346     characters which i-th destination state accepts.  */
3347#ifdef HAVE_ALLOCA
3348  if (__libc_use_alloca (sizeof (struct dests_alloc)))
3349    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3350  else
3351#endif
3352    {
3353      dests_alloc = re_malloc (struct dests_alloc, 1);
3354      if (BE (dests_alloc == NULL, 0))
3355        return 0;
3356      dests_node_malloced = true;
3357    }
3358  dests_node = dests_alloc->dests_node;
3359  dests_ch = dests_alloc->dests_ch;
3360
3361  /* Initialize transiton table.  */
3362  state->word_trtable = state->trtable = NULL;
3363
3364  /* At first, group all nodes belonging to `state' into several
3365     destinations.  */
3366  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3367  if (BE (ndests <= 0, 0))
3368    {
3369      if (dests_node_malloced)
3370        free (dests_alloc);
3371      /* Return 0 in case of an error, 1 otherwise.  */
3372      if (ndests == 0)
3373        {
3374          state->trtable = (re_dfastate_t **)
3375            calloc (sizeof (re_dfastate_t *), SBC_MAX);
3376          return 1;
3377        }
3378      return 0;
3379    }
3380
3381  err = re_node_set_alloc (&follows, ndests + 1);
3382  if (BE (err != REG_NOERROR, 0))
3383    goto out_free;
3384
3385  /* Avoid arithmetic overflow in size calculation.  */
3386  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3387            / (3 * sizeof (re_dfastate_t *)))
3388           < ndests),
3389          0))
3390    goto out_free;
3391
3392#ifdef HAVE_ALLOCA
3393  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3394                         + ndests * 3 * sizeof (re_dfastate_t *)))
3395    dest_states = (re_dfastate_t **)
3396      alloca (ndests * 3 * sizeof (re_dfastate_t *));
3397  else
3398#endif
3399    {
3400      dest_states = (re_dfastate_t **)
3401        malloc (ndests * 3 * sizeof (re_dfastate_t *));
3402      if (BE (dest_states == NULL, 0))
3403        {
3404out_free:
3405          if (dest_states_malloced)
3406            free (dest_states);
3407          re_node_set_free (&follows);
3408          for (i = 0; i < ndests; ++i)
3409            re_node_set_free (dests_node + i);
3410          if (dests_node_malloced)
3411            free (dests_alloc);
3412          return 0;
3413        }
3414      dest_states_malloced = true;
3415    }
3416  dest_states_word = dest_states + ndests;
3417  dest_states_nl = dest_states_word + ndests;
3418  bitset_empty (acceptable);
3419
3420  /* Then build the states for all destinations.  */
3421  for (i = 0; i < ndests; ++i)
3422    {
3423      int next_node;
3424      re_node_set_empty (&follows);
3425      /* Merge the follows of this destination states.  */
3426      for (j = 0; j < dests_node[i].nelem; ++j)
3427        {
3428          next_node = dfa->nexts[dests_node[i].elems[j]];
3429          if (next_node != -1)
3430            {
3431              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3432              if (BE (err != REG_NOERROR, 0))
3433                goto out_free;
3434            }
3435        }
3436      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3437      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3438        goto out_free;
3439      /* If the new state has context constraint,
3440         build appropriate states for these contexts.  */
3441      if (dest_states[i]->has_constraint)
3442        {
3443          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3444                                                          CONTEXT_WORD);
3445          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3446            goto out_free;
3447
3448          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3449            need_word_trtable = 1;
3450
3451          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3452                                                        CONTEXT_NEWLINE);
3453          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3454            goto out_free;
3455        }
3456      else
3457        {
3458          dest_states_word[i] = dest_states[i];
3459          dest_states_nl[i] = dest_states[i];
3460        }
3461      bitset_merge (acceptable, dests_ch[i]);
3462    }
3463
3464  if (!BE (need_word_trtable, 0))
3465    {
3466      /* We don't care about whether the following character is a word
3467         character, or we are in a single-byte character set so we can
3468         discern by looking at the character code: allocate a
3469         256-entry transition table.  */
3470      trtable = state->trtable =
3471        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3472      if (BE (trtable == NULL, 0))
3473        goto out_free;
3474
3475      /* For all characters ch...:  */
3476      for (i = 0; i < BITSET_WORDS; ++i)
3477        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3478             elem;
3479             mask <<= 1, elem >>= 1, ++ch)
3480          if (BE (elem & 1, 0))
3481            {
3482              /* There must be exactly one destination which accepts
3483                 character ch.  See group_nodes_into_DFAstates.  */
3484              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3485                ;
3486
3487              /* j-th destination accepts the word character ch.  */
3488              if (dfa->word_char[i] & mask)
3489                trtable[ch] = dest_states_word[j];
3490              else
3491                trtable[ch] = dest_states[j];
3492            }
3493    }
3494  else
3495    {
3496      /* We care about whether the following character is a word
3497         character, and we are in a multi-byte character set: discern
3498         by looking at the character code: build two 256-entry
3499         transition tables, one starting at trtable[0] and one
3500         starting at trtable[SBC_MAX].  */
3501      trtable = state->word_trtable =
3502        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3503      if (BE (trtable == NULL, 0))
3504        goto out_free;
3505
3506      /* For all characters ch...:  */
3507      for (i = 0; i < BITSET_WORDS; ++i)
3508        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3509             elem;
3510             mask <<= 1, elem >>= 1, ++ch)
3511          if (BE (elem & 1, 0))
3512            {
3513              /* There must be exactly one destination which accepts
3514                 character ch.  See group_nodes_into_DFAstates.  */
3515              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3516                ;
3517
3518              /* j-th destination accepts the word character ch.  */
3519              trtable[ch] = dest_states[j];
3520              trtable[ch + SBC_MAX] = dest_states_word[j];
3521            }
3522    }
3523
3524  /* new line */
3525  if (bitset_contain (acceptable, NEWLINE_CHAR))
3526    {
3527      /* The current state accepts newline character.  */
3528      for (j = 0; j < ndests; ++j)
3529        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3530          {
3531            /* k-th destination accepts newline character.  */
3532            trtable[NEWLINE_CHAR] = dest_states_nl[j];
3533            if (need_word_trtable)
3534              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3535            /* There must be only one destination which accepts
3536               newline.  See group_nodes_into_DFAstates.  */
3537            break;
3538          }
3539    }
3540
3541  if (dest_states_malloced)
3542    free (dest_states);
3543
3544  re_node_set_free (&follows);
3545  for (i = 0; i < ndests; ++i)
3546    re_node_set_free (dests_node + i);
3547
3548  if (dests_node_malloced)
3549    free (dests_alloc);
3550
3551  return 1;
3552}
3553
3554/* Group all nodes belonging to STATE into several destinations.
3555   Then for all destinations, set the nodes belonging to the destination
3556   to DESTS_NODE[i] and set the characters accepted by the destination
3557   to DEST_CH[i].  This function return the number of destinations.  */
3558
3559static int
3560internal_function
3561group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3562                            re_node_set *dests_node, bitset_t *dests_ch)
3563{
3564  reg_errcode_t err;
3565  int result;
3566  int i, j, k;
3567  int ndests; /* Number of the destinations from `state'.  */
3568  bitset_t accepts; /* Characters a node can accept.  */
3569  const re_node_set *cur_nodes = &state->nodes;
3570  bitset_empty (accepts);
3571  ndests = 0;
3572
3573  /* For all the nodes belonging to `state',  */
3574  for (i = 0; i < cur_nodes->nelem; ++i)
3575    {
3576      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3577      re_token_type_t type = node->type;
3578      unsigned int constraint = node->constraint;
3579
3580      /* Enumerate all single byte character this node can accept.  */
3581      if (type == CHARACTER)
3582        bitset_set (accepts, node->opr.c);
3583      else if (type == SIMPLE_BRACKET)
3584        {
3585          bitset_merge (accepts, node->opr.sbcset);
3586        }
3587      else if (type == OP_PERIOD)
3588        {
3589#ifdef RE_ENABLE_I18N
3590          if (dfa->mb_cur_max > 1)
3591            bitset_merge (accepts, dfa->sb_char);
3592          else
3593#endif
3594            bitset_set_all (accepts);
3595          if (!(dfa->syntax & RE_DOT_NEWLINE))
3596            bitset_clear (accepts, '\n');
3597          if (dfa->syntax & RE_DOT_NOT_NULL)
3598            bitset_clear (accepts, '\0');
3599        }
3600#ifdef RE_ENABLE_I18N
3601      else if (type == OP_UTF8_PERIOD)
3602        {
3603          memset (accepts, '\xff', sizeof (bitset_t) / 2);
3604          if (!(dfa->syntax & RE_DOT_NEWLINE))
3605            bitset_clear (accepts, '\n');
3606          if (dfa->syntax & RE_DOT_NOT_NULL)
3607            bitset_clear (accepts, '\0');
3608        }
3609#endif
3610      else
3611        continue;
3612
3613      /* Check the `accepts' and sift the characters which are not
3614         match it the context.  */
3615      if (constraint)
3616        {
3617          if (constraint & NEXT_NEWLINE_CONSTRAINT)
3618            {
3619              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3620              bitset_empty (accepts);
3621              if (accepts_newline)
3622                bitset_set (accepts, NEWLINE_CHAR);
3623              else
3624                continue;
3625            }
3626          if (constraint & NEXT_ENDBUF_CONSTRAINT)
3627            {
3628              bitset_empty (accepts);
3629              continue;
3630            }
3631
3632          if (constraint & NEXT_WORD_CONSTRAINT)
3633            {
3634              bitset_word_t any_set = 0;
3635              if (type == CHARACTER && !node->word_char)
3636                {
3637                  bitset_empty (accepts);
3638                  continue;
3639                }
3640#ifdef RE_ENABLE_I18N
3641              if (dfa->mb_cur_max > 1)
3642                for (j = 0; j < BITSET_WORDS; ++j)
3643                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3644              else
3645#endif
3646                for (j = 0; j < BITSET_WORDS; ++j)
3647                  any_set |= (accepts[j] &= dfa->word_char[j]);
3648              if (!any_set)
3649                continue;
3650            }
3651          if (constraint & NEXT_NOTWORD_CONSTRAINT)
3652            {
3653              bitset_word_t any_set = 0;
3654              if (type == CHARACTER && node->word_char)
3655                {
3656                  bitset_empty (accepts);
3657                  continue;
3658                }
3659#ifdef RE_ENABLE_I18N
3660              if (dfa->mb_cur_max > 1)
3661                for (j = 0; j < BITSET_WORDS; ++j)
3662                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3663              else
3664#endif
3665                for (j = 0; j < BITSET_WORDS; ++j)
3666                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3667              if (!any_set)
3668                continue;
3669            }
3670        }
3671
3672      /* Then divide `accepts' into DFA states, or create a new
3673         state.  Above, we make sure that accepts is not empty.  */
3674      for (j = 0; j < ndests; ++j)
3675        {
3676          bitset_t intersec; /* Intersection sets, see below.  */
3677          bitset_t remains;
3678          /* Flags, see below.  */
3679          bitset_word_t has_intersec, not_subset, not_consumed;
3680
3681          /* Optimization, skip if this state doesn't accept the character.  */
3682          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3683            continue;
3684
3685          /* Enumerate the intersection set of this state and `accepts'.  */
3686          has_intersec = 0;
3687          for (k = 0; k < BITSET_WORDS; ++k)
3688            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3689          /* And skip if the intersection set is empty.  */
3690          if (!has_intersec)
3691            continue;
3692
3693          /* Then check if this state is a subset of `accepts'.  */
3694          not_subset = not_consumed = 0;
3695          for (k = 0; k < BITSET_WORDS; ++k)
3696            {
3697              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3698              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3699            }
3700
3701          /* If this state isn't a subset of `accepts', create a
3702             new group state, which has the `remains'. */
3703          if (not_subset)
3704            {
3705              bitset_copy (dests_ch[ndests], remains);
3706              bitset_copy (dests_ch[j], intersec);
3707              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3708              if (BE (err != REG_NOERROR, 0))
3709                goto error_return;
3710              ++ndests;
3711            }
3712
3713          /* Put the position in the current group. */
3714          result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3715          if (BE (result < 0, 0))
3716            goto error_return;
3717
3718          /* If all characters are consumed, go to next node. */
3719          if (!not_consumed)
3720            break;
3721        }
3722      /* Some characters remain, create a new group. */
3723      if (j == ndests)
3724        {
3725          bitset_copy (dests_ch[ndests], accepts);
3726          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3727          if (BE (err != REG_NOERROR, 0))
3728            goto error_return;
3729          ++ndests;
3730          bitset_empty (accepts);
3731        }
3732    }
3733  return ndests;
3734 error_return:
3735  for (j = 0; j < ndests; ++j)
3736    re_node_set_free (dests_node + j);
3737  return -1;
3738}
3739
3740#ifdef RE_ENABLE_I18N
3741/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3742   Return the number of the bytes the node accepts.
3743   STR_IDX is the current index of the input string.
3744
3745   This function handles the nodes which can accept one character, or
3746   one collating element like '.', '[a-z]', opposite to the other nodes
3747   can only accept one byte.  */
3748
3749static int
3750internal_function
3751check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3752                         const re_string_t *input, int str_idx)
3753{
3754  const re_token_t *node = dfa->nodes + node_idx;
3755  int char_len, elem_len;
3756  int i;
3757  wint_t wc;
3758
3759  if (BE (node->type == OP_UTF8_PERIOD, 0))
3760    {
3761      unsigned char c = re_string_byte_at (input, str_idx), d;
3762      if (BE (c < 0xc2, 1))
3763        return 0;
3764
3765      if (str_idx + 2 > input->len)
3766        return 0;
3767
3768      d = re_string_byte_at (input, str_idx + 1);
3769      if (c < 0xe0)
3770        return (d < 0x80 || d > 0xbf) ? 0 : 2;
3771      else if (c < 0xf0)
3772        {
3773          char_len = 3;
3774          if (c == 0xe0 && d < 0xa0)
3775            return 0;
3776        }
3777      else if (c < 0xf8)
3778        {
3779          char_len = 4;
3780          if (c == 0xf0 && d < 0x90)
3781            return 0;
3782        }
3783      else if (c < 0xfc)
3784        {
3785          char_len = 5;
3786          if (c == 0xf8 && d < 0x88)
3787            return 0;
3788        }
3789      else if (c < 0xfe)
3790        {
3791          char_len = 6;
3792          if (c == 0xfc && d < 0x84)
3793            return 0;
3794        }
3795      else
3796        return 0;
3797
3798      if (str_idx + char_len > input->len)
3799        return 0;
3800
3801      for (i = 1; i < char_len; ++i)
3802        {
3803          d = re_string_byte_at (input, str_idx + i);
3804          if (d < 0x80 || d > 0xbf)
3805            return 0;
3806        }
3807      return char_len;
3808    }
3809
3810  char_len = re_string_char_size_at (input, str_idx);
3811  if (node->type == OP_PERIOD)
3812    {
3813      if (char_len <= 1)
3814        return 0;
3815      /* FIXME: I don't think this if is needed, as both '\n'
3816         and '\0' are char_len == 1.  */
3817      /* '.' accepts any one character except the following two cases.  */
3818      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3819           re_string_byte_at (input, str_idx) == '\n') ||
3820          ((dfa->syntax & RE_DOT_NOT_NULL) &&
3821           re_string_byte_at (input, str_idx) == '\0'))
3822        return 0;
3823      return char_len;
3824    }
3825
3826  elem_len = re_string_elem_size_at (input, str_idx);
3827  wc = __btowc(*(input->mbs+str_idx));
3828  if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3829    return 0;
3830
3831  if (node->type == COMPLEX_BRACKET)
3832    {
3833      const re_charset_t *cset = node->opr.mbcset;
3834# ifdef _LIBC
3835      const unsigned char *pin
3836        = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3837      int j;
3838      uint32_t nrules;
3839# endif /* _LIBC */
3840      int match_len = 0;
3841      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3842                    ? re_string_wchar_at (input, str_idx) : 0);
3843
3844      /* match with multibyte character?  */
3845      for (i = 0; i < cset->nmbchars; ++i)
3846        if (wc == cset->mbchars[i])
3847          {
3848            match_len = char_len;
3849            goto check_node_accept_bytes_match;
3850          }
3851      /* match with character_class?  */
3852      for (i = 0; i < cset->nchar_classes; ++i)
3853        {
3854          wctype_t wt = cset->char_classes[i];
3855          if (__iswctype (wc, wt))
3856            {
3857              match_len = char_len;
3858              goto check_node_accept_bytes_match;
3859            }
3860        }
3861
3862# ifdef _LIBC
3863      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3864      if (nrules != 0)
3865        {
3866          unsigned int in_collseq = 0;
3867          const int32_t *table, *indirect;
3868          const unsigned char *weights, *extra;
3869          const char *collseqwc;
3870          /* This #include defines a local function!  */
3871#  include <locale/weight.h>
3872
3873          /* match with collating_symbol?  */
3874          if (cset->ncoll_syms)
3875            extra = (const unsigned char *)
3876              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3877          for (i = 0; i < cset->ncoll_syms; ++i)
3878            {
3879              const unsigned char *coll_sym = extra + cset->coll_syms[i];
3880              /* Compare the length of input collating element and
3881                 the length of current collating element.  */
3882              if (*coll_sym != elem_len)
3883                continue;
3884              /* Compare each bytes.  */
3885              for (j = 0; j < *coll_sym; j++)
3886                if (pin[j] != coll_sym[1 + j])
3887                  break;
3888              if (j == *coll_sym)
3889                {
3890                  /* Match if every bytes is equal.  */
3891                  match_len = j;
3892                  goto check_node_accept_bytes_match;
3893                }
3894            }
3895
3896          if (cset->nranges)
3897            {
3898              if (elem_len <= char_len)
3899                {
3900                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3901                  in_collseq = __collseq_table_lookup (collseqwc, wc);
3902                }
3903              else
3904                in_collseq = find_collation_sequence_value (pin, elem_len);
3905            }
3906          /* match with range expression?  */
3907          for (i = 0; i < cset->nranges; ++i)
3908            if (cset->range_starts[i] <= in_collseq
3909                && in_collseq <= cset->range_ends[i])
3910              {
3911                match_len = elem_len;
3912                goto check_node_accept_bytes_match;
3913              }
3914
3915          /* match with equivalence_class?  */
3916          if (cset->nequiv_classes)
3917            {
3918              const unsigned char *cp = pin;
3919              table = (const int32_t *)
3920                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3921              weights = (const unsigned char *)
3922                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3923              extra = (const unsigned char *)
3924                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3925              indirect = (const int32_t *)
3926                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3927              int32_t idx = findidx (&cp);
3928              if (idx > 0)
3929                for (i = 0; i < cset->nequiv_classes; ++i)
3930                  {
3931                    int32_t equiv_class_idx = cset->equiv_classes[i];
3932                    size_t weight_len = weights[idx & 0xffffff];
3933                    if (weight_len == weights[equiv_class_idx & 0xffffff]
3934                        && (idx >> 24) == (equiv_class_idx >> 24))
3935                      {
3936                        int cnt = 0;
3937
3938                        idx &= 0xffffff;
3939                        equiv_class_idx &= 0xffffff;
3940
3941                        while (cnt <= weight_len
3942                               && (weights[equiv_class_idx + 1 + cnt]
3943                                   == weights[idx + 1 + cnt]))
3944                          ++cnt;
3945                        if (cnt > weight_len)
3946                          {
3947                            match_len = elem_len;
3948                            goto check_node_accept_bytes_match;
3949                          }
3950                      }
3951                  }
3952            }
3953        }
3954      else
3955# endif /* _LIBC */
3956        {
3957          /* match with range expression?  */
3958#if __GNUC__ >= 2
3959          wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3960#else
3961          wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3962          cmp_buf[2] = wc;
3963#endif
3964          for (i = 0; i < cset->nranges; ++i)
3965            {
3966              cmp_buf[0] = cset->range_starts[i];
3967              cmp_buf[4] = cset->range_ends[i];
3968              if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3969                  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3970                {
3971                  match_len = char_len;
3972                  goto check_node_accept_bytes_match;
3973                }
3974            }
3975        }
3976    check_node_accept_bytes_match:
3977      if (!cset->non_match)
3978        return match_len;
3979      else
3980        {
3981          if (match_len > 0)
3982            return 0;
3983          else
3984            return (elem_len > char_len) ? elem_len : char_len;
3985        }
3986    }
3987  return 0;
3988}
3989
3990# ifdef _LIBC
3991static unsigned int
3992internal_function
3993find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3994{
3995  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3996  if (nrules == 0)
3997    {
3998      if (mbs_len == 1)
3999        {
4000          /* No valid character.  Match it as a single byte character.  */
4001          const unsigned char *collseq = (const unsigned char *)
4002            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4003          return collseq[mbs[0]];
4004        }
4005      return UINT_MAX;
4006    }
4007  else
4008    {
4009      int32_t idx;
4010      const unsigned char *extra = (const unsigned char *)
4011        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4012      int32_t extrasize = (const unsigned char *)
4013        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4014
4015      for (idx = 0; idx < extrasize;)
4016        {
4017          int mbs_cnt, found = 0;
4018          int32_t elem_mbs_len;
4019          /* Skip the name of collating element name.  */
4020          idx = idx + extra[idx] + 1;
4021          elem_mbs_len = extra[idx++];
4022          if (mbs_len == elem_mbs_len)
4023            {
4024              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4025                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4026                  break;
4027              if (mbs_cnt == elem_mbs_len)
4028                /* Found the entry.  */
4029                found = 1;
4030            }
4031          /* Skip the byte sequence of the collating element.  */
4032          idx += elem_mbs_len;
4033          /* Adjust for the alignment.  */
4034          idx = (idx + 3) & ~3;
4035          /* Skip the collation sequence value.  */
4036          idx += sizeof (uint32_t);
4037          /* Skip the wide char sequence of the collating element.  */
4038          idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4039          /* If we found the entry, return the sequence value.  */
4040          if (found)
4041            return *(uint32_t *) (extra + idx);
4042          /* Skip the collation sequence value.  */
4043          idx += sizeof (uint32_t);
4044        }
4045      return UINT_MAX;
4046    }
4047}
4048# endif /* _LIBC */
4049#endif /* RE_ENABLE_I18N */
4050
4051/* Check whether the node accepts the byte which is IDX-th
4052   byte of the INPUT.  */
4053
4054static int
4055internal_function
4056check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4057                   int idx)
4058{
4059  unsigned char ch;
4060  ch = re_string_byte_at (&mctx->input, idx);
4061  switch (node->type)
4062    {
4063    case CHARACTER:
4064      if (node->opr.c != ch)
4065        return 0;
4066      break;
4067
4068    case SIMPLE_BRACKET:
4069      if (!bitset_contain (node->opr.sbcset, ch))
4070        return 0;
4071      break;
4072
4073#ifdef RE_ENABLE_I18N
4074    case OP_UTF8_PERIOD:
4075      if (ch >= 0x80)
4076        return 0;
4077      /* FALLTHROUGH */
4078#endif
4079    case OP_PERIOD:
4080      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4081          || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4082        return 0;
4083      break;
4084
4085    default:
4086      return 0;
4087    }
4088
4089  if (node->constraint)
4090    {
4091      /* The node has constraints.  Check whether the current context
4092         satisfies the constraints.  */
4093      unsigned int context = re_string_context_at (&mctx->input, idx,
4094                                                   mctx->eflags);
4095      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4096        return 0;
4097    }
4098
4099  return 1;
4100}
4101
4102/* Extend the buffers, if the buffers have run out.  */
4103
4104static reg_errcode_t
4105internal_function
4106extend_buffers (re_match_context_t *mctx)
4107{
4108  reg_errcode_t ret;
4109  re_string_t *pstr = &mctx->input;
4110
4111  /* Avoid overflow.  */
4112  if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4113    return REG_ESPACE;
4114
4115  /* Double the lengthes of the buffers.  */
4116  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4117  if (BE (ret != REG_NOERROR, 0))
4118    return ret;
4119
4120  if (mctx->state_log != NULL)
4121    {
4122      /* And double the length of state_log.  */
4123      /* XXX We have no indication of the size of this buffer.  If this
4124         allocation fail we have no indication that the state_log array
4125         does not have the right size.  */
4126      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4127                                              pstr->bufs_len + 1);
4128      if (BE (new_array == NULL, 0))
4129        return REG_ESPACE;
4130      mctx->state_log = new_array;
4131    }
4132
4133  /* Then reconstruct the buffers.  */
4134  if (pstr->icase)
4135    {
4136#ifdef RE_ENABLE_I18N
4137      if (pstr->mb_cur_max > 1)
4138        {
4139          ret = build_wcs_upper_buffer (pstr);
4140          if (BE (ret != REG_NOERROR, 0))
4141            return ret;
4142        }
4143      else
4144#endif /* RE_ENABLE_I18N  */
4145        build_upper_buffer (pstr);
4146    }
4147  else
4148    {
4149#ifdef RE_ENABLE_I18N
4150      if (pstr->mb_cur_max > 1)
4151        build_wcs_buffer (pstr);
4152      else
4153#endif /* RE_ENABLE_I18N  */
4154        {
4155          if (pstr->trans != NULL)
4156            re_string_translate_buffer (pstr);
4157        }
4158    }
4159  return REG_NOERROR;
4160}
4161
4162\f
4163/* Functions for matching context.  */
4164
4165/* Initialize MCTX.  */
4166
4167static reg_errcode_t
4168internal_function
4169match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4170{
4171  mctx->eflags = eflags;
4172  mctx->match_last = -1;
4173  if (n > 0)
4174    {
4175      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4176      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4177      if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4178        return REG_ESPACE;
4179    }
4180  /* Already zero-ed by the caller.
4181     else
4182       mctx->bkref_ents = NULL;
4183     mctx->nbkref_ents = 0;
4184     mctx->nsub_tops = 0;  */
4185  mctx->abkref_ents = n;
4186  mctx->max_mb_elem_len = 1;
4187  mctx->asub_tops = n;
4188  return REG_NOERROR;
4189}
4190
4191/* Clean the entries which depend on the current input in MCTX.
4192   This function must be invoked when the matcher changes the start index
4193   of the input, or changes the input string.  */
4194
4195static void
4196internal_function
4197match_ctx_clean (re_match_context_t *mctx)
4198{
4199  int st_idx;
4200  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4201    {
4202      int sl_idx;
4203      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4204      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4205        {
4206          re_sub_match_last_t *last = top->lasts[sl_idx];
4207          re_free (last->path.array);
4208          re_free (last);
4209        }
4210      re_free (top->lasts);
4211      if (top->path)
4212        {
4213          re_free (top->path->array);
4214          re_free (top->path);
4215        }
4216      free (top);
4217    }
4218
4219  mctx->nsub_tops = 0;
4220  mctx->nbkref_ents = 0;
4221}
4222
4223/* Free all the memory associated with MCTX.  */
4224
4225static void
4226internal_function
4227match_ctx_free (re_match_context_t *mctx)
4228{
4229  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4230  match_ctx_clean (mctx);
4231  re_free (mctx->sub_tops);
4232  re_free (mctx->bkref_ents);
4233}
4234
4235/* Add a new backreference entry to MCTX.
4236   Note that we assume that caller never call this function with duplicate
4237   entry, and call with STR_IDX which isn't smaller than any existing entry.
4238*/
4239
4240static reg_errcode_t
4241internal_function
4242match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4243                     int to)
4244{
4245  if (mctx->nbkref_ents >= mctx->abkref_ents)
4246    {
4247      struct re_backref_cache_entry* new_entry;
4248      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4249                              mctx->abkref_ents * 2);
4250      if (BE (new_entry == NULL, 0))
4251        {
4252          re_free (mctx->bkref_ents);
4253          return REG_ESPACE;
4254        }
4255      mctx->bkref_ents = new_entry;
4256      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4257              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4258      mctx->abkref_ents *= 2;
4259    }
4260  if (mctx->nbkref_ents > 0
4261      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4262    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4263
4264  mctx->bkref_ents[mctx->nbkref_ents].node = node;
4265  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4266  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4267  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4268
4269  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4270     If bit N is clear, means that this entry won't epsilon-transition to
4271     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4272     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4273     such node.
4274
4275     A backreference does not epsilon-transition unless it is empty, so set
4276     to all zeros if FROM != TO.  */
4277  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4278    = (from == to ? ~0 : 0);
4279
4280  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4281  if (mctx->max_mb_elem_len < to - from)
4282    mctx->max_mb_elem_len = to - from;
4283  return REG_NOERROR;
4284}
4285
4286/* Search for the first entry which has the same str_idx, or -1 if none is
4287   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4288
4289static int
4290internal_function
4291search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4292{
4293  int left, right, mid, last;
4294  last = right = mctx->nbkref_ents;
4295  for (left = 0; left < right;)
4296    {
4297      mid = (left + right) / 2;
4298      if (mctx->bkref_ents[mid].str_idx < str_idx)
4299        left = mid + 1;
4300      else
4301        right = mid;
4302    }
4303  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4304    return left;
4305  else
4306    return -1;
4307}
4308
4309/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4310   at STR_IDX.  */
4311
4312static reg_errcode_t
4313internal_function
4314match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4315{
4316#ifdef DEBUG
4317  assert (mctx->sub_tops != NULL);
4318  assert (mctx->asub_tops > 0);
4319#endif
4320  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4321    {
4322      int new_asub_tops = mctx->asub_tops * 2;
4323      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4324                                                   re_sub_match_top_t *,
4325                                                   new_asub_tops);
4326      if (BE (new_array == NULL, 0))
4327        return REG_ESPACE;
4328      mctx->sub_tops = new_array;
4329      mctx->asub_tops = new_asub_tops;
4330    }
4331  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4332  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4333    return REG_ESPACE;
4334  mctx->sub_tops[mctx->nsub_tops]->node = node;
4335  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4336  return REG_NOERROR;
4337}
4338
4339/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4340   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4341
4342static re_sub_match_last_t *
4343internal_function
4344match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4345{
4346  re_sub_match_last_t *new_entry;
4347  if (BE (subtop->nlasts == subtop->alasts, 0))
4348    {
4349      int new_alasts = 2 * subtop->alasts + 1;
4350      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4351                                                    re_sub_match_last_t *,
4352                                                    new_alasts);
4353      if (BE (new_array == NULL, 0))
4354        return NULL;
4355      subtop->lasts = new_array;
4356      subtop->alasts = new_alasts;
4357    }
4358  new_entry = calloc (1, sizeof (re_sub_match_last_t));
4359  if (BE (new_entry != NULL, 1))
4360    {
4361      subtop->lasts[subtop->nlasts] = new_entry;
4362      new_entry->node = node;
4363      new_entry->str_idx = str_idx;
4364      ++subtop->nlasts;
4365    }
4366  return new_entry;
4367}
4368
4369static void
4370internal_function
4371sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4372               re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4373{
4374  sctx->sifted_states = sifted_sts;
4375  sctx->limited_states = limited_sts;
4376  sctx->last_node = last_node;
4377  sctx->last_str_idx = last_str_idx;
4378  re_node_set_init_empty (&sctx->limits);
4379}