compat / regex / regexec.con commit Merge branch 'lp/maint-diff-three-dash-with-graph' (31a199a)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                     int n) internal_function;
  23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24static void match_ctx_free (re_match_context_t *cache) internal_function;
  25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                          int str_idx, int from, int to)
  27     internal_function;
  28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29     internal_function;
  30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                           int str_idx) internal_function;
  32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                   int node, int str_idx)
  34     internal_function;
  35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                           re_dfastate_t **limited_sts, int last_node,
  37                           int last_str_idx)
  38     internal_function;
  39static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                         const char *string, int length,
  41                                         int start, int range, int stop,
  42                                         size_t nmatch, regmatch_t pmatch[],
  43                                         int eflags);
  44static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                             const char *string1, int length1,
  46                             const char *string2, int length2,
  47                             int start, int range, struct re_registers *regs,
  48                             int stop, int ret_len);
  49static int re_search_stub (struct re_pattern_buffer *bufp,
  50                           const char *string, int length, int start,
  51                           int range, int stop, struct re_registers *regs,
  52                           int ret_len);
  53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                              int nregs, int regs_allocated);
  55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
  56static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                           int *p_match_first) internal_function;
  58static int check_halt_state_context (const re_match_context_t *mctx,
  59                                     const re_dfastate_t *state, int idx)
  60     internal_function;
  61static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  62                         regmatch_t *prev_idx_match, int cur_node,
  63                         int cur_idx, int nmatch) internal_function;
  64static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  65                                      int str_idx, int dest_node, int nregs,
  66                                      regmatch_t *regs,
  67                                      re_node_set *eps_via_nodes)
  68     internal_function;
  69static reg_errcode_t set_regs (const regex_t *preg,
  70                               const re_match_context_t *mctx,
  71                               size_t nmatch, regmatch_t *pmatch,
  72                               int fl_backtrack) internal_function;
  73static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  74     internal_function;
  75
  76#ifdef RE_ENABLE_I18N
  77static int sift_states_iter_mb (const re_match_context_t *mctx,
  78                                re_sift_context_t *sctx,
  79                                int node_idx, int str_idx, int max_str_idx)
  80     internal_function;
  81#endif /* RE_ENABLE_I18N */
  82static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  83                                           re_sift_context_t *sctx)
  84     internal_function;
  85static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  86                                          re_sift_context_t *sctx, int str_idx,
  87                                          re_node_set *cur_dest)
  88     internal_function;
  89static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  90                                              re_sift_context_t *sctx,
  91                                              int str_idx,
  92                                              re_node_set *dest_nodes)
  93     internal_function;
  94static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  95                                            re_node_set *dest_nodes,
  96                                            const re_node_set *candidates)
  97     internal_function;
  98static int check_dst_limits (const re_match_context_t *mctx,
  99                             re_node_set *limits,
 100                             int dst_node, int dst_idx, int src_node,
 101                             int src_idx) internal_function;
 102static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 103                                        int boundaries, int subexp_idx,
 104                                        int from_node, int bkref_idx)
 105     internal_function;
 106static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 107                                      int limit, int subexp_idx,
 108                                      int node, int str_idx,
 109                                      int bkref_idx) internal_function;
 110static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 111                                          re_node_set *dest_nodes,
 112                                          const re_node_set *candidates,
 113                                          re_node_set *limits,
 114                                          struct re_backref_cache_entry *bkref_ents,
 115                                          int str_idx) internal_function;
 116static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 117                                        re_sift_context_t *sctx,
 118                                        int str_idx, const re_node_set *candidates)
 119     internal_function;
 120static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 121                                        re_dfastate_t **dst,
 122                                        re_dfastate_t **src, int num)
 123     internal_function;
 124static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 125                                         re_match_context_t *mctx) internal_function;
 126static re_dfastate_t *transit_state (reg_errcode_t *err,
 127                                     re_match_context_t *mctx,
 128                                     re_dfastate_t *state) internal_function;
 129static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 130                                            re_match_context_t *mctx,
 131                                            re_dfastate_t *next_state)
 132     internal_function;
 133static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 134                                                re_node_set *cur_nodes,
 135                                                int str_idx) internal_function;
 136#if 0
 137static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 138                                        re_match_context_t *mctx,
 139                                        re_dfastate_t *pstate)
 140     internal_function;
 141#endif
 142#ifdef RE_ENABLE_I18N
 143static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                       re_dfastate_t *pstate)
 145     internal_function;
 146#endif /* RE_ENABLE_I18N */
 147static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 148                                          const re_node_set *nodes)
 149     internal_function;
 150static reg_errcode_t get_subexp (re_match_context_t *mctx,
 151                                 int bkref_node, int bkref_str_idx)
 152     internal_function;
 153static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                     const re_sub_match_top_t *sub_top,
 155                                     re_sub_match_last_t *sub_last,
 156                                     int bkref_node, int bkref_str)
 157     internal_function;
 158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                             int subexp_idx, int type) internal_function;
 160static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                    state_array_t *path, int top_node,
 162                                    int top_str, int last_node, int last_str,
 163                                    int type) internal_function;
 164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                   int str_idx,
 166                                                   re_node_set *cur_nodes,
 167                                                   re_node_set *next_nodes)
 168     internal_function;
 169static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 170                                               re_node_set *cur_nodes,
 171                                               int ex_subexp, int type)
 172     internal_function;
 173static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 174                                                   re_node_set *dst_nodes,
 175                                                   int target, int ex_subexp,
 176                                                   int type) internal_function;
 177static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                         re_node_set *cur_nodes, int cur_str,
 179                                         int subexp_num, int type)
 180     internal_function;
 181static int build_trtable (const re_dfa_t *dfa,
 182                          re_dfastate_t *state) internal_function;
 183#ifdef RE_ENABLE_I18N
 184static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 185                                    const re_string_t *input, int idx)
 186     internal_function;
 187# ifdef _LIBC
 188static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 189                                                   size_t name_len)
 190     internal_function;
 191# endif /* _LIBC */
 192#endif /* RE_ENABLE_I18N */
 193static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 194                                       const re_dfastate_t *state,
 195                                       re_node_set *states_node,
 196                                       bitset_t *states_ch) internal_function;
 197static int check_node_accept (const re_match_context_t *mctx,
 198                              const re_token_t *node, int idx)
 199     internal_function;
 200static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 201     internal_function;
 202\f
 203/* Entry point for POSIX code.  */
 204
 205/* regexec searches for a given pattern, specified by PREG, in the
 206   string STRING.
 207
 208   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 209   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 210   least NMATCH elements, and we set them to the offsets of the
 211   corresponding matched substrings.
 212
 213   EFLAGS specifies `execution flags' which affect matching: if
 214   REG_NOTBOL is set, then ^ does not match at the beginning of the
 215   string; if REG_NOTEOL is set, then $ does not match at the end.
 216
 217   We return 0 if we find a match and REG_NOMATCH if not.  */
 218
 219int
 220regexec (
 221        const regex_t *__restrict preg,
 222        const char *__restrict string,
 223        size_t nmatch,
 224        regmatch_t pmatch[],
 225        int eflags)
 226{
 227  reg_errcode_t err;
 228  int start, length;
 229
 230  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 231    return REG_BADPAT;
 232
 233  if (eflags & REG_STARTEND)
 234    {
 235      start = pmatch[0].rm_so;
 236      length = pmatch[0].rm_eo;
 237    }
 238  else
 239    {
 240      start = 0;
 241      length = strlen (string);
 242    }
 243
 244  __libc_lock_lock (dfa->lock);
 245  if (preg->no_sub)
 246    err = re_search_internal (preg, string, length, start, length - start,
 247                              length, 0, NULL, eflags);
 248  else
 249    err = re_search_internal (preg, string, length, start, length - start,
 250                              length, nmatch, pmatch, eflags);
 251  __libc_lock_unlock (dfa->lock);
 252  return err != REG_NOERROR;
 253}
 254
 255#ifdef _LIBC
 256# include <shlib-compat.h>
 257versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 258
 259# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 260__typeof__ (__regexec) __compat_regexec;
 261
 262int
 263attribute_compat_text_section
 264__compat_regexec (const regex_t *__restrict preg,
 265                  const char *__restrict string, size_t nmatch,
 266                  regmatch_t pmatch[], int eflags)
 267{
 268  return regexec (preg, string, nmatch, pmatch,
 269                  eflags & (REG_NOTBOL | REG_NOTEOL));
 270}
 271compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 272# endif
 273#endif
 274
 275/* Entry points for GNU code.  */
 276
 277/* re_match, re_search, re_match_2, re_search_2
 278
 279   The former two functions operate on STRING with length LENGTH,
 280   while the later two operate on concatenation of STRING1 and STRING2
 281   with lengths LENGTH1 and LENGTH2, respectively.
 282
 283   re_match() matches the compiled pattern in BUFP against the string,
 284   starting at index START.
 285
 286   re_search() first tries matching at index START, then it tries to match
 287   starting from index START + 1, and so on.  The last start position tried
 288   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 289   way as re_match().)
 290
 291   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 292   the first STOP characters of the concatenation of the strings should be
 293   concerned.
 294
 295   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 296   and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 297   computed relative to the concatenation, not relative to the individual
 298   strings.)
 299
 300   On success, re_match* functions return the length of the match, re_search*
 301   return the position of the start of the match.  Return value -1 means no
 302   match was found and -2 indicates an internal error.  */
 303
 304int
 305re_match (struct re_pattern_buffer *bufp,
 306          const char *string,
 307          int length,
 308          int start,
 309          struct re_registers *regs)
 310{
 311  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 312}
 313#ifdef _LIBC
 314weak_alias (__re_match, re_match)
 315#endif
 316
 317int
 318re_search (struct re_pattern_buffer *bufp,
 319           const char *string,
 320           int length, int start, int range,
 321           struct re_registers *regs)
 322{
 323  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 324}
 325#ifdef _LIBC
 326weak_alias (__re_search, re_search)
 327#endif
 328
 329int
 330re_match_2 (struct re_pattern_buffer *bufp,
 331            const char *string1, int length1,
 332            const char *string2, int length2, int start,
 333            struct re_registers *regs, int stop)
 334{
 335  return re_search_2_stub (bufp, string1, length1, string2, length2,
 336                           start, 0, regs, stop, 1);
 337}
 338#ifdef _LIBC
 339weak_alias (__re_match_2, re_match_2)
 340#endif
 341
 342int
 343re_search_2 (struct re_pattern_buffer *bufp,
 344             const char *string1, int length1,
 345             const char *string2, int length2, int start,
 346             int range, struct re_registers *regs,  int stop)
 347{
 348  return re_search_2_stub (bufp, string1, length1, string2, length2,
 349                           start, range, regs, stop, 0);
 350}
 351#ifdef _LIBC
 352weak_alias (__re_search_2, re_search_2)
 353#endif
 354
 355static int
 356re_search_2_stub (struct re_pattern_buffer *bufp,
 357                  const char *string1, int length1,
 358                  const char *string2, int length2, int start,
 359                  int range, struct re_registers *regs,
 360                  int stop, int ret_len)
 361{
 362  const char *str;
 363  int rval;
 364  int len = length1 + length2;
 365  int free_str = 0;
 366
 367  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 368    return -2;
 369
 370  /* Concatenate the strings.  */
 371  if (length2 > 0)
 372    if (length1 > 0)
 373      {
 374        char *s = re_malloc (char, len);
 375
 376        if (BE (s == NULL, 0))
 377          return -2;
 378        memcpy (s, string1, length1);
 379        memcpy (s + length1, string2, length2);
 380        str = s;
 381        free_str = 1;
 382      }
 383    else
 384      str = string2;
 385  else
 386    str = string1;
 387
 388  rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 389  if (free_str)
 390    re_free ((char *) str);
 391  return rval;
 392}
 393
 394/* The parameters have the same meaning as those of re_search.
 395   Additional parameters:
 396   If RET_LEN is nonzero the length of the match is returned (re_match style);
 397   otherwise the position of the match is returned.  */
 398
 399static int
 400re_search_stub (struct re_pattern_buffer *bufp,
 401                const char *string, int length, int start,
 402                int range, int stop,
 403                struct re_registers *regs, int ret_len)
 404{
 405  reg_errcode_t result;
 406  regmatch_t *pmatch;
 407  int nregs, rval;
 408  int eflags = 0;
 409
 410  /* Check for out-of-range.  */
 411  if (BE (start < 0 || start > length, 0))
 412    return -1;
 413  if (BE (start + range > length, 0))
 414    range = length - start;
 415  else if (BE (start + range < 0, 0))
 416    range = -start;
 417
 418  __libc_lock_lock (dfa->lock);
 419
 420  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 421  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 422
 423  /* Compile fastmap if we haven't yet.  */
 424  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 425    re_compile_fastmap (bufp);
 426
 427  if (BE (bufp->no_sub, 0))
 428    regs = NULL;
 429
 430  /* We need at least 1 register.  */
 431  if (regs == NULL)
 432    nregs = 1;
 433  else if (BE (bufp->regs_allocated == REGS_FIXED &&
 434               regs->num_regs < bufp->re_nsub + 1, 0))
 435    {
 436      nregs = regs->num_regs;
 437      if (BE (nregs < 1, 0))
 438        {
 439          /* Nothing can be copied to regs.  */
 440          regs = NULL;
 441          nregs = 1;
 442        }
 443    }
 444  else
 445    nregs = bufp->re_nsub + 1;
 446  pmatch = re_malloc (regmatch_t, nregs);
 447  if (BE (pmatch == NULL, 0))
 448    {
 449      rval = -2;
 450      goto out;
 451    }
 452
 453  result = re_search_internal (bufp, string, length, start, range, stop,
 454                               nregs, pmatch, eflags);
 455
 456  rval = 0;
 457
 458  /* I hope we needn't fill ther regs with -1's when no match was found.  */
 459  if (result != REG_NOERROR)
 460    rval = -1;
 461  else if (regs != NULL)
 462    {
 463      /* If caller wants register contents data back, copy them.  */
 464      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 465                                           bufp->regs_allocated);
 466      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 467        rval = -2;
 468    }
 469
 470  if (BE (rval == 0, 1))
 471    {
 472      if (ret_len)
 473        {
 474          assert (pmatch[0].rm_so == start);
 475          rval = pmatch[0].rm_eo - start;
 476        }
 477      else
 478        rval = pmatch[0].rm_so;
 479    }
 480  re_free (pmatch);
 481 out:
 482  __libc_lock_unlock (dfa->lock);
 483  return rval;
 484}
 485
 486static unsigned
 487re_copy_regs (struct re_registers *regs,
 488              regmatch_t *pmatch,
 489              int nregs, int regs_allocated)
 490{
 491  int rval = REGS_REALLOCATE;
 492  int i;
 493  int need_regs = nregs + 1;
 494  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 495     uses.  */
 496
 497  /* Have the register data arrays been allocated?  */
 498  if (regs_allocated == REGS_UNALLOCATED)
 499    { /* No.  So allocate them with malloc.  */
 500      regs->start = re_malloc (regoff_t, need_regs);
 501      if (BE (regs->start == NULL, 0))
 502        return REGS_UNALLOCATED;
 503      regs->end = re_malloc (regoff_t, need_regs);
 504      if (BE (regs->end == NULL, 0))
 505        {
 506          re_free (regs->start);
 507          return REGS_UNALLOCATED;
 508        }
 509      regs->num_regs = need_regs;
 510    }
 511  else if (regs_allocated == REGS_REALLOCATE)
 512    { /* Yes.  If we need more elements than were already
 513         allocated, reallocate them.  If we need fewer, just
 514         leave it alone.  */
 515      if (BE (need_regs > regs->num_regs, 0))
 516        {
 517          regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 518          regoff_t *new_end;
 519          if (BE (new_start == NULL, 0))
 520            return REGS_UNALLOCATED;
 521          new_end = re_realloc (regs->end, regoff_t, need_regs);
 522          if (BE (new_end == NULL, 0))
 523            {
 524              re_free (new_start);
 525              return REGS_UNALLOCATED;
 526            }
 527          regs->start = new_start;
 528          regs->end = new_end;
 529          regs->num_regs = need_regs;
 530        }
 531    }
 532  else
 533    {
 534      assert (regs_allocated == REGS_FIXED);
 535      /* This function may not be called with REGS_FIXED and nregs too big.  */
 536      assert (regs->num_regs >= nregs);
 537      rval = REGS_FIXED;
 538    }
 539
 540  /* Copy the regs.  */
 541  for (i = 0; i < nregs; ++i)
 542    {
 543      regs->start[i] = pmatch[i].rm_so;
 544      regs->end[i] = pmatch[i].rm_eo;
 545    }
 546  for ( ; i < regs->num_regs; ++i)
 547    regs->start[i] = regs->end[i] = -1;
 548
 549  return rval;
 550}
 551
 552/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 553   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 554   this memory for recording register information.  STARTS and ENDS
 555   must be allocated using the malloc library routine, and must each
 556   be at least NUM_REGS * sizeof (regoff_t) bytes long.
 557
 558   If NUM_REGS == 0, then subsequent matches should allocate their own
 559   register data.
 560
 561   Unless this function is called, the first search or match using
 562   PATTERN_BUFFER will allocate its own register data, without
 563   freeing the old data.  */
 564
 565void
 566re_set_registers (struct re_pattern_buffer *bufp,
 567                  struct re_registers *regs,
 568                  unsigned num_regs,
 569                  regoff_t *starts,
 570                  regoff_t *ends)
 571{
 572  if (num_regs)
 573    {
 574      bufp->regs_allocated = REGS_REALLOCATE;
 575      regs->num_regs = num_regs;
 576      regs->start = starts;
 577      regs->end = ends;
 578    }
 579  else
 580    {
 581      bufp->regs_allocated = REGS_UNALLOCATED;
 582      regs->num_regs = 0;
 583      regs->start = regs->end = (regoff_t *) 0;
 584    }
 585}
 586#ifdef _LIBC
 587weak_alias (__re_set_registers, re_set_registers)
 588#endif
 589\f
 590/* Entry points compatible with 4.2 BSD regex library.  We don't define
 591   them unless specifically requested.  */
 592
 593#if defined _REGEX_RE_COMP || defined _LIBC
 594int
 595# ifdef _LIBC
 596weak_function
 597# endif
 598re_exec (s)
 599     const char *s;
 600{
 601  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 602}
 603#endif /* _REGEX_RE_COMP */
 604\f
 605/* Internal entry point.  */
 606
 607/* Searches for a compiled pattern PREG in the string STRING, whose
 608   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 609   mingings with regexec.  START, and RANGE have the same meanings
 610   with re_search.
 611   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 612   otherwise return the error code.
 613   Note: We assume front end functions already check ranges.
 614   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 615
 616static reg_errcode_t
 617re_search_internal (const regex_t *preg,
 618                    const char *string,
 619                    int length, int start, int range, int stop,
 620                    size_t nmatch, regmatch_t pmatch[],
 621                    int eflags)
 622{
 623  reg_errcode_t err;
 624  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 625  int left_lim, right_lim, incr;
 626  int fl_longest_match, match_first, match_kind, match_last = -1;
 627  int extra_nmatch;
 628  int sb, ch;
 629#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 630  re_match_context_t mctx = { .dfa = dfa };
 631#else
 632  re_match_context_t mctx;
 633#endif
 634  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 635                   && range && !preg->can_be_null) ? preg->fastmap : NULL;
 636  RE_TRANSLATE_TYPE t = preg->translate;
 637
 638#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 639  memset (&mctx, '\0', sizeof (re_match_context_t));
 640  mctx.dfa = dfa;
 641#endif
 642
 643  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 644  nmatch -= extra_nmatch;
 645
 646  /* Check if the DFA haven't been compiled.  */
 647  if (BE (preg->used == 0 || dfa->init_state == NULL
 648          || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 649          || dfa->init_state_begbuf == NULL, 0))
 650    return REG_NOMATCH;
 651
 652#ifdef DEBUG
 653  /* We assume front-end functions already check them.  */
 654  assert (start + range >= 0 && start + range <= length);
 655#endif
 656
 657  /* If initial states with non-begbuf contexts have no elements,
 658     the regex must be anchored.  If preg->newline_anchor is set,
 659     we'll never use init_state_nl, so do not check it.  */
 660  if (dfa->init_state->nodes.nelem == 0
 661      && dfa->init_state_word->nodes.nelem == 0
 662      && (dfa->init_state_nl->nodes.nelem == 0
 663          || !preg->newline_anchor))
 664    {
 665      if (start != 0 && start + range != 0)
 666        return REG_NOMATCH;
 667      start = range = 0;
 668    }
 669
 670  /* We must check the longest matching, if nmatch > 0.  */
 671  fl_longest_match = (nmatch != 0 || dfa->nbackref);
 672
 673  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 674                            preg->translate, preg->syntax & RE_ICASE, dfa);
 675  if (BE (err != REG_NOERROR, 0))
 676    goto free_return;
 677  mctx.input.stop = stop;
 678  mctx.input.raw_stop = stop;
 679  mctx.input.newline_anchor = preg->newline_anchor;
 680
 681  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 682  if (BE (err != REG_NOERROR, 0))
 683    goto free_return;
 684
 685  /* We will log all the DFA states through which the dfa pass,
 686     if nmatch > 1, or this dfa has "multibyte node", which is a
 687     back-reference or a node which can accept multibyte character or
 688     multi character collating element.  */
 689  if (nmatch > 1 || dfa->has_mb_node)
 690    {
 691      /* Avoid overflow.  */
 692      if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 693        {
 694          err = REG_ESPACE;
 695          goto free_return;
 696        }
 697
 698      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 699      if (BE (mctx.state_log == NULL, 0))
 700        {
 701          err = REG_ESPACE;
 702          goto free_return;
 703        }
 704    }
 705  else
 706    mctx.state_log = NULL;
 707
 708  match_first = start;
 709  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 710                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 711
 712  /* Check incrementally whether of not the input string match.  */
 713  incr = (range < 0) ? -1 : 1;
 714  left_lim = (range < 0) ? start + range : start;
 715  right_lim = (range < 0) ? start : start + range;
 716  sb = dfa->mb_cur_max == 1;
 717  match_kind =
 718    (fastmap
 719     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 720        | (range >= 0 ? 2 : 0)
 721        | (t != NULL ? 1 : 0))
 722     : 8);
 723
 724  for (;; match_first += incr)
 725    {
 726      err = REG_NOMATCH;
 727      if (match_first < left_lim || right_lim < match_first)
 728        goto free_return;
 729
 730      /* Advance as rapidly as possible through the string, until we
 731         find a plausible place to start matching.  This may be done
 732         with varying efficiency, so there are various possibilities:
 733         only the most common of them are specialized, in order to
 734         save on code size.  We use a switch statement for speed.  */
 735      switch (match_kind)
 736        {
 737        case 8:
 738          /* No fastmap.  */
 739          break;
 740
 741        case 7:
 742          /* Fastmap with single-byte translation, match forward.  */
 743          while (BE (match_first < right_lim, 1)
 744                 && !fastmap[t[(unsigned char) string[match_first]]])
 745            ++match_first;
 746          goto forward_match_found_start_or_reached_end;
 747
 748        case 6:
 749          /* Fastmap without translation, match forward.  */
 750          while (BE (match_first < right_lim, 1)
 751                 && !fastmap[(unsigned char) string[match_first]])
 752            ++match_first;
 753
 754        forward_match_found_start_or_reached_end:
 755          if (BE (match_first == right_lim, 0))
 756            {
 757              ch = match_first >= length
 758                       ? 0 : (unsigned char) string[match_first];
 759              if (!fastmap[t ? t[ch] : ch])
 760                goto free_return;
 761            }
 762          break;
 763
 764        case 4:
 765        case 5:
 766          /* Fastmap without multi-byte translation, match backwards.  */
 767          while (match_first >= left_lim)
 768            {
 769              ch = match_first >= length
 770                       ? 0 : (unsigned char) string[match_first];
 771              if (fastmap[t ? t[ch] : ch])
 772                break;
 773              --match_first;
 774            }
 775          if (match_first < left_lim)
 776            goto free_return;
 777          break;
 778
 779        default:
 780          /* In this case, we can't determine easily the current byte,
 781             since it might be a component byte of a multibyte
 782             character.  Then we use the constructed buffer instead.  */
 783          for (;;)
 784            {
 785              /* If MATCH_FIRST is out of the valid range, reconstruct the
 786                 buffers.  */
 787              unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 788              if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 789                {
 790                  err = re_string_reconstruct (&mctx.input, match_first,
 791                                               eflags);
 792                  if (BE (err != REG_NOERROR, 0))
 793                    goto free_return;
 794
 795                  offset = match_first - mctx.input.raw_mbs_idx;
 796                }
 797              /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 798                 Note that MATCH_FIRST must not be smaller than 0.  */
 799              ch = (match_first >= length
 800                    ? 0 : re_string_byte_at (&mctx.input, offset));
 801              if (fastmap[ch])
 802                break;
 803              match_first += incr;
 804              if (match_first < left_lim || match_first > right_lim)
 805                {
 806                  err = REG_NOMATCH;
 807                  goto free_return;
 808                }
 809            }
 810          break;
 811        }
 812
 813      /* Reconstruct the buffers so that the matcher can assume that
 814         the matching starts from the beginning of the buffer.  */
 815      err = re_string_reconstruct (&mctx.input, match_first, eflags);
 816      if (BE (err != REG_NOERROR, 0))
 817        goto free_return;
 818
 819#ifdef RE_ENABLE_I18N
 820     /* Don't consider this char as a possible match start if it part,
 821        yet isn't the head, of a multibyte character.  */
 822      if (!sb && !re_string_first_byte (&mctx.input, 0))
 823        continue;
 824#endif
 825
 826      /* It seems to be appropriate one, then use the matcher.  */
 827      /* We assume that the matching starts from 0.  */
 828      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 829      match_last = check_matching (&mctx, fl_longest_match,
 830                                   range >= 0 ? &match_first : NULL);
 831      if (match_last != -1)
 832        {
 833          if (BE (match_last == -2, 0))
 834            {
 835              err = REG_ESPACE;
 836              goto free_return;
 837            }
 838          else
 839            {
 840              mctx.match_last = match_last;
 841              if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 842                {
 843                  re_dfastate_t *pstate = mctx.state_log[match_last];
 844                  mctx.last_node = check_halt_state_context (&mctx, pstate,
 845                                                             match_last);
 846                }
 847              if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 848                  || dfa->nbackref)
 849                {
 850                  err = prune_impossible_nodes (&mctx);
 851                  if (err == REG_NOERROR)
 852                    break;
 853                  if (BE (err != REG_NOMATCH, 0))
 854                    goto free_return;
 855                  match_last = -1;
 856                }
 857              else
 858                break; /* We found a match.  */
 859            }
 860        }
 861
 862      match_ctx_clean (&mctx);
 863    }
 864
 865#ifdef DEBUG
 866  assert (match_last != -1);
 867  assert (err == REG_NOERROR);
 868#endif
 869
 870  /* Set pmatch[] if we need.  */
 871  if (nmatch > 0)
 872    {
 873      int reg_idx;
 874
 875      /* Initialize registers.  */
 876      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 877        pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 878
 879      /* Set the points where matching start/end.  */
 880      pmatch[0].rm_so = 0;
 881      pmatch[0].rm_eo = mctx.match_last;
 882
 883      if (!preg->no_sub && nmatch > 1)
 884        {
 885          err = set_regs (preg, &mctx, nmatch, pmatch,
 886                          dfa->has_plural_match && dfa->nbackref > 0);
 887          if (BE (err != REG_NOERROR, 0))
 888            goto free_return;
 889        }
 890
 891      /* At last, add the offset to the each registers, since we slided
 892         the buffers so that we could assume that the matching starts
 893         from 0.  */
 894      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 895        if (pmatch[reg_idx].rm_so != -1)
 896          {
 897#ifdef RE_ENABLE_I18N
 898            if (BE (mctx.input.offsets_needed != 0, 0))
 899              {
 900                pmatch[reg_idx].rm_so =
 901                  (pmatch[reg_idx].rm_so == mctx.input.valid_len
 902                   ? mctx.input.valid_raw_len
 903                   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 904                pmatch[reg_idx].rm_eo =
 905                  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 906                   ? mctx.input.valid_raw_len
 907                   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 908              }
 909#else
 910            assert (mctx.input.offsets_needed == 0);
 911#endif
 912            pmatch[reg_idx].rm_so += match_first;
 913            pmatch[reg_idx].rm_eo += match_first;
 914          }
 915      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 916        {
 917          pmatch[nmatch + reg_idx].rm_so = -1;
 918          pmatch[nmatch + reg_idx].rm_eo = -1;
 919        }
 920
 921      if (dfa->subexp_map)
 922        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 923          if (dfa->subexp_map[reg_idx] != reg_idx)
 924            {
 925              pmatch[reg_idx + 1].rm_so
 926                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 927              pmatch[reg_idx + 1].rm_eo
 928                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 929            }
 930    }
 931
 932 free_return:
 933  re_free (mctx.state_log);
 934  if (dfa->nbackref)
 935    match_ctx_free (&mctx);
 936  re_string_destruct (&mctx.input);
 937  return err;
 938}
 939
 940static reg_errcode_t
 941prune_impossible_nodes (re_match_context_t *mctx)
 942{
 943  const re_dfa_t *const dfa = mctx->dfa;
 944  int halt_node, match_last;
 945  reg_errcode_t ret;
 946  re_dfastate_t **sifted_states;
 947  re_dfastate_t **lim_states = NULL;
 948  re_sift_context_t sctx;
 949#ifdef DEBUG
 950  assert (mctx->state_log != NULL);
 951#endif
 952  match_last = mctx->match_last;
 953  halt_node = mctx->last_node;
 954
 955  /* Avoid overflow.  */
 956  if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 957    return REG_ESPACE;
 958
 959  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 960  if (BE (sifted_states == NULL, 0))
 961    {
 962      ret = REG_ESPACE;
 963      goto free_return;
 964    }
 965  if (dfa->nbackref)
 966    {
 967      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 968      if (BE (lim_states == NULL, 0))
 969        {
 970          ret = REG_ESPACE;
 971          goto free_return;
 972        }
 973      while (1)
 974        {
 975          memset (lim_states, '\0',
 976                  sizeof (re_dfastate_t *) * (match_last + 1));
 977          sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 978                         match_last);
 979          ret = sift_states_backward (mctx, &sctx);
 980          re_node_set_free (&sctx.limits);
 981          if (BE (ret != REG_NOERROR, 0))
 982              goto free_return;
 983          if (sifted_states[0] != NULL || lim_states[0] != NULL)
 984            break;
 985          do
 986            {
 987              --match_last;
 988              if (match_last < 0)
 989                {
 990                  ret = REG_NOMATCH;
 991                  goto free_return;
 992                }
 993            } while (mctx->state_log[match_last] == NULL
 994                     || !mctx->state_log[match_last]->halt);
 995          halt_node = check_halt_state_context (mctx,
 996                                                mctx->state_log[match_last],
 997                                                match_last);
 998        }
 999      ret = merge_state_array (dfa, sifted_states, lim_states,
1000                               match_last + 1);
1001      re_free (lim_states);
1002      lim_states = NULL;
1003      if (BE (ret != REG_NOERROR, 0))
1004        goto free_return;
1005    }
1006  else
1007    {
1008      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1009      ret = sift_states_backward (mctx, &sctx);
1010      re_node_set_free (&sctx.limits);
1011      if (BE (ret != REG_NOERROR, 0))
1012        goto free_return;
1013      if (sifted_states[0] == NULL)
1014        {
1015          ret = REG_NOMATCH;
1016          goto free_return;
1017        }
1018    }
1019  re_free (mctx->state_log);
1020  mctx->state_log = sifted_states;
1021  sifted_states = NULL;
1022  mctx->last_node = halt_node;
1023  mctx->match_last = match_last;
1024  ret = REG_NOERROR;
1025 free_return:
1026  re_free (sifted_states);
1027  re_free (lim_states);
1028  return ret;
1029}
1030
1031/* Acquire an initial state and return it.
1032   We must select appropriate initial state depending on the context,
1033   since initial states may have constraints like "\<", "^", etc..  */
1034
1035static inline re_dfastate_t *
1036__attribute ((always_inline)) internal_function
1037acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1038                            int idx)
1039{
1040  const re_dfa_t *const dfa = mctx->dfa;
1041  if (dfa->init_state->has_constraint)
1042    {
1043      unsigned int context;
1044      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1045      if (IS_WORD_CONTEXT (context))
1046        return dfa->init_state_word;
1047      else if (IS_ORDINARY_CONTEXT (context))
1048        return dfa->init_state;
1049      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1050        return dfa->init_state_begbuf;
1051      else if (IS_NEWLINE_CONTEXT (context))
1052        return dfa->init_state_nl;
1053      else if (IS_BEGBUF_CONTEXT (context))
1054        {
1055          /* It is relatively rare case, then calculate on demand.  */
1056          return re_acquire_state_context (err, dfa,
1057                                           dfa->init_state->entrance_nodes,
1058                                           context);
1059        }
1060      else
1061        /* Must not happen?  */
1062        return dfa->init_state;
1063    }
1064  else
1065    return dfa->init_state;
1066}
1067
1068/* Check whether the regular expression match input string INPUT or not,
1069   and return the index where the matching end, return -1 if not match,
1070   or return -2 in case of an error.
1071   FL_LONGEST_MATCH means we want the POSIX longest matching.
1072   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1073   next place where we may want to try matching.
1074   Note that the matcher assume that the maching starts from the current
1075   index of the buffer.  */
1076
1077static int
1078internal_function
1079check_matching (re_match_context_t *mctx, int fl_longest_match,
1080                int *p_match_first)
1081{
1082  const re_dfa_t *const dfa = mctx->dfa;
1083  reg_errcode_t err;
1084  int match = 0;
1085  int match_last = -1;
1086  int cur_str_idx = re_string_cur_idx (&mctx->input);
1087  re_dfastate_t *cur_state;
1088  int at_init_state = p_match_first != NULL;
1089  int next_start_idx = cur_str_idx;
1090
1091  err = REG_NOERROR;
1092  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1093  /* An initial state must not be NULL (invalid).  */
1094  if (BE (cur_state == NULL, 0))
1095    {
1096      assert (err == REG_ESPACE);
1097      return -2;
1098    }
1099
1100  if (mctx->state_log != NULL)
1101    {
1102      mctx->state_log[cur_str_idx] = cur_state;
1103
1104      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1105         later.  E.g. Processing back references.  */
1106      if (BE (dfa->nbackref, 0))
1107        {
1108          at_init_state = 0;
1109          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1110          if (BE (err != REG_NOERROR, 0))
1111            return err;
1112
1113          if (cur_state->has_backref)
1114            {
1115              err = transit_state_bkref (mctx, &cur_state->nodes);
1116              if (BE (err != REG_NOERROR, 0))
1117                return err;
1118            }
1119        }
1120    }
1121
1122  /* If the RE accepts NULL string.  */
1123  if (BE (cur_state->halt, 0))
1124    {
1125      if (!cur_state->has_constraint
1126          || check_halt_state_context (mctx, cur_state, cur_str_idx))
1127        {
1128          if (!fl_longest_match)
1129            return cur_str_idx;
1130          else
1131            {
1132              match_last = cur_str_idx;
1133              match = 1;
1134            }
1135        }
1136    }
1137
1138  while (!re_string_eoi (&mctx->input))
1139    {
1140      re_dfastate_t *old_state = cur_state;
1141      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1142
1143      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1144          || (BE (next_char_idx >= mctx->input.valid_len, 0)
1145              && mctx->input.valid_len < mctx->input.len))
1146        {
1147          err = extend_buffers (mctx);
1148          if (BE (err != REG_NOERROR, 0))
1149            {
1150              assert (err == REG_ESPACE);
1151              return -2;
1152            }
1153        }
1154
1155      cur_state = transit_state (&err, mctx, cur_state);
1156      if (mctx->state_log != NULL)
1157        cur_state = merge_state_with_log (&err, mctx, cur_state);
1158
1159      if (cur_state == NULL)
1160        {
1161          /* Reached the invalid state or an error.  Try to recover a valid
1162             state using the state log, if available and if we have not
1163             already found a valid (even if not the longest) match.  */
1164          if (BE (err != REG_NOERROR, 0))
1165            return -2;
1166
1167          if (mctx->state_log == NULL
1168              || (match && !fl_longest_match)
1169              || (cur_state = find_recover_state (&err, mctx)) == NULL)
1170            break;
1171        }
1172
1173      if (BE (at_init_state, 0))
1174        {
1175          if (old_state == cur_state)
1176            next_start_idx = next_char_idx;
1177          else
1178            at_init_state = 0;
1179        }
1180
1181      if (cur_state->halt)
1182        {
1183          /* Reached a halt state.
1184             Check the halt state can satisfy the current context.  */
1185          if (!cur_state->has_constraint
1186              || check_halt_state_context (mctx, cur_state,
1187                                           re_string_cur_idx (&mctx->input)))
1188            {
1189              /* We found an appropriate halt state.  */
1190              match_last = re_string_cur_idx (&mctx->input);
1191              match = 1;
1192
1193              /* We found a match, do not modify match_first below.  */
1194              p_match_first = NULL;
1195              if (!fl_longest_match)
1196                break;
1197            }
1198        }
1199    }
1200
1201  if (p_match_first)
1202    *p_match_first += next_start_idx;
1203
1204  return match_last;
1205}
1206
1207/* Check NODE match the current context.  */
1208
1209static int
1210internal_function
1211check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1212{
1213  re_token_type_t type = dfa->nodes[node].type;
1214  unsigned int constraint = dfa->nodes[node].constraint;
1215  if (type != END_OF_RE)
1216    return 0;
1217  if (!constraint)
1218    return 1;
1219  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1220    return 0;
1221  return 1;
1222}
1223
1224/* Check the halt state STATE match the current context.
1225   Return 0 if not match, if the node, STATE has, is a halt node and
1226   match the context, return the node.  */
1227
1228static int
1229internal_function
1230check_halt_state_context (const re_match_context_t *mctx,
1231                          const re_dfastate_t *state, int idx)
1232{
1233  int i;
1234  unsigned int context;
1235#ifdef DEBUG
1236  assert (state->halt);
1237#endif
1238  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1239  for (i = 0; i < state->nodes.nelem; ++i)
1240    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1241      return state->nodes.elems[i];
1242  return 0;
1243}
1244
1245/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1246   corresponding to the DFA).
1247   Return the destination node, and update EPS_VIA_NODES, return -1 in case
1248   of errors.  */
1249
1250static int
1251internal_function
1252proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1253                   int *pidx, int node, re_node_set *eps_via_nodes,
1254                   struct re_fail_stack_t *fs)
1255{
1256  const re_dfa_t *const dfa = mctx->dfa;
1257  int i, err;
1258  if (IS_EPSILON_NODE (dfa->nodes[node].type))
1259    {
1260      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1261      re_node_set *edests = &dfa->edests[node];
1262      int dest_node;
1263      err = re_node_set_insert (eps_via_nodes, node);
1264      if (BE (err < 0, 0))
1265        return -2;
1266      /* Pick up a valid destination, or return -1 if none is found.  */
1267      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1268        {
1269          int candidate = edests->elems[i];
1270          if (!re_node_set_contains (cur_nodes, candidate))
1271            continue;
1272          if (dest_node == -1)
1273            dest_node = candidate;
1274
1275          else
1276            {
1277              /* In order to avoid infinite loop like "(a*)*", return the second
1278                 epsilon-transition if the first was already considered.  */
1279              if (re_node_set_contains (eps_via_nodes, dest_node))
1280                return candidate;
1281
1282              /* Otherwise, push the second epsilon-transition on the fail stack.  */
1283              else if (fs != NULL
1284                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1285                                           eps_via_nodes))
1286                return -2;
1287
1288              /* We know we are going to exit.  */
1289              break;
1290            }
1291        }
1292      return dest_node;
1293    }
1294  else
1295    {
1296      int naccepted = 0;
1297      re_token_type_t type = dfa->nodes[node].type;
1298
1299#ifdef RE_ENABLE_I18N
1300      if (dfa->nodes[node].accept_mb)
1301        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1302      else
1303#endif /* RE_ENABLE_I18N */
1304      if (type == OP_BACK_REF)
1305        {
1306          int subexp_idx = dfa->nodes[node].opr.idx + 1;
1307          naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1308          if (fs != NULL)
1309            {
1310              if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1311                return -1;
1312              else if (naccepted)
1313                {
1314                  char *buf = (char *) re_string_get_buffer (&mctx->input);
1315                  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1316                              naccepted) != 0)
1317                    return -1;
1318                }
1319            }
1320
1321          if (naccepted == 0)
1322            {
1323              int dest_node;
1324              err = re_node_set_insert (eps_via_nodes, node);
1325              if (BE (err < 0, 0))
1326                return -2;
1327              dest_node = dfa->edests[node].elems[0];
1328              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1329                                        dest_node))
1330                return dest_node;
1331            }
1332        }
1333
1334      if (naccepted != 0
1335          || check_node_accept (mctx, dfa->nodes + node, *pidx))
1336        {
1337          int dest_node = dfa->nexts[node];
1338          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1339          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1340                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1341                                               dest_node)))
1342            return -1;
1343          re_node_set_empty (eps_via_nodes);
1344          return dest_node;
1345        }
1346    }
1347  return -1;
1348}
1349
1350static reg_errcode_t
1351internal_function
1352push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1353                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1354{
1355  reg_errcode_t err;
1356  int num = fs->num++;
1357  if (fs->num == fs->alloc)
1358    {
1359      struct re_fail_stack_ent_t *new_array;
1360      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1361                                       * fs->alloc * 2));
1362      if (new_array == NULL)
1363        return REG_ESPACE;
1364      fs->alloc *= 2;
1365      fs->stack = new_array;
1366    }
1367  fs->stack[num].idx = str_idx;
1368  fs->stack[num].node = dest_node;
1369  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1370  if (fs->stack[num].regs == NULL)
1371    return REG_ESPACE;
1372  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1373  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1374  return err;
1375}
1376
1377static int
1378internal_function
1379pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1380                regmatch_t *regs, re_node_set *eps_via_nodes)
1381{
1382  int num = --fs->num;
1383  assert (num >= 0);
1384  *pidx = fs->stack[num].idx;
1385  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1386  re_node_set_free (eps_via_nodes);
1387  re_free (fs->stack[num].regs);
1388  *eps_via_nodes = fs->stack[num].eps_via_nodes;
1389  return fs->stack[num].node;
1390}
1391
1392/* Set the positions where the subexpressions are starts/ends to registers
1393   PMATCH.
1394   Note: We assume that pmatch[0] is already set, and
1395   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1396
1397static reg_errcode_t
1398internal_function
1399set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1400          regmatch_t *pmatch, int fl_backtrack)
1401{
1402  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1403  int idx, cur_node;
1404  re_node_set eps_via_nodes;
1405  struct re_fail_stack_t *fs;
1406  struct re_fail_stack_t fs_body = { 0, 2, NULL };
1407  regmatch_t *prev_idx_match;
1408  int prev_idx_match_malloced = 0;
1409
1410#ifdef DEBUG
1411  assert (nmatch > 1);
1412  assert (mctx->state_log != NULL);
1413#endif
1414  if (fl_backtrack)
1415    {
1416      fs = &fs_body;
1417      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1418      if (fs->stack == NULL)
1419        return REG_ESPACE;
1420    }
1421  else
1422    fs = NULL;
1423
1424  cur_node = dfa->init_node;
1425  re_node_set_init_empty (&eps_via_nodes);
1426
1427#ifdef HAVE_ALLOCA
1428  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1429    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1430  else
1431#endif
1432    {
1433      prev_idx_match = re_malloc (regmatch_t, nmatch);
1434      if (prev_idx_match == NULL)
1435        {
1436          free_fail_stack_return (fs);
1437          return REG_ESPACE;
1438        }
1439      prev_idx_match_malloced = 1;
1440    }
1441  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1442
1443  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1444    {
1445      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1446
1447      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1448        {
1449          int reg_idx;
1450          if (fs)
1451            {
1452              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1453                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1454                  break;
1455              if (reg_idx == nmatch)
1456                {
1457                  re_node_set_free (&eps_via_nodes);
1458                  if (prev_idx_match_malloced)
1459                    re_free (prev_idx_match);
1460                  return free_fail_stack_return (fs);
1461                }
1462              cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1463                                         &eps_via_nodes);
1464            }
1465          else
1466            {
1467              re_node_set_free (&eps_via_nodes);
1468              if (prev_idx_match_malloced)
1469                re_free (prev_idx_match);
1470              return REG_NOERROR;
1471            }
1472        }
1473
1474      /* Proceed to next node.  */
1475      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1476                                    &eps_via_nodes, fs);
1477
1478      if (BE (cur_node < 0, 0))
1479        {
1480          if (BE (cur_node == -2, 0))
1481            {
1482              re_node_set_free (&eps_via_nodes);
1483              if (prev_idx_match_malloced)
1484                re_free (prev_idx_match);
1485              free_fail_stack_return (fs);
1486              return REG_ESPACE;
1487            }
1488          if (fs)
1489            cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1490                                       &eps_via_nodes);
1491          else
1492            {
1493              re_node_set_free (&eps_via_nodes);
1494              if (prev_idx_match_malloced)
1495                re_free (prev_idx_match);
1496              return REG_NOMATCH;
1497            }
1498        }
1499    }
1500  re_node_set_free (&eps_via_nodes);
1501  if (prev_idx_match_malloced)
1502    re_free (prev_idx_match);
1503  return free_fail_stack_return (fs);
1504}
1505
1506static reg_errcode_t
1507internal_function
1508free_fail_stack_return (struct re_fail_stack_t *fs)
1509{
1510  if (fs)
1511    {
1512      int fs_idx;
1513      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1514        {
1515          re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1516          re_free (fs->stack[fs_idx].regs);
1517        }
1518      re_free (fs->stack);
1519    }
1520  return REG_NOERROR;
1521}
1522
1523static void
1524internal_function
1525update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1526             regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1527{
1528  int type = dfa->nodes[cur_node].type;
1529  if (type == OP_OPEN_SUBEXP)
1530    {
1531      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1532
1533      /* We are at the first node of this sub expression.  */
1534      if (reg_num < nmatch)
1535        {
1536          pmatch[reg_num].rm_so = cur_idx;
1537          pmatch[reg_num].rm_eo = -1;
1538        }
1539    }
1540  else if (type == OP_CLOSE_SUBEXP)
1541    {
1542      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1543      if (reg_num < nmatch)
1544        {
1545          /* We are at the last node of this sub expression.  */
1546          if (pmatch[reg_num].rm_so < cur_idx)
1547            {
1548              pmatch[reg_num].rm_eo = cur_idx;
1549              /* This is a non-empty match or we are not inside an optional
1550                 subexpression.  Accept this right away.  */
1551              memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1552            }
1553          else
1554            {
1555              if (dfa->nodes[cur_node].opt_subexp
1556                  && prev_idx_match[reg_num].rm_so != -1)
1557                /* We transited through an empty match for an optional
1558                   subexpression, like (a?)*, and this is not the subexp's
1559                   first match.  Copy back the old content of the registers
1560                   so that matches of an inner subexpression are undone as
1561                   well, like in ((a?))*.  */
1562                memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1563              else
1564                /* We completed a subexpression, but it may be part of
1565                   an optional one, so do not update PREV_IDX_MATCH.  */
1566                pmatch[reg_num].rm_eo = cur_idx;
1567            }
1568        }
1569    }
1570}
1571
1572/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1573   and sift the nodes in each states according to the following rules.
1574   Updated state_log will be wrote to STATE_LOG.
1575
1576   Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1577     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1578        If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1579        the LAST_NODE, we throw away the node `a'.
1580     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1581        string `s' and transit to `b':
1582        i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1583           away the node `a'.
1584        ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1585            thrown away, we throw away the node `a'.
1586     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1587        i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1588           node `a'.
1589        ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1590            we throw away the node `a'.  */
1591
1592#define STATE_NODE_CONTAINS(state,node) \
1593  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1594
1595static reg_errcode_t
1596internal_function
1597sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1598{
1599  reg_errcode_t err;
1600  int null_cnt = 0;
1601  int str_idx = sctx->last_str_idx;
1602  re_node_set cur_dest;
1603
1604#ifdef DEBUG
1605  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1606#endif
1607
1608  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1609     transit to the last_node and the last_node itself.  */
1610  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1611  if (BE (err != REG_NOERROR, 0))
1612    return err;
1613  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1614  if (BE (err != REG_NOERROR, 0))
1615    goto free_return;
1616
1617  /* Then check each states in the state_log.  */
1618  while (str_idx > 0)
1619    {
1620      /* Update counters.  */
1621      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1622      if (null_cnt > mctx->max_mb_elem_len)
1623        {
1624          memset (sctx->sifted_states, '\0',
1625                  sizeof (re_dfastate_t *) * str_idx);
1626          re_node_set_free (&cur_dest);
1627          return REG_NOERROR;
1628        }
1629      re_node_set_empty (&cur_dest);
1630      --str_idx;
1631
1632      if (mctx->state_log[str_idx])
1633        {
1634          err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1635          if (BE (err != REG_NOERROR, 0))
1636            goto free_return;
1637        }
1638
1639      /* Add all the nodes which satisfy the following conditions:
1640         - It can epsilon transit to a node in CUR_DEST.
1641         - It is in CUR_SRC.
1642         And update state_log.  */
1643      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1644      if (BE (err != REG_NOERROR, 0))
1645        goto free_return;
1646    }
1647  err = REG_NOERROR;
1648 free_return:
1649  re_node_set_free (&cur_dest);
1650  return err;
1651}
1652
1653static reg_errcode_t
1654internal_function
1655build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1656                     int str_idx, re_node_set *cur_dest)
1657{
1658  const re_dfa_t *const dfa = mctx->dfa;
1659  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1660  int i;
1661
1662  /* Then build the next sifted state.
1663     We build the next sifted state on `cur_dest', and update
1664     `sifted_states[str_idx]' with `cur_dest'.
1665     Note:
1666     `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1667     `cur_src' points the node_set of the old `state_log[str_idx]'
1668     (with the epsilon nodes pre-filtered out).  */
1669  for (i = 0; i < cur_src->nelem; i++)
1670    {
1671      int prev_node = cur_src->elems[i];
1672      int naccepted = 0;
1673      int ret;
1674
1675#ifdef DEBUG
1676      re_token_type_t type = dfa->nodes[prev_node].type;
1677      assert (!IS_EPSILON_NODE (type));
1678#endif
1679#ifdef RE_ENABLE_I18N
1680      /* If the node may accept `multi byte'.  */
1681      if (dfa->nodes[prev_node].accept_mb)
1682        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1683                                         str_idx, sctx->last_str_idx);
1684#endif /* RE_ENABLE_I18N */
1685
1686      /* We don't check backreferences here.
1687         See update_cur_sifted_state().  */
1688      if (!naccepted
1689          && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1690          && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1691                                  dfa->nexts[prev_node]))
1692        naccepted = 1;
1693
1694      if (naccepted == 0)
1695        continue;
1696
1697      if (sctx->limits.nelem)
1698        {
1699          int to_idx = str_idx + naccepted;
1700          if (check_dst_limits (mctx, &sctx->limits,
1701                                dfa->nexts[prev_node], to_idx,
1702                                prev_node, str_idx))
1703            continue;
1704        }
1705      ret = re_node_set_insert (cur_dest, prev_node);
1706      if (BE (ret == -1, 0))
1707        return REG_ESPACE;
1708    }
1709
1710  return REG_NOERROR;
1711}
1712
1713/* Helper functions.  */
1714
1715static reg_errcode_t
1716internal_function
1717clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1718{
1719  int top = mctx->state_log_top;
1720
1721  if (next_state_log_idx >= mctx->input.bufs_len
1722      || (next_state_log_idx >= mctx->input.valid_len
1723          && mctx->input.valid_len < mctx->input.len))
1724    {
1725      reg_errcode_t err;
1726      err = extend_buffers (mctx);
1727      if (BE (err != REG_NOERROR, 0))
1728        return err;
1729    }
1730
1731  if (top < next_state_log_idx)
1732    {
1733      memset (mctx->state_log + top + 1, '\0',
1734              sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1735      mctx->state_log_top = next_state_log_idx;
1736    }
1737  return REG_NOERROR;
1738}
1739
1740static reg_errcode_t
1741internal_function
1742merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1743                   re_dfastate_t **src, int num)
1744{
1745  int st_idx;
1746  reg_errcode_t err;
1747  for (st_idx = 0; st_idx < num; ++st_idx)
1748    {
1749      if (dst[st_idx] == NULL)
1750        dst[st_idx] = src[st_idx];
1751      else if (src[st_idx] != NULL)
1752        {
1753          re_node_set merged_set;
1754          err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1755                                        &src[st_idx]->nodes);
1756          if (BE (err != REG_NOERROR, 0))
1757            return err;
1758          dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1759          re_node_set_free (&merged_set);
1760          if (BE (err != REG_NOERROR, 0))
1761            return err;
1762        }
1763    }
1764  return REG_NOERROR;
1765}
1766
1767static reg_errcode_t
1768internal_function
1769update_cur_sifted_state (const re_match_context_t *mctx,
1770                         re_sift_context_t *sctx, int str_idx,
1771                         re_node_set *dest_nodes)
1772{
1773  const re_dfa_t *const dfa = mctx->dfa;
1774  reg_errcode_t err = REG_NOERROR;
1775  const re_node_set *candidates;
1776  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1777                : &mctx->state_log[str_idx]->nodes);
1778
1779  if (dest_nodes->nelem == 0)
1780    sctx->sifted_states[str_idx] = NULL;
1781  else
1782    {
1783      if (candidates)
1784        {
1785          /* At first, add the nodes which can epsilon transit to a node in
1786             DEST_NODE.  */
1787          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1788          if (BE (err != REG_NOERROR, 0))
1789            return err;
1790
1791          /* Then, check the limitations in the current sift_context.  */
1792          if (sctx->limits.nelem)
1793            {
1794              err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1795                                         mctx->bkref_ents, str_idx);
1796              if (BE (err != REG_NOERROR, 0))
1797                return err;
1798            }
1799        }
1800
1801      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1802      if (BE (err != REG_NOERROR, 0))
1803        return err;
1804    }
1805
1806  if (candidates && mctx->state_log[str_idx]->has_backref)
1807    {
1808      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1809      if (BE (err != REG_NOERROR, 0))
1810        return err;
1811    }
1812  return REG_NOERROR;
1813}
1814
1815static reg_errcode_t
1816internal_function
1817add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1818                       const re_node_set *candidates)
1819{
1820  reg_errcode_t err = REG_NOERROR;
1821  int i;
1822
1823  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1824  if (BE (err != REG_NOERROR, 0))
1825    return err;
1826
1827  if (!state->inveclosure.alloc)
1828    {
1829      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1830      if (BE (err != REG_NOERROR, 0))
1831        return REG_ESPACE;
1832      for (i = 0; i < dest_nodes->nelem; i++)
1833        {
1834          err = re_node_set_merge (&state->inveclosure,
1835                                   dfa->inveclosures + dest_nodes->elems[i]);
1836          if (BE (err != REG_NOERROR, 0))
1837            return REG_ESPACE;
1838        }
1839    }
1840  return re_node_set_add_intersect (dest_nodes, candidates,
1841                                    &state->inveclosure);
1842}
1843
1844static reg_errcode_t
1845internal_function
1846sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1847                       const re_node_set *candidates)
1848{
1849    int ecl_idx;
1850    reg_errcode_t err;
1851    re_node_set *inv_eclosure = dfa->inveclosures + node;
1852    re_node_set except_nodes;
1853    re_node_set_init_empty (&except_nodes);
1854    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1855      {
1856        int cur_node = inv_eclosure->elems[ecl_idx];
1857        if (cur_node == node)
1858          continue;
1859        if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1860          {
1861            int edst1 = dfa->edests[cur_node].elems[0];
1862            int edst2 = ((dfa->edests[cur_node].nelem > 1)
1863                         ? dfa->edests[cur_node].elems[1] : -1);
1864            if ((!re_node_set_contains (inv_eclosure, edst1)
1865                 && re_node_set_contains (dest_nodes, edst1))
1866                || (edst2 > 0
1867                    && !re_node_set_contains (inv_eclosure, edst2)
1868                    && re_node_set_contains (dest_nodes, edst2)))
1869              {
1870                err = re_node_set_add_intersect (&except_nodes, candidates,
1871                                                 dfa->inveclosures + cur_node);
1872                if (BE (err != REG_NOERROR, 0))
1873                  {
1874                    re_node_set_free (&except_nodes);
1875                    return err;
1876                  }
1877              }
1878          }
1879      }
1880    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1881      {
1882        int cur_node = inv_eclosure->elems[ecl_idx];
1883        if (!re_node_set_contains (&except_nodes, cur_node))
1884          {
1885            int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1886            re_node_set_remove_at (dest_nodes, idx);
1887          }
1888      }
1889    re_node_set_free (&except_nodes);
1890    return REG_NOERROR;
1891}
1892
1893static int
1894internal_function
1895check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1896                  int dst_node, int dst_idx, int src_node, int src_idx)
1897{
1898  const re_dfa_t *const dfa = mctx->dfa;
1899  int lim_idx, src_pos, dst_pos;
1900
1901  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1902  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1903  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1904    {
1905      int subexp_idx;
1906      struct re_backref_cache_entry *ent;
1907      ent = mctx->bkref_ents + limits->elems[lim_idx];
1908      subexp_idx = dfa->nodes[ent->node].opr.idx;
1909
1910      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1911                                           subexp_idx, dst_node, dst_idx,
1912                                           dst_bkref_idx);
1913      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1914                                           subexp_idx, src_node, src_idx,
1915                                           src_bkref_idx);
1916
1917      /* In case of:
1918         <src> <dst> ( <subexp> )
1919         ( <subexp> ) <src> <dst>
1920         ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1921      if (src_pos == dst_pos)
1922        continue; /* This is unrelated limitation.  */
1923      else
1924        return 1;
1925    }
1926  return 0;
1927}
1928
1929static int
1930internal_function
1931check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1932                             int subexp_idx, int from_node, int bkref_idx)
1933{
1934  const re_dfa_t *const dfa = mctx->dfa;
1935  const re_node_set *eclosures = dfa->eclosures + from_node;
1936  int node_idx;
1937
1938  /* Else, we are on the boundary: examine the nodes on the epsilon
1939     closure.  */
1940  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1941    {
1942      int node = eclosures->elems[node_idx];
1943      switch (dfa->nodes[node].type)
1944        {
1945        case OP_BACK_REF:
1946          if (bkref_idx != -1)
1947            {
1948              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1949              do
1950                {
1951                  int dst, cpos;
1952
1953                  if (ent->node != node)
1954                    continue;
1955
1956                  if (subexp_idx < BITSET_WORD_BITS
1957                      && !(ent->eps_reachable_subexps_map
1958                           & ((bitset_word_t) 1 << subexp_idx)))
1959                    continue;
1960
1961                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1962                     OP_CLOSE_SUBEXP cases below.  But, if the
1963                     destination node is the same node as the source
1964                     node, don't recurse because it would cause an
1965                     infinite loop: a regex that exhibits this behavior
1966                     is ()\1*\1*  */
1967                  dst = dfa->edests[node].elems[0];
1968                  if (dst == from_node)
1969                    {
1970                      if (boundaries & 1)
1971                        return -1;
1972                      else /* if (boundaries & 2) */
1973                        return 0;
1974                    }
1975
1976                  cpos =
1977                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1978                                                 dst, bkref_idx);
1979                  if (cpos == -1 /* && (boundaries & 1) */)
1980                    return -1;
1981                  if (cpos == 0 && (boundaries & 2))
1982                    return 0;
1983
1984                  if (subexp_idx < BITSET_WORD_BITS)
1985                    ent->eps_reachable_subexps_map
1986                      &= ~((bitset_word_t) 1 << subexp_idx);
1987                }
1988              while (ent++->more);
1989            }
1990          break;
1991
1992        case OP_OPEN_SUBEXP:
1993          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1994            return -1;
1995          break;
1996
1997        case OP_CLOSE_SUBEXP:
1998          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1999            return 0;
2000          break;
2001
2002        default:
2003            break;
2004        }
2005    }
2006
2007  return (boundaries & 2) ? 1 : 0;
2008}
2009
2010static int
2011internal_function
2012check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2013                           int subexp_idx, int from_node, int str_idx,
2014                           int bkref_idx)
2015{
2016  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2017  int boundaries;
2018
2019  /* If we are outside the range of the subexpression, return -1 or 1.  */
2020  if (str_idx < lim->subexp_from)
2021    return -1;
2022
2023  if (lim->subexp_to < str_idx)
2024    return 1;
2025
2026  /* If we are within the subexpression, return 0.  */
2027  boundaries = (str_idx == lim->subexp_from);
2028  boundaries |= (str_idx == lim->subexp_to) << 1;
2029  if (boundaries == 0)
2030    return 0;
2031
2032  /* Else, examine epsilon closure.  */
2033  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2034                                      from_node, bkref_idx);
2035}
2036
2037/* Check the limitations of sub expressions LIMITS, and remove the nodes
2038   which are against limitations from DEST_NODES. */
2039
2040static reg_errcode_t
2041internal_function
2042check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2043                     const re_node_set *candidates, re_node_set *limits,
2044                     struct re_backref_cache_entry *bkref_ents, int str_idx)
2045{
2046  reg_errcode_t err;
2047  int node_idx, lim_idx;
2048
2049  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2050    {
2051      int subexp_idx;
2052      struct re_backref_cache_entry *ent;
2053      ent = bkref_ents + limits->elems[lim_idx];
2054
2055      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2056        continue; /* This is unrelated limitation.  */
2057
2058      subexp_idx = dfa->nodes[ent->node].opr.idx;
2059      if (ent->subexp_to == str_idx)
2060        {
2061          int ops_node = -1;
2062          int cls_node = -1;
2063          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2064            {
2065              int node = dest_nodes->elems[node_idx];
2066              re_token_type_t type = dfa->nodes[node].type;
2067              if (type == OP_OPEN_SUBEXP
2068                  && subexp_idx == dfa->nodes[node].opr.idx)
2069                ops_node = node;
2070              else if (type == OP_CLOSE_SUBEXP
2071                       && subexp_idx == dfa->nodes[node].opr.idx)
2072                cls_node = node;
2073            }
2074
2075          /* Check the limitation of the open subexpression.  */
2076          /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2077          if (ops_node >= 0)
2078            {
2079              err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2080                                           candidates);
2081              if (BE (err != REG_NOERROR, 0))
2082                return err;
2083            }
2084
2085          /* Check the limitation of the close subexpression.  */
2086          if (cls_node >= 0)
2087            for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2088              {
2089                int node = dest_nodes->elems[node_idx];
2090                if (!re_node_set_contains (dfa->inveclosures + node,
2091                                           cls_node)
2092                    && !re_node_set_contains (dfa->eclosures + node,
2093                                              cls_node))
2094                  {
2095                    /* It is against this limitation.
2096                       Remove it form the current sifted state.  */
2097                    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2098                                                 candidates);
2099                    if (BE (err != REG_NOERROR, 0))
2100                      return err;
2101                    --node_idx;
2102                  }
2103              }
2104        }
2105      else /* (ent->subexp_to != str_idx)  */
2106        {
2107          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2108            {
2109              int node = dest_nodes->elems[node_idx];
2110              re_token_type_t type = dfa->nodes[node].type;
2111              if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2112                {
2113                  if (subexp_idx != dfa->nodes[node].opr.idx)
2114                    continue;
2115                  /* It is against this limitation.
2116                     Remove it form the current sifted state.  */
2117                  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2118                                               candidates);
2119                  if (BE (err != REG_NOERROR, 0))
2120                    return err;
2121                }
2122            }
2123        }
2124    }
2125  return REG_NOERROR;
2126}
2127
2128static reg_errcode_t
2129internal_function
2130sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2131                   int str_idx, const re_node_set *candidates)
2132{
2133  const re_dfa_t *const dfa = mctx->dfa;
2134  reg_errcode_t err;
2135  int node_idx, node;
2136  re_sift_context_t local_sctx;
2137  int first_idx = search_cur_bkref_entry (mctx, str_idx);
2138
2139  if (first_idx == -1)
2140    return REG_NOERROR;
2141
2142  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2143
2144  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2145    {
2146      int enabled_idx;
2147      re_token_type_t type;
2148      struct re_backref_cache_entry *entry;
2149      node = candidates->elems[node_idx];
2150      type = dfa->nodes[node].type;
2151      /* Avoid infinite loop for the REs like "()\1+".  */
2152      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2153        continue;
2154      if (type != OP_BACK_REF)
2155        continue;
2156
2157      entry = mctx->bkref_ents + first_idx;
2158      enabled_idx = first_idx;
2159      do
2160        {
2161          int subexp_len;
2162          int to_idx;
2163          int dst_node;
2164          int ret;
2165          re_dfastate_t *cur_state;
2166
2167          if (entry->node != node)
2168            continue;
2169          subexp_len = entry->subexp_to - entry->subexp_from;
2170          to_idx = str_idx + subexp_len;
2171          dst_node = (subexp_len ? dfa->nexts[node]
2172                      : dfa->edests[node].elems[0]);
2173
2174          if (to_idx > sctx->last_str_idx
2175              || sctx->sifted_states[to_idx] == NULL
2176              || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2177              || check_dst_limits (mctx, &sctx->limits, node,
2178                                   str_idx, dst_node, to_idx))
2179            continue;
2180
2181          if (local_sctx.sifted_states == NULL)
2182            {
2183              local_sctx = *sctx;
2184              err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2185              if (BE (err != REG_NOERROR, 0))
2186                goto free_return;
2187            }
2188          local_sctx.last_node = node;
2189          local_sctx.last_str_idx = str_idx;
2190          ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2191          if (BE (ret < 0, 0))
2192            {
2193              err = REG_ESPACE;
2194              goto free_return;
2195            }
2196          cur_state = local_sctx.sifted_states[str_idx];
2197          err = sift_states_backward (mctx, &local_sctx);
2198          if (BE (err != REG_NOERROR, 0))
2199            goto free_return;
2200          if (sctx->limited_states != NULL)
2201            {
2202              err = merge_state_array (dfa, sctx->limited_states,
2203                                       local_sctx.sifted_states,
2204                                       str_idx + 1);
2205              if (BE (err != REG_NOERROR, 0))
2206                goto free_return;
2207            }
2208          local_sctx.sifted_states[str_idx] = cur_state;
2209          re_node_set_remove (&local_sctx.limits, enabled_idx);
2210
2211          /* mctx->bkref_ents may have changed, reload the pointer.  */
2212          entry = mctx->bkref_ents + enabled_idx;
2213        }
2214      while (enabled_idx++, entry++->more);
2215    }
2216  err = REG_NOERROR;
2217 free_return:
2218  if (local_sctx.sifted_states != NULL)
2219    {
2220      re_node_set_free (&local_sctx.limits);
2221    }
2222
2223  return err;
2224}
2225
2226
2227#ifdef RE_ENABLE_I18N
2228static int
2229internal_function
2230sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2231                     int node_idx, int str_idx, int max_str_idx)
2232{
2233  const re_dfa_t *const dfa = mctx->dfa;
2234  int naccepted;
2235  /* Check the node can accept `multi byte'.  */
2236  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2237  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2238      !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2239                            dfa->nexts[node_idx]))
2240    /* The node can't accept the `multi byte', or the
2241       destination was already thrown away, then the node
2242       could't accept the current input `multi byte'.   */
2243    naccepted = 0;
2244  /* Otherwise, it is sure that the node could accept
2245     `naccepted' bytes input.  */
2246  return naccepted;
2247}
2248#endif /* RE_ENABLE_I18N */
2249
2250\f
2251/* Functions for state transition.  */
2252
2253/* Return the next state to which the current state STATE will transit by
2254   accepting the current input byte, and update STATE_LOG if necessary.
2255   If STATE can accept a multibyte char/collating element/back reference
2256   update the destination of STATE_LOG.  */
2257
2258static re_dfastate_t *
2259internal_function
2260transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2261               re_dfastate_t *state)
2262{
2263  re_dfastate_t **trtable;
2264  unsigned char ch;
2265
2266#ifdef RE_ENABLE_I18N
2267  /* If the current state can accept multibyte.  */
2268  if (BE (state->accept_mb, 0))
2269    {
2270      *err = transit_state_mb (mctx, state);
2271      if (BE (*err != REG_NOERROR, 0))
2272        return NULL;
2273    }
2274#endif /* RE_ENABLE_I18N */
2275
2276  /* Then decide the next state with the single byte.  */
2277#if 0
2278  if (0)
2279    /* don't use transition table  */
2280    return transit_state_sb (err, mctx, state);
2281#endif
2282
2283  /* Use transition table  */
2284  ch = re_string_fetch_byte (&mctx->input);
2285  for (;;)
2286    {
2287      trtable = state->trtable;
2288      if (BE (trtable != NULL, 1))
2289        return trtable[ch];
2290
2291      trtable = state->word_trtable;
2292      if (BE (trtable != NULL, 1))
2293        {
2294          unsigned int context;
2295          context
2296            = re_string_context_at (&mctx->input,
2297                                    re_string_cur_idx (&mctx->input) - 1,
2298                                    mctx->eflags);
2299          if (IS_WORD_CONTEXT (context))
2300            return trtable[ch + SBC_MAX];
2301          else
2302            return trtable[ch];
2303        }
2304
2305      if (!build_trtable (mctx->dfa, state))
2306        {
2307          *err = REG_ESPACE;
2308          return NULL;
2309        }
2310
2311      /* Retry, we now have a transition table.  */
2312    }
2313}
2314
2315/* Update the state_log if we need */
2316re_dfastate_t *
2317internal_function
2318merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2319                      re_dfastate_t *next_state)
2320{
2321  const re_dfa_t *const dfa = mctx->dfa;
2322  int cur_idx = re_string_cur_idx (&mctx->input);
2323
2324  if (cur_idx > mctx->state_log_top)
2325    {
2326      mctx->state_log[cur_idx] = next_state;
2327      mctx->state_log_top = cur_idx;
2328    }
2329  else if (mctx->state_log[cur_idx] == 0)
2330    {
2331      mctx->state_log[cur_idx] = next_state;
2332    }
2333  else
2334    {
2335      re_dfastate_t *pstate;
2336      unsigned int context;
2337      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2338      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2339         the destination of a multibyte char/collating element/
2340         back reference.  Then the next state is the union set of
2341         these destinations and the results of the transition table.  */
2342      pstate = mctx->state_log[cur_idx];
2343      log_nodes = pstate->entrance_nodes;
2344      if (next_state != NULL)
2345        {
2346          table_nodes = next_state->entrance_nodes;
2347          *err = re_node_set_init_union (&next_nodes, table_nodes,
2348                                             log_nodes);
2349          if (BE (*err != REG_NOERROR, 0))
2350            return NULL;
2351        }
2352      else
2353        next_nodes = *log_nodes;
2354      /* Note: We already add the nodes of the initial state,
2355         then we don't need to add them here.  */
2356
2357      context = re_string_context_at (&mctx->input,
2358                                      re_string_cur_idx (&mctx->input) - 1,
2359                                      mctx->eflags);
2360      next_state = mctx->state_log[cur_idx]
2361        = re_acquire_state_context (err, dfa, &next_nodes, context);
2362      /* We don't need to check errors here, since the return value of
2363         this function is next_state and ERR is already set.  */
2364
2365      if (table_nodes != NULL)
2366        re_node_set_free (&next_nodes);
2367    }
2368
2369  if (BE (dfa->nbackref, 0) && next_state != NULL)
2370    {
2371      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2372         later.  We must check them here, since the back references in the
2373         next state might use them.  */
2374      *err = check_subexp_matching_top (mctx, &next_state->nodes,
2375                                        cur_idx);
2376      if (BE (*err != REG_NOERROR, 0))
2377        return NULL;
2378
2379      /* If the next state has back references.  */
2380      if (next_state->has_backref)
2381        {
2382          *err = transit_state_bkref (mctx, &next_state->nodes);
2383          if (BE (*err != REG_NOERROR, 0))
2384            return NULL;
2385          next_state = mctx->state_log[cur_idx];
2386        }
2387    }
2388
2389  return next_state;
2390}
2391
2392/* Skip bytes in the input that correspond to part of a
2393   multi-byte match, then look in the log for a state
2394   from which to restart matching.  */
2395re_dfastate_t *
2396internal_function
2397find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2398{
2399  re_dfastate_t *cur_state;
2400  do
2401    {
2402      int max = mctx->state_log_top;
2403      int cur_str_idx = re_string_cur_idx (&mctx->input);
2404
2405      do
2406        {
2407          if (++cur_str_idx > max)
2408            return NULL;
2409          re_string_skip_bytes (&mctx->input, 1);
2410        }
2411      while (mctx->state_log[cur_str_idx] == NULL);
2412
2413      cur_state = merge_state_with_log (err, mctx, NULL);
2414    }
2415  while (*err == REG_NOERROR && cur_state == NULL);
2416  return cur_state;
2417}
2418
2419/* Helper functions for transit_state.  */
2420
2421/* From the node set CUR_NODES, pick up the nodes whose types are
2422   OP_OPEN_SUBEXP and which have corresponding back references in the regular
2423   expression. And register them to use them later for evaluating the
2424   correspoding back references.  */
2425
2426static reg_errcode_t
2427internal_function
2428check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2429                           int str_idx)
2430{
2431  const re_dfa_t *const dfa = mctx->dfa;
2432  int node_idx;
2433  reg_errcode_t err;
2434
2435  /* TODO: This isn't efficient.
2436           Because there might be more than one nodes whose types are
2437           OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2438           nodes.
2439           E.g. RE: (a){2}  */
2440  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2441    {
2442      int node = cur_nodes->elems[node_idx];
2443      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2444          && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2445          && (dfa->used_bkref_map
2446              & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2447        {
2448          err = match_ctx_add_subtop (mctx, node, str_idx);
2449          if (BE (err != REG_NOERROR, 0))
2450            return err;
2451        }
2452    }
2453  return REG_NOERROR;
2454}
2455
2456#if 0
2457/* Return the next state to which the current state STATE will transit by
2458   accepting the current input byte.  */
2459
2460static re_dfastate_t *
2461transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2462                  re_dfastate_t *state)
2463{
2464  const re_dfa_t *const dfa = mctx->dfa;
2465  re_node_set next_nodes;
2466  re_dfastate_t *next_state;
2467  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2468  unsigned int context;
2469
2470  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2471  if (BE (*err != REG_NOERROR, 0))
2472    return NULL;
2473  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2474    {
2475      int cur_node = state->nodes.elems[node_cnt];
2476      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2477        {
2478          *err = re_node_set_merge (&next_nodes,
2479                                    dfa->eclosures + dfa->nexts[cur_node]);
2480          if (BE (*err != REG_NOERROR, 0))
2481            {
2482              re_node_set_free (&next_nodes);
2483              return NULL;
2484            }
2485        }
2486    }
2487  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2488  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2489  /* We don't need to check errors here, since the return value of
2490     this function is next_state and ERR is already set.  */
2491
2492  re_node_set_free (&next_nodes);
2493  re_string_skip_bytes (&mctx->input, 1);
2494  return next_state;
2495}
2496#endif
2497
2498#ifdef RE_ENABLE_I18N
2499static reg_errcode_t
2500internal_function
2501transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2502{
2503  const re_dfa_t *const dfa = mctx->dfa;
2504  reg_errcode_t err;
2505  int i;
2506
2507  for (i = 0; i < pstate->nodes.nelem; ++i)
2508    {
2509      re_node_set dest_nodes, *new_nodes;
2510      int cur_node_idx = pstate->nodes.elems[i];
2511      int naccepted, dest_idx;
2512      unsigned int context;
2513      re_dfastate_t *dest_state;
2514
2515      if (!dfa->nodes[cur_node_idx].accept_mb)
2516        continue;
2517
2518      if (dfa->nodes[cur_node_idx].constraint)
2519        {
2520          context = re_string_context_at (&mctx->input,
2521                                          re_string_cur_idx (&mctx->input),
2522                                          mctx->eflags);
2523          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2524                                           context))
2525            continue;
2526        }
2527
2528      /* How many bytes the node can accept?  */
2529      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2530                                           re_string_cur_idx (&mctx->input));
2531      if (naccepted == 0)
2532        continue;
2533
2534      /* The node can accepts `naccepted' bytes.  */
2535      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2536      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2537                               : mctx->max_mb_elem_len);
2538      err = clean_state_log_if_needed (mctx, dest_idx);
2539      if (BE (err != REG_NOERROR, 0))
2540        return err;
2541#ifdef DEBUG
2542      assert (dfa->nexts[cur_node_idx] != -1);
2543#endif
2544      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2545
2546      dest_state = mctx->state_log[dest_idx];
2547      if (dest_state == NULL)
2548        dest_nodes = *new_nodes;
2549      else
2550        {
2551          err = re_node_set_init_union (&dest_nodes,
2552                                        dest_state->entrance_nodes, new_nodes);
2553          if (BE (err != REG_NOERROR, 0))
2554            return err;
2555        }
2556      context = re_string_context_at (&mctx->input, dest_idx - 1,
2557                                      mctx->eflags);
2558      mctx->state_log[dest_idx]
2559        = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2560      if (dest_state != NULL)
2561        re_node_set_free (&dest_nodes);
2562      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2563        return err;
2564    }
2565  return REG_NOERROR;
2566}
2567#endif /* RE_ENABLE_I18N */
2568
2569static reg_errcode_t
2570internal_function
2571transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2572{
2573  const re_dfa_t *const dfa = mctx->dfa;
2574  reg_errcode_t err;
2575  int i;
2576  int cur_str_idx = re_string_cur_idx (&mctx->input);
2577
2578  for (i = 0; i < nodes->nelem; ++i)
2579    {
2580      int dest_str_idx, prev_nelem, bkc_idx;
2581      int node_idx = nodes->elems[i];
2582      unsigned int context;
2583      const re_token_t *node = dfa->nodes + node_idx;
2584      re_node_set *new_dest_nodes;
2585
2586      /* Check whether `node' is a backreference or not.  */
2587      if (node->type != OP_BACK_REF)
2588        continue;
2589
2590      if (node->constraint)
2591        {
2592          context = re_string_context_at (&mctx->input, cur_str_idx,
2593                                          mctx->eflags);
2594          if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2595            continue;
2596        }
2597
2598      /* `node' is a backreference.
2599         Check the substring which the substring matched.  */
2600      bkc_idx = mctx->nbkref_ents;
2601      err = get_subexp (mctx, node_idx, cur_str_idx);
2602      if (BE (err != REG_NOERROR, 0))
2603        goto free_return;
2604
2605      /* And add the epsilon closures (which is `new_dest_nodes') of
2606         the backreference to appropriate state_log.  */
2607#ifdef DEBUG
2608      assert (dfa->nexts[node_idx] != -1);
2609#endif
2610      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2611        {
2612          int subexp_len;
2613          re_dfastate_t *dest_state;
2614          struct re_backref_cache_entry *bkref_ent;
2615          bkref_ent = mctx->bkref_ents + bkc_idx;
2616          if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2617            continue;
2618          subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2619          new_dest_nodes = (subexp_len == 0
2620                            ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2621                            : dfa->eclosures + dfa->nexts[node_idx]);
2622          dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2623                          - bkref_ent->subexp_from);
2624          context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2625                                          mctx->eflags);
2626          dest_state = mctx->state_log[dest_str_idx];
2627          prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2628                        : mctx->state_log[cur_str_idx]->nodes.nelem);
2629          /* Add `new_dest_node' to state_log.  */
2630          if (dest_state == NULL)
2631            {
2632              mctx->state_log[dest_str_idx]
2633                = re_acquire_state_context (&err, dfa, new_dest_nodes,
2634                                            context);
2635              if (BE (mctx->state_log[dest_str_idx] == NULL
2636                      && err != REG_NOERROR, 0))
2637                goto free_return;
2638            }
2639          else
2640            {
2641              re_node_set dest_nodes;
2642              err = re_node_set_init_union (&dest_nodes,
2643                                            dest_state->entrance_nodes,
2644                                            new_dest_nodes);
2645              if (BE (err != REG_NOERROR, 0))
2646                {
2647                  re_node_set_free (&dest_nodes);
2648                  goto free_return;
2649                }
2650              mctx->state_log[dest_str_idx]
2651                = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2652              re_node_set_free (&dest_nodes);
2653              if (BE (mctx->state_log[dest_str_idx] == NULL
2654                      && err != REG_NOERROR, 0))
2655                goto free_return;
2656            }
2657          /* We need to check recursively if the backreference can epsilon
2658             transit.  */
2659          if (subexp_len == 0
2660              && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2661            {
2662              err = check_subexp_matching_top (mctx, new_dest_nodes,
2663                                               cur_str_idx);
2664              if (BE (err != REG_NOERROR, 0))
2665                goto free_return;
2666              err = transit_state_bkref (mctx, new_dest_nodes);
2667              if (BE (err != REG_NOERROR, 0))
2668                goto free_return;
2669            }
2670        }
2671    }
2672  err = REG_NOERROR;
2673 free_return:
2674  return err;
2675}
2676
2677/* Enumerate all the candidates which the backreference BKREF_NODE can match
2678   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2679   Note that we might collect inappropriate candidates here.
2680   However, the cost of checking them strictly here is too high, then we
2681   delay these checking for prune_impossible_nodes().  */
2682
2683static reg_errcode_t
2684internal_function
2685get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2686{
2687  const re_dfa_t *const dfa = mctx->dfa;
2688  int subexp_num, sub_top_idx;
2689  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2690  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2691  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2692  if (cache_idx != -1)
2693    {
2694      const struct re_backref_cache_entry *entry
2695        = mctx->bkref_ents + cache_idx;
2696      do
2697        if (entry->node == bkref_node)
2698          return REG_NOERROR; /* We already checked it.  */
2699      while (entry++->more);
2700    }
2701
2702  subexp_num = dfa->nodes[bkref_node].opr.idx;
2703
2704  /* For each sub expression  */
2705  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2706    {
2707      reg_errcode_t err;
2708      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2709      re_sub_match_last_t *sub_last;
2710      int sub_last_idx, sl_str, bkref_str_off;
2711
2712      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2713        continue; /* It isn't related.  */
2714
2715      sl_str = sub_top->str_idx;
2716      bkref_str_off = bkref_str_idx;
2717      /* At first, check the last node of sub expressions we already
2718         evaluated.  */
2719      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2720        {
2721          int sl_str_diff;
2722          sub_last = sub_top->lasts[sub_last_idx];
2723          sl_str_diff = sub_last->str_idx - sl_str;
2724          /* The matched string by the sub expression match with the substring
2725             at the back reference?  */
2726          if (sl_str_diff > 0)
2727            {
2728              if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2729                {
2730                  /* Not enough chars for a successful match.  */
2731                  if (bkref_str_off + sl_str_diff > mctx->input.len)
2732                    break;
2733
2734                  err = clean_state_log_if_needed (mctx,
2735                                                   bkref_str_off
2736                                                   + sl_str_diff);
2737                  if (BE (err != REG_NOERROR, 0))
2738                    return err;
2739                  buf = (const char *) re_string_get_buffer (&mctx->input);
2740                }
2741              if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2742                /* We don't need to search this sub expression any more.  */
2743                break;
2744            }
2745          bkref_str_off += sl_str_diff;
2746          sl_str += sl_str_diff;
2747          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2748                                bkref_str_idx);
2749
2750          /* Reload buf, since the preceding call might have reallocated
2751             the buffer.  */
2752          buf = (const char *) re_string_get_buffer (&mctx->input);
2753
2754          if (err == REG_NOMATCH)
2755            continue;
2756          if (BE (err != REG_NOERROR, 0))
2757            return err;
2758        }
2759
2760      if (sub_last_idx < sub_top->nlasts)
2761        continue;
2762      if (sub_last_idx > 0)
2763        ++sl_str;
2764      /* Then, search for the other last nodes of the sub expression.  */
2765      for (; sl_str <= bkref_str_idx; ++sl_str)
2766        {
2767          int cls_node, sl_str_off;
2768          const re_node_set *nodes;
2769          sl_str_off = sl_str - sub_top->str_idx;
2770          /* The matched string by the sub expression match with the substring
2771             at the back reference?  */
2772          if (sl_str_off > 0)
2773            {
2774              if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2775                {
2776                  /* If we are at the end of the input, we cannot match.  */
2777                  if (bkref_str_off >= mctx->input.len)
2778                    break;
2779
2780                  err = extend_buffers (mctx);
2781                  if (BE (err != REG_NOERROR, 0))
2782                    return err;
2783
2784                  buf = (const char *) re_string_get_buffer (&mctx->input);
2785                }
2786              if (buf [bkref_str_off++] != buf[sl_str - 1])
2787                break; /* We don't need to search this sub expression
2788                          any more.  */
2789            }
2790          if (mctx->state_log[sl_str] == NULL)
2791            continue;
2792          /* Does this state have a ')' of the sub expression?  */
2793          nodes = &mctx->state_log[sl_str]->nodes;
2794          cls_node = find_subexp_node (dfa, nodes, subexp_num,
2795                                       OP_CLOSE_SUBEXP);
2796          if (cls_node == -1)
2797            continue; /* No.  */
2798          if (sub_top->path == NULL)
2799            {
2800              sub_top->path = calloc (sizeof (state_array_t),
2801                                      sl_str - sub_top->str_idx + 1);
2802              if (sub_top->path == NULL)
2803                return REG_ESPACE;
2804            }
2805          /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2806             in the current context?  */
2807          err = check_arrival (mctx, sub_top->path, sub_top->node,
2808                               sub_top->str_idx, cls_node, sl_str,
2809                               OP_CLOSE_SUBEXP);
2810          if (err == REG_NOMATCH)
2811              continue;
2812          if (BE (err != REG_NOERROR, 0))
2813              return err;
2814          sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2815          if (BE (sub_last == NULL, 0))
2816            return REG_ESPACE;
2817          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2818                                bkref_str_idx);
2819          if (err == REG_NOMATCH)
2820            continue;
2821        }
2822    }
2823  return REG_NOERROR;
2824}
2825
2826/* Helper functions for get_subexp().  */
2827
2828/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2829   If it can arrive, register the sub expression expressed with SUB_TOP
2830   and SUB_LAST.  */
2831
2832static reg_errcode_t
2833internal_function
2834get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2835                re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2836{
2837  reg_errcode_t err;
2838  int to_idx;
2839  /* Can the subexpression arrive the back reference?  */
2840  err = check_arrival (mctx, &sub_last->path, sub_last->node,
2841                       sub_last->str_idx, bkref_node, bkref_str,
2842                       OP_OPEN_SUBEXP);
2843  if (err != REG_NOERROR)
2844    return err;
2845  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2846                             sub_last->str_idx);
2847  if (BE (err != REG_NOERROR, 0))
2848    return err;
2849  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2850  return clean_state_log_if_needed (mctx, to_idx);
2851}
2852
2853/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2854   Search '(' if FL_OPEN, or search ')' otherwise.
2855   TODO: This function isn't efficient...
2856         Because there might be more than one nodes whose types are
2857         OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2858         nodes.
2859         E.g. RE: (a){2}  */
2860
2861static int
2862internal_function
2863find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2864                  int subexp_idx, int type)
2865{
2866  int cls_idx;
2867  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2868    {
2869      int cls_node = nodes->elems[cls_idx];
2870      const re_token_t *node = dfa->nodes + cls_node;
2871      if (node->type == type
2872          && node->opr.idx == subexp_idx)
2873        return cls_node;
2874    }
2875  return -1;
2876}
2877
2878/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2879   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2880   heavily reused.
2881   Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2882
2883static reg_errcode_t
2884internal_function
2885check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2886               int top_str, int last_node, int last_str, int type)
2887{
2888  const re_dfa_t *const dfa = mctx->dfa;
2889  reg_errcode_t err = REG_NOERROR;
2890  int subexp_num, backup_cur_idx, str_idx, null_cnt;
2891  re_dfastate_t *cur_state = NULL;
2892  re_node_set *cur_nodes, next_nodes;
2893  re_dfastate_t **backup_state_log;
2894  unsigned int context;
2895
2896  subexp_num = dfa->nodes[top_node].opr.idx;
2897  /* Extend the buffer if we need.  */
2898  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2899    {
2900      re_dfastate_t **new_array;
2901      int old_alloc = path->alloc;
2902      path->alloc += last_str + mctx->max_mb_elem_len + 1;
2903      new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2904      if (BE (new_array == NULL, 0))
2905        {
2906          path->alloc = old_alloc;
2907          return REG_ESPACE;
2908        }
2909      path->array = new_array;
2910      memset (new_array + old_alloc, '\0',
2911              sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2912    }
2913
2914  str_idx = path->next_idx ? path->next_idx : top_str;
2915
2916  /* Temporary modify MCTX.  */
2917  backup_state_log = mctx->state_log;
2918  backup_cur_idx = mctx->input.cur_idx;
2919  mctx->state_log = path->array;
2920  mctx->input.cur_idx = str_idx;
2921
2922  /* Setup initial node set.  */
2923  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2924  if (str_idx == top_str)
2925    {
2926      err = re_node_set_init_1 (&next_nodes, top_node);
2927      if (BE (err != REG_NOERROR, 0))
2928        return err;
2929      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2930      if (BE (err != REG_NOERROR, 0))
2931        {
2932          re_node_set_free (&next_nodes);
2933          return err;
2934        }
2935    }
2936  else
2937    {
2938      cur_state = mctx->state_log[str_idx];
2939      if (cur_state && cur_state->has_backref)
2940        {
2941          err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2942          if (BE (err != REG_NOERROR, 0))
2943            return err;
2944        }
2945      else
2946        re_node_set_init_empty (&next_nodes);
2947    }
2948  if (str_idx == top_str || (cur_state && cur_state->has_backref))
2949    {
2950      if (next_nodes.nelem)
2951        {
2952          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2953                                    subexp_num, type);
2954          if (BE (err != REG_NOERROR, 0))
2955            {
2956              re_node_set_free (&next_nodes);
2957              return err;
2958            }
2959        }
2960      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2961      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2962        {
2963          re_node_set_free (&next_nodes);
2964          return err;
2965        }
2966      mctx->state_log[str_idx] = cur_state;
2967    }
2968
2969  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2970    {
2971      re_node_set_empty (&next_nodes);
2972      if (mctx->state_log[str_idx + 1])
2973        {
2974          err = re_node_set_merge (&next_nodes,
2975                                   &mctx->state_log[str_idx + 1]->nodes);
2976          if (BE (err != REG_NOERROR, 0))
2977            {
2978              re_node_set_free (&next_nodes);
2979              return err;
2980            }
2981        }
2982      if (cur_state)
2983        {
2984          err = check_arrival_add_next_nodes (mctx, str_idx,
2985                                              &cur_state->non_eps_nodes,
2986                                              &next_nodes);
2987          if (BE (err != REG_NOERROR, 0))
2988            {
2989              re_node_set_free (&next_nodes);
2990              return err;
2991            }
2992        }
2993      ++str_idx;
2994      if (next_nodes.nelem)
2995        {
2996          err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2997          if (BE (err != REG_NOERROR, 0))
2998            {
2999              re_node_set_free (&next_nodes);
3000              return err;
3001            }
3002          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3003                                    subexp_num, type);
3004          if (BE (err != REG_NOERROR, 0))
3005            {
3006              re_node_set_free (&next_nodes);
3007              return err;
3008            }
3009        }
3010      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3011      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3012      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3013        {
3014          re_node_set_free (&next_nodes);
3015          return err;
3016        }
3017      mctx->state_log[str_idx] = cur_state;
3018      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3019    }
3020  re_node_set_free (&next_nodes);
3021  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3022               : &mctx->state_log[last_str]->nodes);
3023  path->next_idx = str_idx;
3024
3025  /* Fix MCTX.  */
3026  mctx->state_log = backup_state_log;
3027  mctx->input.cur_idx = backup_cur_idx;
3028
3029  /* Then check the current node set has the node LAST_NODE.  */
3030  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3031    return REG_NOERROR;
3032
3033  return REG_NOMATCH;
3034}
3035
3036/* Helper functions for check_arrival.  */
3037
3038/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3039   to NEXT_NODES.
3040   TODO: This function is similar to the functions transit_state*(),
3041         however this function has many additional works.
3042         Can't we unify them?  */
3043
3044static reg_errcode_t
3045internal_function
3046check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3047                              re_node_set *cur_nodes, re_node_set *next_nodes)
3048{
3049  const re_dfa_t *const dfa = mctx->dfa;
3050  int result;
3051  int cur_idx;
3052#ifdef RE_ENABLE_I18N
3053  reg_errcode_t err = REG_NOERROR;
3054#endif
3055  re_node_set union_set;
3056  re_node_set_init_empty (&union_set);
3057  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3058    {
3059      int naccepted = 0;
3060      int cur_node = cur_nodes->elems[cur_idx];
3061#ifdef DEBUG
3062      re_token_type_t type = dfa->nodes[cur_node].type;
3063      assert (!IS_EPSILON_NODE (type));
3064#endif
3065#ifdef RE_ENABLE_I18N
3066      /* If the node may accept `multi byte'.  */
3067      if (dfa->nodes[cur_node].accept_mb)
3068        {
3069          naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3070                                               str_idx);
3071          if (naccepted > 1)
3072            {
3073              re_dfastate_t *dest_state;
3074              int next_node = dfa->nexts[cur_node];
3075              int next_idx = str_idx + naccepted;
3076              dest_state = mctx->state_log[next_idx];
3077              re_node_set_empty (&union_set);
3078              if (dest_state)
3079                {
3080                  err = re_node_set_merge (&union_set, &dest_state->nodes);
3081                  if (BE (err != REG_NOERROR, 0))
3082                    {
3083                      re_node_set_free (&union_set);
3084                      return err;
3085                    }
3086                }
3087              result = re_node_set_insert (&union_set, next_node);
3088              if (BE (result < 0, 0))
3089                {
3090                  re_node_set_free (&union_set);
3091                  return REG_ESPACE;
3092                }
3093              mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3094                                                            &union_set);
3095              if (BE (mctx->state_log[next_idx] == NULL
3096                      && err != REG_NOERROR, 0))
3097                {
3098                  re_node_set_free (&union_set);
3099                  return err;
3100                }
3101            }
3102        }
3103#endif /* RE_ENABLE_I18N */
3104      if (naccepted
3105          || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3106        {
3107          result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3108          if (BE (result < 0, 0))
3109            {
3110              re_node_set_free (&union_set);
3111              return REG_ESPACE;
3112            }
3113        }
3114    }
3115  re_node_set_free (&union_set);
3116  return REG_NOERROR;
3117}
3118
3119/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3120   CUR_NODES, however exclude the nodes which are:
3121    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3122    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3123*/
3124
3125static reg_errcode_t
3126internal_function
3127check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3128                          int ex_subexp, int type)
3129{
3130  reg_errcode_t err;
3131  int idx, outside_node;
3132  re_node_set new_nodes;
3133#ifdef DEBUG
3134  assert (cur_nodes->nelem);
3135#endif
3136  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3137  if (BE (err != REG_NOERROR, 0))
3138    return err;
3139  /* Create a new node set NEW_NODES with the nodes which are epsilon
3140     closures of the node in CUR_NODES.  */
3141
3142  for (idx = 0; idx < cur_nodes->nelem; ++idx)
3143    {
3144      int cur_node = cur_nodes->elems[idx];
3145      const re_node_set *eclosure = dfa->eclosures + cur_node;
3146      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3147      if (outside_node == -1)
3148        {
3149          /* There are no problematic nodes, just merge them.  */
3150          err = re_node_set_merge (&new_nodes, eclosure);
3151          if (BE (err != REG_NOERROR, 0))
3152            {
3153              re_node_set_free (&new_nodes);
3154              return err;
3155            }
3156        }
3157      else
3158        {
3159          /* There are problematic nodes, re-calculate incrementally.  */
3160          err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3161                                              ex_subexp, type);
3162          if (BE (err != REG_NOERROR, 0))
3163            {
3164              re_node_set_free (&new_nodes);
3165              return err;
3166            }
3167        }
3168    }
3169  re_node_set_free (cur_nodes);
3170  *cur_nodes = new_nodes;
3171  return REG_NOERROR;
3172}
3173
3174/* Helper function for check_arrival_expand_ecl.
3175   Check incrementally the epsilon closure of TARGET, and if it isn't
3176   problematic append it to DST_NODES.  */
3177
3178static reg_errcode_t
3179internal_function
3180check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3181                              int target, int ex_subexp, int type)
3182{
3183  int cur_node;
3184  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3185    {
3186      int err;
3187
3188      if (dfa->nodes[cur_node].type == type
3189          && dfa->nodes[cur_node].opr.idx == ex_subexp)
3190        {
3191          if (type == OP_CLOSE_SUBEXP)
3192            {
3193              err = re_node_set_insert (dst_nodes, cur_node);
3194              if (BE (err == -1, 0))
3195                return REG_ESPACE;
3196            }
3197          break;
3198        }
3199      err = re_node_set_insert (dst_nodes, cur_node);
3200      if (BE (err == -1, 0))
3201        return REG_ESPACE;
3202      if (dfa->edests[cur_node].nelem == 0)
3203        break;
3204      if (dfa->edests[cur_node].nelem == 2)
3205        {
3206          err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3207                                              dfa->edests[cur_node].elems[1],
3208                                              ex_subexp, type);
3209          if (BE (err != REG_NOERROR, 0))
3210            return err;
3211        }
3212      cur_node = dfa->edests[cur_node].elems[0];
3213    }
3214  return REG_NOERROR;
3215}
3216
3217
3218/* For all the back references in the current state, calculate the
3219   destination of the back references by the appropriate entry
3220   in MCTX->BKREF_ENTS.  */
3221
3222static reg_errcode_t
3223internal_function
3224expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3225                    int cur_str, int subexp_num, int type)
3226{
3227  const re_dfa_t *const dfa = mctx->dfa;
3228  reg_errcode_t err;
3229  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3230  struct re_backref_cache_entry *ent;
3231
3232  if (cache_idx_start == -1)
3233    return REG_NOERROR;
3234
3235 restart:
3236  ent = mctx->bkref_ents + cache_idx_start;
3237  do
3238    {
3239      int to_idx, next_node;
3240
3241      /* Is this entry ENT is appropriate?  */
3242      if (!re_node_set_contains (cur_nodes, ent->node))
3243        continue; /* No.  */
3244
3245      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3246      /* Calculate the destination of the back reference, and append it
3247         to MCTX->STATE_LOG.  */
3248      if (to_idx == cur_str)
3249        {
3250          /* The backreference did epsilon transit, we must re-check all the
3251             node in the current state.  */
3252          re_node_set new_dests;
3253          reg_errcode_t err2, err3;
3254          next_node = dfa->edests[ent->node].elems[0];
3255          if (re_node_set_contains (cur_nodes, next_node))
3256            continue;
3257          err = re_node_set_init_1 (&new_dests, next_node);
3258          err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3259          err3 = re_node_set_merge (cur_nodes, &new_dests);
3260          re_node_set_free (&new_dests);
3261          if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3262                  || err3 != REG_NOERROR, 0))
3263            {
3264              err = (err != REG_NOERROR ? err
3265                     : (err2 != REG_NOERROR ? err2 : err3));
3266              return err;
3267            }
3268          /* TODO: It is still inefficient...  */
3269          goto restart;
3270        }
3271      else
3272        {
3273          re_node_set union_set;
3274          next_node = dfa->nexts[ent->node];
3275          if (mctx->state_log[to_idx])
3276            {
3277              int ret;
3278              if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3279                                        next_node))
3280                continue;
3281              err = re_node_set_init_copy (&union_set,
3282                                           &mctx->state_log[to_idx]->nodes);
3283              ret = re_node_set_insert (&union_set, next_node);
3284              if (BE (err != REG_NOERROR || ret < 0, 0))
3285                {
3286                  re_node_set_free (&union_set);
3287                  err = err != REG_NOERROR ? err : REG_ESPACE;
3288                  return err;
3289                }
3290            }
3291          else
3292            {
3293              err = re_node_set_init_1 (&union_set, next_node);
3294              if (BE (err != REG_NOERROR, 0))
3295                return err;
3296            }
3297          mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3298          re_node_set_free (&union_set);
3299          if (BE (mctx->state_log[to_idx] == NULL
3300                  && err != REG_NOERROR, 0))
3301            return err;
3302        }
3303    }
3304  while (ent++->more);
3305  return REG_NOERROR;
3306}
3307
3308/* Build transition table for the state.
3309   Return 1 if succeeded, otherwise return NULL.  */
3310
3311static int
3312internal_function
3313build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3314{
3315  reg_errcode_t err;
3316  int i, j, ch, need_word_trtable = 0;
3317  bitset_word_t elem, mask;
3318  bool dests_node_malloced = false;
3319  bool dest_states_malloced = false;
3320  int ndests; /* Number of the destination states from `state'.  */
3321  re_dfastate_t **trtable;
3322  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3323  re_node_set follows, *dests_node;
3324  bitset_t *dests_ch;
3325  bitset_t acceptable;
3326
3327  struct dests_alloc
3328  {
3329    re_node_set dests_node[SBC_MAX];
3330    bitset_t dests_ch[SBC_MAX];
3331  } *dests_alloc;
3332
3333  /* We build DFA states which corresponds to the destination nodes
3334     from `state'.  `dests_node[i]' represents the nodes which i-th
3335     destination state contains, and `dests_ch[i]' represents the
3336     characters which i-th destination state accepts.  */
3337#ifdef HAVE_ALLOCA
3338  if (__libc_use_alloca (sizeof (struct dests_alloc)))
3339    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3340  else
3341#endif
3342    {
3343      dests_alloc = re_malloc (struct dests_alloc, 1);
3344      if (BE (dests_alloc == NULL, 0))
3345        return 0;
3346      dests_node_malloced = true;
3347    }
3348  dests_node = dests_alloc->dests_node;
3349  dests_ch = dests_alloc->dests_ch;
3350
3351  /* Initialize transiton table.  */
3352  state->word_trtable = state->trtable = NULL;
3353
3354  /* At first, group all nodes belonging to `state' into several
3355     destinations.  */
3356  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3357  if (BE (ndests <= 0, 0))
3358    {
3359      if (dests_node_malloced)
3360        free (dests_alloc);
3361      /* Return 0 in case of an error, 1 otherwise.  */
3362      if (ndests == 0)
3363        {
3364          state->trtable = (re_dfastate_t **)
3365            calloc (sizeof (re_dfastate_t *), SBC_MAX);
3366          return 1;
3367        }
3368      return 0;
3369    }
3370
3371  err = re_node_set_alloc (&follows, ndests + 1);
3372  if (BE (err != REG_NOERROR, 0))
3373    goto out_free;
3374
3375  /* Avoid arithmetic overflow in size calculation.  */
3376  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3377            / (3 * sizeof (re_dfastate_t *)))
3378           < ndests),
3379          0))
3380    goto out_free;
3381
3382#ifdef HAVE_ALLOCA
3383  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3384                         + ndests * 3 * sizeof (re_dfastate_t *)))
3385    dest_states = (re_dfastate_t **)
3386      alloca (ndests * 3 * sizeof (re_dfastate_t *));
3387  else
3388#endif
3389    {
3390      dest_states = (re_dfastate_t **)
3391        malloc (ndests * 3 * sizeof (re_dfastate_t *));
3392      if (BE (dest_states == NULL, 0))
3393        {
3394out_free:
3395          if (dest_states_malloced)
3396            free (dest_states);
3397          re_node_set_free (&follows);
3398          for (i = 0; i < ndests; ++i)
3399            re_node_set_free (dests_node + i);
3400          if (dests_node_malloced)
3401            free (dests_alloc);
3402          return 0;
3403        }
3404      dest_states_malloced = true;
3405    }
3406  dest_states_word = dest_states + ndests;
3407  dest_states_nl = dest_states_word + ndests;
3408  bitset_empty (acceptable);
3409
3410  /* Then build the states for all destinations.  */
3411  for (i = 0; i < ndests; ++i)
3412    {
3413      int next_node;
3414      re_node_set_empty (&follows);
3415      /* Merge the follows of this destination states.  */
3416      for (j = 0; j < dests_node[i].nelem; ++j)
3417        {
3418          next_node = dfa->nexts[dests_node[i].elems[j]];
3419          if (next_node != -1)
3420            {
3421              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3422              if (BE (err != REG_NOERROR, 0))
3423                goto out_free;
3424            }
3425        }
3426      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3427      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3428        goto out_free;
3429      /* If the new state has context constraint,
3430         build appropriate states for these contexts.  */
3431      if (dest_states[i]->has_constraint)
3432        {
3433          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3434                                                          CONTEXT_WORD);
3435          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3436            goto out_free;
3437
3438          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3439            need_word_trtable = 1;
3440
3441          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3442                                                        CONTEXT_NEWLINE);
3443          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3444            goto out_free;
3445        }
3446      else
3447        {
3448          dest_states_word[i] = dest_states[i];
3449          dest_states_nl[i] = dest_states[i];
3450        }
3451      bitset_merge (acceptable, dests_ch[i]);
3452    }
3453
3454  if (!BE (need_word_trtable, 0))
3455    {
3456      /* We don't care about whether the following character is a word
3457         character, or we are in a single-byte character set so we can
3458         discern by looking at the character code: allocate a
3459         256-entry transition table.  */
3460      trtable = state->trtable =
3461        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3462      if (BE (trtable == NULL, 0))
3463        goto out_free;
3464
3465      /* For all characters ch...:  */
3466      for (i = 0; i < BITSET_WORDS; ++i)
3467        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3468             elem;
3469             mask <<= 1, elem >>= 1, ++ch)
3470          if (BE (elem & 1, 0))
3471            {
3472              /* There must be exactly one destination which accepts
3473                 character ch.  See group_nodes_into_DFAstates.  */
3474              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3475                ;
3476
3477              /* j-th destination accepts the word character ch.  */
3478              if (dfa->word_char[i] & mask)
3479                trtable[ch] = dest_states_word[j];
3480              else
3481                trtable[ch] = dest_states[j];
3482            }
3483    }
3484  else
3485    {
3486      /* We care about whether the following character is a word
3487         character, and we are in a multi-byte character set: discern
3488         by looking at the character code: build two 256-entry
3489         transition tables, one starting at trtable[0] and one
3490         starting at trtable[SBC_MAX].  */
3491      trtable = state->word_trtable =
3492        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3493      if (BE (trtable == NULL, 0))
3494        goto out_free;
3495
3496      /* For all characters ch...:  */
3497      for (i = 0; i < BITSET_WORDS; ++i)
3498        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3499             elem;
3500             mask <<= 1, elem >>= 1, ++ch)
3501          if (BE (elem & 1, 0))
3502            {
3503              /* There must be exactly one destination which accepts
3504                 character ch.  See group_nodes_into_DFAstates.  */
3505              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3506                ;
3507
3508              /* j-th destination accepts the word character ch.  */
3509              trtable[ch] = dest_states[j];
3510              trtable[ch + SBC_MAX] = dest_states_word[j];
3511            }
3512    }
3513
3514  /* new line */
3515  if (bitset_contain (acceptable, NEWLINE_CHAR))
3516    {
3517      /* The current state accepts newline character.  */
3518      for (j = 0; j < ndests; ++j)
3519        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3520          {
3521            /* k-th destination accepts newline character.  */
3522            trtable[NEWLINE_CHAR] = dest_states_nl[j];
3523            if (need_word_trtable)
3524              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3525            /* There must be only one destination which accepts
3526               newline.  See group_nodes_into_DFAstates.  */
3527            break;
3528          }
3529    }
3530
3531  if (dest_states_malloced)
3532    free (dest_states);
3533
3534  re_node_set_free (&follows);
3535  for (i = 0; i < ndests; ++i)
3536    re_node_set_free (dests_node + i);
3537
3538  if (dests_node_malloced)
3539    free (dests_alloc);
3540
3541  return 1;
3542}
3543
3544/* Group all nodes belonging to STATE into several destinations.
3545   Then for all destinations, set the nodes belonging to the destination
3546   to DESTS_NODE[i] and set the characters accepted by the destination
3547   to DEST_CH[i].  This function return the number of destinations.  */
3548
3549static int
3550internal_function
3551group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3552                            re_node_set *dests_node, bitset_t *dests_ch)
3553{
3554  reg_errcode_t err;
3555  int result;
3556  int i, j, k;
3557  int ndests; /* Number of the destinations from `state'.  */
3558  bitset_t accepts; /* Characters a node can accept.  */
3559  const re_node_set *cur_nodes = &state->nodes;
3560  bitset_empty (accepts);
3561  ndests = 0;
3562
3563  /* For all the nodes belonging to `state',  */
3564  for (i = 0; i < cur_nodes->nelem; ++i)
3565    {
3566      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3567      re_token_type_t type = node->type;
3568      unsigned int constraint = node->constraint;
3569
3570      /* Enumerate all single byte character this node can accept.  */
3571      if (type == CHARACTER)
3572        bitset_set (accepts, node->opr.c);
3573      else if (type == SIMPLE_BRACKET)
3574        {
3575          bitset_merge (accepts, node->opr.sbcset);
3576        }
3577      else if (type == OP_PERIOD)
3578        {
3579#ifdef RE_ENABLE_I18N
3580          if (dfa->mb_cur_max > 1)
3581            bitset_merge (accepts, dfa->sb_char);
3582          else
3583#endif
3584            bitset_set_all (accepts);
3585          if (!(dfa->syntax & RE_DOT_NEWLINE))
3586            bitset_clear (accepts, '\n');
3587          if (dfa->syntax & RE_DOT_NOT_NULL)
3588            bitset_clear (accepts, '\0');
3589        }
3590#ifdef RE_ENABLE_I18N
3591      else if (type == OP_UTF8_PERIOD)
3592        {
3593          memset (accepts, '\xff', sizeof (bitset_t) / 2);
3594          if (!(dfa->syntax & RE_DOT_NEWLINE))
3595            bitset_clear (accepts, '\n');
3596          if (dfa->syntax & RE_DOT_NOT_NULL)
3597            bitset_clear (accepts, '\0');
3598        }
3599#endif
3600      else
3601        continue;
3602
3603      /* Check the `accepts' and sift the characters which are not
3604         match it the context.  */
3605      if (constraint)
3606        {
3607          if (constraint & NEXT_NEWLINE_CONSTRAINT)
3608            {
3609              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3610              bitset_empty (accepts);
3611              if (accepts_newline)
3612                bitset_set (accepts, NEWLINE_CHAR);
3613              else
3614                continue;
3615            }
3616          if (constraint & NEXT_ENDBUF_CONSTRAINT)
3617            {
3618              bitset_empty (accepts);
3619              continue;
3620            }
3621
3622          if (constraint & NEXT_WORD_CONSTRAINT)
3623            {
3624              bitset_word_t any_set = 0;
3625              if (type == CHARACTER && !node->word_char)
3626                {
3627                  bitset_empty (accepts);
3628                  continue;
3629                }
3630#ifdef RE_ENABLE_I18N
3631              if (dfa->mb_cur_max > 1)
3632                for (j = 0; j < BITSET_WORDS; ++j)
3633                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3634              else
3635#endif
3636                for (j = 0; j < BITSET_WORDS; ++j)
3637                  any_set |= (accepts[j] &= dfa->word_char[j]);
3638              if (!any_set)
3639                continue;
3640            }
3641          if (constraint & NEXT_NOTWORD_CONSTRAINT)
3642            {
3643              bitset_word_t any_set = 0;
3644              if (type == CHARACTER && node->word_char)
3645                {
3646                  bitset_empty (accepts);
3647                  continue;
3648                }
3649#ifdef RE_ENABLE_I18N
3650              if (dfa->mb_cur_max > 1)
3651                for (j = 0; j < BITSET_WORDS; ++j)
3652                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3653              else
3654#endif
3655                for (j = 0; j < BITSET_WORDS; ++j)
3656                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3657              if (!any_set)
3658                continue;
3659            }
3660        }
3661
3662      /* Then divide `accepts' into DFA states, or create a new
3663         state.  Above, we make sure that accepts is not empty.  */
3664      for (j = 0; j < ndests; ++j)
3665        {
3666          bitset_t intersec; /* Intersection sets, see below.  */
3667          bitset_t remains;
3668          /* Flags, see below.  */
3669          bitset_word_t has_intersec, not_subset, not_consumed;
3670
3671          /* Optimization, skip if this state doesn't accept the character.  */
3672          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3673            continue;
3674
3675          /* Enumerate the intersection set of this state and `accepts'.  */
3676          has_intersec = 0;
3677          for (k = 0; k < BITSET_WORDS; ++k)
3678            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3679          /* And skip if the intersection set is empty.  */
3680          if (!has_intersec)
3681            continue;
3682
3683          /* Then check if this state is a subset of `accepts'.  */
3684          not_subset = not_consumed = 0;
3685          for (k = 0; k < BITSET_WORDS; ++k)
3686            {
3687              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3688              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3689            }
3690
3691          /* If this state isn't a subset of `accepts', create a
3692             new group state, which has the `remains'. */
3693          if (not_subset)
3694            {
3695              bitset_copy (dests_ch[ndests], remains);
3696              bitset_copy (dests_ch[j], intersec);
3697              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3698              if (BE (err != REG_NOERROR, 0))
3699                goto error_return;
3700              ++ndests;
3701            }
3702
3703          /* Put the position in the current group. */
3704          result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3705          if (BE (result < 0, 0))
3706            goto error_return;
3707
3708          /* If all characters are consumed, go to next node. */
3709          if (!not_consumed)
3710            break;
3711        }
3712      /* Some characters remain, create a new group. */
3713      if (j == ndests)
3714        {
3715          bitset_copy (dests_ch[ndests], accepts);
3716          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3717          if (BE (err != REG_NOERROR, 0))
3718            goto error_return;
3719          ++ndests;
3720          bitset_empty (accepts);
3721        }
3722    }
3723  return ndests;
3724 error_return:
3725  for (j = 0; j < ndests; ++j)
3726    re_node_set_free (dests_node + j);
3727  return -1;
3728}
3729
3730#ifdef RE_ENABLE_I18N
3731/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3732   Return the number of the bytes the node accepts.
3733   STR_IDX is the current index of the input string.
3734
3735   This function handles the nodes which can accept one character, or
3736   one collating element like '.', '[a-z]', opposite to the other nodes
3737   can only accept one byte.  */
3738
3739static int
3740internal_function
3741check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3742                         const re_string_t *input, int str_idx)
3743{
3744  const re_token_t *node = dfa->nodes + node_idx;
3745  int char_len, elem_len;
3746  int i;
3747  wint_t wc;
3748
3749  if (BE (node->type == OP_UTF8_PERIOD, 0))
3750    {
3751      unsigned char c = re_string_byte_at (input, str_idx), d;
3752      if (BE (c < 0xc2, 1))
3753        return 0;
3754
3755      if (str_idx + 2 > input->len)
3756        return 0;
3757
3758      d = re_string_byte_at (input, str_idx + 1);
3759      if (c < 0xe0)
3760        return (d < 0x80 || d > 0xbf) ? 0 : 2;
3761      else if (c < 0xf0)
3762        {
3763          char_len = 3;
3764          if (c == 0xe0 && d < 0xa0)
3765            return 0;
3766        }
3767      else if (c < 0xf8)
3768        {
3769          char_len = 4;
3770          if (c == 0xf0 && d < 0x90)
3771            return 0;
3772        }
3773      else if (c < 0xfc)
3774        {
3775          char_len = 5;
3776          if (c == 0xf8 && d < 0x88)
3777            return 0;
3778        }
3779      else if (c < 0xfe)
3780        {
3781          char_len = 6;
3782          if (c == 0xfc && d < 0x84)
3783            return 0;
3784        }
3785      else
3786        return 0;
3787
3788      if (str_idx + char_len > input->len)
3789        return 0;
3790
3791      for (i = 1; i < char_len; ++i)
3792        {
3793          d = re_string_byte_at (input, str_idx + i);
3794          if (d < 0x80 || d > 0xbf)
3795            return 0;
3796        }
3797      return char_len;
3798    }
3799
3800  char_len = re_string_char_size_at (input, str_idx);
3801  if (node->type == OP_PERIOD)
3802    {
3803      if (char_len <= 1)
3804        return 0;
3805      /* FIXME: I don't think this if is needed, as both '\n'
3806         and '\0' are char_len == 1.  */
3807      /* '.' accepts any one character except the following two cases.  */
3808      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3809           re_string_byte_at (input, str_idx) == '\n') ||
3810          ((dfa->syntax & RE_DOT_NOT_NULL) &&
3811           re_string_byte_at (input, str_idx) == '\0'))
3812        return 0;
3813      return char_len;
3814    }
3815
3816  elem_len = re_string_elem_size_at (input, str_idx);
3817  wc = __btowc(*(input->mbs+str_idx));
3818  if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3819    return 0;
3820
3821  if (node->type == COMPLEX_BRACKET)
3822    {
3823      const re_charset_t *cset = node->opr.mbcset;
3824# ifdef _LIBC
3825      const unsigned char *pin
3826        = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3827      int j;
3828      uint32_t nrules;
3829# endif /* _LIBC */
3830      int match_len = 0;
3831      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3832                    ? re_string_wchar_at (input, str_idx) : 0);
3833
3834      /* match with multibyte character?  */
3835      for (i = 0; i < cset->nmbchars; ++i)
3836        if (wc == cset->mbchars[i])
3837          {
3838            match_len = char_len;
3839            goto check_node_accept_bytes_match;
3840          }
3841      /* match with character_class?  */
3842      for (i = 0; i < cset->nchar_classes; ++i)
3843        {
3844          wctype_t wt = cset->char_classes[i];
3845          if (__iswctype (wc, wt))
3846            {
3847              match_len = char_len;
3848              goto check_node_accept_bytes_match;
3849            }
3850        }
3851
3852# ifdef _LIBC
3853      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3854      if (nrules != 0)
3855        {
3856          unsigned int in_collseq = 0;
3857          const int32_t *table, *indirect;
3858          const unsigned char *weights, *extra;
3859          const char *collseqwc;
3860          /* This #include defines a local function!  */
3861#  include <locale/weight.h>
3862
3863          /* match with collating_symbol?  */
3864          if (cset->ncoll_syms)
3865            extra = (const unsigned char *)
3866              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3867          for (i = 0; i < cset->ncoll_syms; ++i)
3868            {
3869              const unsigned char *coll_sym = extra + cset->coll_syms[i];
3870              /* Compare the length of input collating element and
3871                 the length of current collating element.  */
3872              if (*coll_sym != elem_len)
3873                continue;
3874              /* Compare each bytes.  */
3875              for (j = 0; j < *coll_sym; j++)
3876                if (pin[j] != coll_sym[1 + j])
3877                  break;
3878              if (j == *coll_sym)
3879                {
3880                  /* Match if every bytes is equal.  */
3881                  match_len = j;
3882                  goto check_node_accept_bytes_match;
3883                }
3884            }
3885
3886          if (cset->nranges)
3887            {
3888              if (elem_len <= char_len)
3889                {
3890                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3891                  in_collseq = __collseq_table_lookup (collseqwc, wc);
3892                }
3893              else
3894                in_collseq = find_collation_sequence_value (pin, elem_len);
3895            }
3896          /* match with range expression?  */
3897          for (i = 0; i < cset->nranges; ++i)
3898            if (cset->range_starts[i] <= in_collseq
3899                && in_collseq <= cset->range_ends[i])
3900              {
3901                match_len = elem_len;
3902                goto check_node_accept_bytes_match;
3903              }
3904
3905          /* match with equivalence_class?  */
3906          if (cset->nequiv_classes)
3907            {
3908              const unsigned char *cp = pin;
3909              table = (const int32_t *)
3910                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3911              weights = (const unsigned char *)
3912                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3913              extra = (const unsigned char *)
3914                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3915              indirect = (const int32_t *)
3916                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3917              int32_t idx = findidx (&cp);
3918              if (idx > 0)
3919                for (i = 0; i < cset->nequiv_classes; ++i)
3920                  {
3921                    int32_t equiv_class_idx = cset->equiv_classes[i];
3922                    size_t weight_len = weights[idx & 0xffffff];
3923                    if (weight_len == weights[equiv_class_idx & 0xffffff]
3924                        && (idx >> 24) == (equiv_class_idx >> 24))
3925                      {
3926                        int cnt = 0;
3927
3928                        idx &= 0xffffff;
3929                        equiv_class_idx &= 0xffffff;
3930
3931                        while (cnt <= weight_len
3932                               && (weights[equiv_class_idx + 1 + cnt]
3933                                   == weights[idx + 1 + cnt]))
3934                          ++cnt;
3935                        if (cnt > weight_len)
3936                          {
3937                            match_len = elem_len;
3938                            goto check_node_accept_bytes_match;
3939                          }
3940                      }
3941                  }
3942            }
3943        }
3944      else
3945# endif /* _LIBC */
3946        {
3947          /* match with range expression?  */
3948#if __GNUC__ >= 2
3949          wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3950#else
3951          wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3952          cmp_buf[2] = wc;
3953#endif
3954          for (i = 0; i < cset->nranges; ++i)
3955            {
3956              cmp_buf[0] = cset->range_starts[i];
3957              cmp_buf[4] = cset->range_ends[i];
3958              if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3959                  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3960                {
3961                  match_len = char_len;
3962                  goto check_node_accept_bytes_match;
3963                }
3964            }
3965        }
3966    check_node_accept_bytes_match:
3967      if (!cset->non_match)
3968        return match_len;
3969      else
3970        {
3971          if (match_len > 0)
3972            return 0;
3973          else
3974            return (elem_len > char_len) ? elem_len : char_len;
3975        }
3976    }
3977  return 0;
3978}
3979
3980# ifdef _LIBC
3981static unsigned int
3982internal_function
3983find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3984{
3985  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3986  if (nrules == 0)
3987    {
3988      if (mbs_len == 1)
3989        {
3990          /* No valid character.  Match it as a single byte character.  */
3991          const unsigned char *collseq = (const unsigned char *)
3992            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3993          return collseq[mbs[0]];
3994        }
3995      return UINT_MAX;
3996    }
3997  else
3998    {
3999      int32_t idx;
4000      const unsigned char *extra = (const unsigned char *)
4001        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4002      int32_t extrasize = (const unsigned char *)
4003        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4004
4005      for (idx = 0; idx < extrasize;)
4006        {
4007          int mbs_cnt, found = 0;
4008          int32_t elem_mbs_len;
4009          /* Skip the name of collating element name.  */
4010          idx = idx + extra[idx] + 1;
4011          elem_mbs_len = extra[idx++];
4012          if (mbs_len == elem_mbs_len)
4013            {
4014              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4015                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4016                  break;
4017              if (mbs_cnt == elem_mbs_len)
4018                /* Found the entry.  */
4019                found = 1;
4020            }
4021          /* Skip the byte sequence of the collating element.  */
4022          idx += elem_mbs_len;
4023          /* Adjust for the alignment.  */
4024          idx = (idx + 3) & ~3;
4025          /* Skip the collation sequence value.  */
4026          idx += sizeof (uint32_t);
4027          /* Skip the wide char sequence of the collating element.  */
4028          idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4029          /* If we found the entry, return the sequence value.  */
4030          if (found)
4031            return *(uint32_t *) (extra + idx);
4032          /* Skip the collation sequence value.  */
4033          idx += sizeof (uint32_t);
4034        }
4035      return UINT_MAX;
4036    }
4037}
4038# endif /* _LIBC */
4039#endif /* RE_ENABLE_I18N */
4040
4041/* Check whether the node accepts the byte which is IDX-th
4042   byte of the INPUT.  */
4043
4044static int
4045internal_function
4046check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4047                   int idx)
4048{
4049  unsigned char ch;
4050  ch = re_string_byte_at (&mctx->input, idx);
4051  switch (node->type)
4052    {
4053    case CHARACTER:
4054      if (node->opr.c != ch)
4055        return 0;
4056      break;
4057
4058    case SIMPLE_BRACKET:
4059      if (!bitset_contain (node->opr.sbcset, ch))
4060        return 0;
4061      break;
4062
4063#ifdef RE_ENABLE_I18N
4064    case OP_UTF8_PERIOD:
4065      if (ch >= 0x80)
4066        return 0;
4067      /* FALLTHROUGH */
4068#endif
4069    case OP_PERIOD:
4070      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4071          || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4072        return 0;
4073      break;
4074
4075    default:
4076      return 0;
4077    }
4078
4079  if (node->constraint)
4080    {
4081      /* The node has constraints.  Check whether the current context
4082         satisfies the constraints.  */
4083      unsigned int context = re_string_context_at (&mctx->input, idx,
4084                                                   mctx->eflags);
4085      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4086        return 0;
4087    }
4088
4089  return 1;
4090}
4091
4092/* Extend the buffers, if the buffers have run out.  */
4093
4094static reg_errcode_t
4095internal_function
4096extend_buffers (re_match_context_t *mctx)
4097{
4098  reg_errcode_t ret;
4099  re_string_t *pstr = &mctx->input;
4100
4101  /* Avoid overflow.  */
4102  if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4103    return REG_ESPACE;
4104
4105  /* Double the lengthes of the buffers.  */
4106  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4107  if (BE (ret != REG_NOERROR, 0))
4108    return ret;
4109
4110  if (mctx->state_log != NULL)
4111    {
4112      /* And double the length of state_log.  */
4113      /* XXX We have no indication of the size of this buffer.  If this
4114         allocation fail we have no indication that the state_log array
4115         does not have the right size.  */
4116      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4117                                              pstr->bufs_len + 1);
4118      if (BE (new_array == NULL, 0))
4119        return REG_ESPACE;
4120      mctx->state_log = new_array;
4121    }
4122
4123  /* Then reconstruct the buffers.  */
4124  if (pstr->icase)
4125    {
4126#ifdef RE_ENABLE_I18N
4127      if (pstr->mb_cur_max > 1)
4128        {
4129          ret = build_wcs_upper_buffer (pstr);
4130          if (BE (ret != REG_NOERROR, 0))
4131            return ret;
4132        }
4133      else
4134#endif /* RE_ENABLE_I18N  */
4135        build_upper_buffer (pstr);
4136    }
4137  else
4138    {
4139#ifdef RE_ENABLE_I18N
4140      if (pstr->mb_cur_max > 1)
4141        build_wcs_buffer (pstr);
4142      else
4143#endif /* RE_ENABLE_I18N  */
4144        {
4145          if (pstr->trans != NULL)
4146            re_string_translate_buffer (pstr);
4147        }
4148    }
4149  return REG_NOERROR;
4150}
4151
4152\f
4153/* Functions for matching context.  */
4154
4155/* Initialize MCTX.  */
4156
4157static reg_errcode_t
4158internal_function
4159match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4160{
4161  mctx->eflags = eflags;
4162  mctx->match_last = -1;
4163  if (n > 0)
4164    {
4165      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4166      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4167      if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4168        return REG_ESPACE;
4169    }
4170  /* Already zero-ed by the caller.
4171     else
4172       mctx->bkref_ents = NULL;
4173     mctx->nbkref_ents = 0;
4174     mctx->nsub_tops = 0;  */
4175  mctx->abkref_ents = n;
4176  mctx->max_mb_elem_len = 1;
4177  mctx->asub_tops = n;
4178  return REG_NOERROR;
4179}
4180
4181/* Clean the entries which depend on the current input in MCTX.
4182   This function must be invoked when the matcher changes the start index
4183   of the input, or changes the input string.  */
4184
4185static void
4186internal_function
4187match_ctx_clean (re_match_context_t *mctx)
4188{
4189  int st_idx;
4190  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4191    {
4192      int sl_idx;
4193      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4194      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4195        {
4196          re_sub_match_last_t *last = top->lasts[sl_idx];
4197          re_free (last->path.array);
4198          re_free (last);
4199        }
4200      re_free (top->lasts);
4201      if (top->path)
4202        {
4203          re_free (top->path->array);
4204          re_free (top->path);
4205        }
4206      free (top);
4207    }
4208
4209  mctx->nsub_tops = 0;
4210  mctx->nbkref_ents = 0;
4211}
4212
4213/* Free all the memory associated with MCTX.  */
4214
4215static void
4216internal_function
4217match_ctx_free (re_match_context_t *mctx)
4218{
4219  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4220  match_ctx_clean (mctx);
4221  re_free (mctx->sub_tops);
4222  re_free (mctx->bkref_ents);
4223}
4224
4225/* Add a new backreference entry to MCTX.
4226   Note that we assume that caller never call this function with duplicate
4227   entry, and call with STR_IDX which isn't smaller than any existing entry.
4228*/
4229
4230static reg_errcode_t
4231internal_function
4232match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4233                     int to)
4234{
4235  if (mctx->nbkref_ents >= mctx->abkref_ents)
4236    {
4237      struct re_backref_cache_entry* new_entry;
4238      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4239                              mctx->abkref_ents * 2);
4240      if (BE (new_entry == NULL, 0))
4241        {
4242          re_free (mctx->bkref_ents);
4243          return REG_ESPACE;
4244        }
4245      mctx->bkref_ents = new_entry;
4246      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4247              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4248      mctx->abkref_ents *= 2;
4249    }
4250  if (mctx->nbkref_ents > 0
4251      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4252    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4253
4254  mctx->bkref_ents[mctx->nbkref_ents].node = node;
4255  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4256  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4257  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4258
4259  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4260     If bit N is clear, means that this entry won't epsilon-transition to
4261     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4262     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4263     such node.
4264
4265     A backreference does not epsilon-transition unless it is empty, so set
4266     to all zeros if FROM != TO.  */
4267  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4268    = (from == to ? ~0 : 0);
4269
4270  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4271  if (mctx->max_mb_elem_len < to - from)
4272    mctx->max_mb_elem_len = to - from;
4273  return REG_NOERROR;
4274}
4275
4276/* Search for the first entry which has the same str_idx, or -1 if none is
4277   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4278
4279static int
4280internal_function
4281search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4282{
4283  int left, right, mid, last;
4284  last = right = mctx->nbkref_ents;
4285  for (left = 0; left < right;)
4286    {
4287      mid = (left + right) / 2;
4288      if (mctx->bkref_ents[mid].str_idx < str_idx)
4289        left = mid + 1;
4290      else
4291        right = mid;
4292    }
4293  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4294    return left;
4295  else
4296    return -1;
4297}
4298
4299/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4300   at STR_IDX.  */
4301
4302static reg_errcode_t
4303internal_function
4304match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4305{
4306#ifdef DEBUG
4307  assert (mctx->sub_tops != NULL);
4308  assert (mctx->asub_tops > 0);
4309#endif
4310  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4311    {
4312      int new_asub_tops = mctx->asub_tops * 2;
4313      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4314                                                   re_sub_match_top_t *,
4315                                                   new_asub_tops);
4316      if (BE (new_array == NULL, 0))
4317        return REG_ESPACE;
4318      mctx->sub_tops = new_array;
4319      mctx->asub_tops = new_asub_tops;
4320    }
4321  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4322  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4323    return REG_ESPACE;
4324  mctx->sub_tops[mctx->nsub_tops]->node = node;
4325  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4326  return REG_NOERROR;
4327}
4328
4329/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4330   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4331
4332static re_sub_match_last_t *
4333internal_function
4334match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4335{
4336  re_sub_match_last_t *new_entry;
4337  if (BE (subtop->nlasts == subtop->alasts, 0))
4338    {
4339      int new_alasts = 2 * subtop->alasts + 1;
4340      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4341                                                    re_sub_match_last_t *,
4342                                                    new_alasts);
4343      if (BE (new_array == NULL, 0))
4344        return NULL;
4345      subtop->lasts = new_array;
4346      subtop->alasts = new_alasts;
4347    }
4348  new_entry = calloc (1, sizeof (re_sub_match_last_t));
4349  if (BE (new_entry != NULL, 1))
4350    {
4351      subtop->lasts[subtop->nlasts] = new_entry;
4352      new_entry->node = node;
4353      new_entry->str_idx = str_idx;
4354      ++subtop->nlasts;
4355    }
4356  return new_entry;
4357}
4358
4359static void
4360internal_function
4361sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4362               re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4363{
4364  sctx->sifted_states = sifted_sts;
4365  sctx->limited_states = limited_sts;
4366  sctx->last_node = last_node;
4367  sctx->last_str_idx = last_str_idx;
4368  re_node_set_init_empty (&sctx->limits);
4369}