Andrew's git - gitweb.git/blob - compat/regex/regexec.c

   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                     int n) internal_function;
  23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24static void match_ctx_free (re_match_context_t *cache) internal_function;
  25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                          int str_idx, int from, int to)
  27     internal_function;
  28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29     internal_function;
  30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                           int str_idx) internal_function;
  32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                   int node, int str_idx)
  34     internal_function;
  35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                           re_dfastate_t **limited_sts, int last_node,
  37                           int last_str_idx)
  38     internal_function;
  39static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                         const char *string, int length,
  41                                         int start, int range, int stop,
  42                                         size_t nmatch, regmatch_t pmatch[],
  43                                         int eflags) internal_function;
  44static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                             const char *string1, int length1,
  46                             const char *string2, int length2,
  47                             int start, int range, struct re_registers *regs,
  48                             int stop, int ret_len) internal_function;
  49static int re_search_stub (struct re_pattern_buffer *bufp,
  50                           const char *string, int length, int start,
  51                           int range, int stop, struct re_registers *regs,
  52                           int ret_len) internal_function;
  53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                              int nregs, int regs_allocated) internal_function;
  55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56     internal_function;
  57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                           int *p_match_first) internal_function;
  59static int check_halt_state_context (const re_match_context_t *mctx,
  60                                     const re_dfastate_t *state, int idx)
  61     internal_function;
  62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                         regmatch_t *prev_idx_match, int cur_node,
  64                         int cur_idx, int nmatch) internal_function;
  65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                      int str_idx, int dest_node, int nregs,
  67                                      regmatch_t *regs,
  68                                      re_node_set *eps_via_nodes)
  69     internal_function;
  70static reg_errcode_t set_regs (const regex_t *preg,
  71                               const re_match_context_t *mctx,
  72                               size_t nmatch, regmatch_t *pmatch,
  73                               int fl_backtrack) internal_function;
  74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75     internal_function;
  76
  77#ifdef RE_ENABLE_I18N
  78static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                re_sift_context_t *sctx,
  80                                int node_idx, int str_idx, int max_str_idx)
  81     internal_function;
  82#endif /* RE_ENABLE_I18N */
  83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                           re_sift_context_t *sctx)
  85     internal_function;
  86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                          re_sift_context_t *sctx, int str_idx,
  88                                          re_node_set *cur_dest)
  89     internal_function;
  90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                              re_sift_context_t *sctx,
  92                                              int str_idx,
  93                                              re_node_set *dest_nodes)
  94     internal_function;
  95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                            re_node_set *dest_nodes,
  97                                            const re_node_set *candidates)
  98     internal_function;
  99static int check_dst_limits (const re_match_context_t *mctx,
 100                             re_node_set *limits,
 101                             int dst_node, int dst_idx, int src_node,
 102                             int src_idx) internal_function;
 103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                        int boundaries, int subexp_idx,
 105                                        int from_node, int bkref_idx)
 106     internal_function;
 107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                      int limit, int subexp_idx,
 109                                      int node, int str_idx,
 110                                      int bkref_idx) internal_function;
 111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                          re_node_set *dest_nodes,
 113                                          const re_node_set *candidates,
 114                                          re_node_set *limits,
 115                                          struct re_backref_cache_entry *bkref_ents,
 116                                          int str_idx) internal_function;
 117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                        re_sift_context_t *sctx,
 119                                        int str_idx, const re_node_set *candidates)
 120     internal_function;
 121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                        re_dfastate_t **dst,
 123                                        re_dfastate_t **src, int num)
 124     internal_function;
 125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                         re_match_context_t *mctx) internal_function;
 127static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                     re_match_context_t *mctx,
 129                                     re_dfastate_t *state) internal_function;
 130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                            re_match_context_t *mctx,
 132                                            re_dfastate_t *next_state)
 133     internal_function;
 134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                re_node_set *cur_nodes,
 136                                                int str_idx) internal_function;
 137#if 0
 138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                        re_match_context_t *mctx,
 140                                        re_dfastate_t *pstate)
 141     internal_function;
 142#endif
 143#ifdef RE_ENABLE_I18N
 144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                       re_dfastate_t *pstate)
 146     internal_function;
 147#endif /* RE_ENABLE_I18N */
 148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                          const re_node_set *nodes)
 150     internal_function;
 151static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                 int bkref_node, int bkref_str_idx)
 153     internal_function;
 154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                     const re_sub_match_top_t *sub_top,
 156                                     re_sub_match_last_t *sub_last,
 157                                     int bkref_node, int bkref_str)
 158     internal_function;
 159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                             int subexp_idx, int type) internal_function;
 161static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                    state_array_t *path, int top_node,
 163                                    int top_str, int last_node, int last_str,
 164                                    int type) internal_function;
 165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                   int str_idx,
 167                                                   re_node_set *cur_nodes,
 168                                                   re_node_set *next_nodes)
 169     internal_function;
 170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                               re_node_set *cur_nodes,
 172                                               int ex_subexp, int type)
 173     internal_function;
 174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                   re_node_set *dst_nodes,
 176                                                   int target, int ex_subexp,
 177                                                   int type) internal_function;
 178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                         re_node_set *cur_nodes, int cur_str,
 180                                         int subexp_num, int type)
 181     internal_function;
 182static int build_trtable (const re_dfa_t *dfa,
 183                          re_dfastate_t *state) internal_function;
 184#ifdef RE_ENABLE_I18N
 185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                    const re_string_t *input, int idx)
 187     internal_function;
 188# ifdef _LIBC
 189static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                   size_t name_len)
 191     internal_function;
 192# endif /* _LIBC */
 193#endif /* RE_ENABLE_I18N */
 194static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                       const re_dfastate_t *state,
 196                                       re_node_set *states_node,
 197                                       bitset_t *states_ch) internal_function;
 198static int check_node_accept (const re_match_context_t *mctx,
 199                              const re_token_t *node, int idx)
 200     internal_function;
 201static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202     internal_function;
 203\f
 204/* Entry point for POSIX code.  */
 205
 206/* regexec searches for a given pattern, specified by PREG, in the
 207   string STRING.
 208
 209   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211   least NMATCH elements, and we set them to the offsets of the
 212   corresponding matched substrings.
 213
 214   EFLAGS specifies `execution flags' which affect matching: if
 215   REG_NOTBOL is set, then ^ does not match at the beginning of the
 216   string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218   We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220int
 221regexec (
 222        const regex_t *__restrict preg,
 223        const char *__restrict string,
 224        size_t nmatch,
 225        regmatch_t pmatch[],
 226        int eflags)
 227{
 228  reg_errcode_t err;
 229  int start, length;
 230
 231  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 232    return REG_BADPAT;
 233
 234  if (eflags & REG_STARTEND)
 235    {
 236      start = pmatch[0].rm_so;
 237      length = pmatch[0].rm_eo;
 238    }
 239  else
 240    {
 241      start = 0;
 242      length = strlen (string);
 243    }
 244
 245  __libc_lock_lock (dfa->lock);
 246  if (preg->no_sub)
 247    err = re_search_internal (preg, string, length, start, length - start,
 248                              length, 0, NULL, eflags);
 249  else
 250    err = re_search_internal (preg, string, length, start, length - start,
 251                              length, nmatch, pmatch, eflags);
 252  __libc_lock_unlock (dfa->lock);
 253  return err != REG_NOERROR;
 254}
 255
 256#ifdef _LIBC
 257# include <shlib-compat.h>
 258versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 259
 260# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 261__typeof__ (__regexec) __compat_regexec;
 262
 263int
 264attribute_compat_text_section
 265__compat_regexec (const regex_t *__restrict preg,
 266                  const char *__restrict string, size_t nmatch,
 267                  regmatch_t pmatch[], int eflags)
 268{
 269  return regexec (preg, string, nmatch, pmatch,
 270                  eflags & (REG_NOTBOL | REG_NOTEOL));
 271}
 272compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 273# endif
 274#endif
 275
 276/* Entry points for GNU code.  */
 277
 278/* re_match, re_search, re_match_2, re_search_2
 279
 280   The former two functions operate on STRING with length LENGTH,
 281   while the later two operate on concatenation of STRING1 and STRING2
 282   with lengths LENGTH1 and LENGTH2, respectively.
 283
 284   re_match() matches the compiled pattern in BUFP against the string,
 285   starting at index START.
 286
 287   re_search() first tries matching at index START, then it tries to match
 288   starting from index START + 1, and so on.  The last start position tried
 289   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 290   way as re_match().)
 291
 292   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 293   the first STOP characters of the concatenation of the strings should be
 294   concerned.
 295
 296   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 297   and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 298   computed relative to the concatenation, not relative to the individual
 299   strings.)
 300
 301   On success, re_match* functions return the length of the match, re_search*
 302   return the position of the start of the match.  Return value -1 means no
 303   match was found and -2 indicates an internal error.  */
 304
 305int
 306re_match (struct re_pattern_buffer *bufp,
 307          const char *string,
 308          int length,
 309          int start,
 310          struct re_registers *regs)
 311{
 312  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 313}
 314#ifdef _LIBC
 315weak_alias (__re_match, re_match)
 316#endif
 317
 318int
 319re_search (struct re_pattern_buffer *bufp,
 320           const char *string,
 321           int length, int start, int range,
 322           struct re_registers *regs)
 323{
 324  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 325}
 326#ifdef _LIBC
 327weak_alias (__re_search, re_search)
 328#endif
 329
 330int
 331re_match_2 (struct re_pattern_buffer *bufp,
 332            const char *string1, int length1,
 333            const char *string2, int length2, int start,
 334            struct re_registers *regs, int stop)
 335{
 336  return re_search_2_stub (bufp, string1, length1, string2, length2,
 337                           start, 0, regs, stop, 1);
 338}
 339#ifdef _LIBC
 340weak_alias (__re_match_2, re_match_2)
 341#endif
 342
 343int
 344re_search_2 (struct re_pattern_buffer *bufp,
 345             const char *string1, int length1,
 346             const char *string2, int length2, int start,
 347             int range, struct re_registers *regs,  int stop)
 348{
 349  return re_search_2_stub (bufp, string1, length1, string2, length2,
 350                           start, range, regs, stop, 0);
 351}
 352#ifdef _LIBC
 353weak_alias (__re_search_2, re_search_2)
 354#endif
 355
 356static int
 357re_search_2_stub (struct re_pattern_buffer *bufp,
 358                  const char *string1, int length1,
 359                  const char *string2, int length2, int start,
 360                  int range, struct re_registers *regs,
 361                  int stop, int ret_len)
 362{
 363  const char *str;
 364  int rval;
 365  int len = length1 + length2;
 366  int free_str = 0;
 367
 368  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 369    return -2;
 370
 371  /* Concatenate the strings.  */
 372  if (length2 > 0)
 373    if (length1 > 0)
 374      {
 375        char *s = re_malloc (char, len);
 376
 377        if (BE (s == NULL, 0))
 378          return -2;
 379        memcpy (s, string1, length1);
 380        memcpy (s + length1, string2, length2);
 381        str = s;
 382        free_str = 1;
 383      }
 384    else
 385      str = string2;
 386  else
 387    str = string1;
 388
 389  rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 390  if (free_str)
 391    re_free ((char *) str);
 392  return rval;
 393}
 394
 395/* The parameters have the same meaning as those of re_search.
 396   Additional parameters:
 397   If RET_LEN is nonzero the length of the match is returned (re_match style);
 398   otherwise the position of the match is returned.  */
 399
 400static int
 401re_search_stub (struct re_pattern_buffer *bufp,
 402                const char *string, int length, int start,
 403                int range, int stop,
 404                struct re_registers *regs, int ret_len)
 405{
 406  reg_errcode_t result;
 407  regmatch_t *pmatch;
 408  int nregs, rval;
 409  int eflags = 0;
 410
 411  /* Check for out-of-range.  */
 412  if (BE (start < 0 || start > length, 0))
 413    return -1;
 414  if (BE (start + range > length, 0))
 415    range = length - start;
 416  else if (BE (start + range < 0, 0))
 417    range = -start;
 418
 419  __libc_lock_lock (dfa->lock);
 420
 421  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 422  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 423
 424  /* Compile fastmap if we haven't yet.  */
 425  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 426    re_compile_fastmap (bufp);
 427
 428  if (BE (bufp->no_sub, 0))
 429    regs = NULL;
 430
 431  /* We need at least 1 register.  */
 432  if (regs == NULL)
 433    nregs = 1;
 434  else if (BE (bufp->regs_allocated == REGS_FIXED &&
 435               regs->num_regs < bufp->re_nsub + 1, 0))
 436    {
 437      nregs = regs->num_regs;
 438      if (BE (nregs < 1, 0))
 439        {
 440          /* Nothing can be copied to regs.  */
 441          regs = NULL;
 442          nregs = 1;
 443        }
 444    }
 445  else
 446    nregs = bufp->re_nsub + 1;
 447  pmatch = re_malloc (regmatch_t, nregs);
 448  if (BE (pmatch == NULL, 0))
 449    {
 450      rval = -2;
 451      goto out;
 452    }
 453
 454  result = re_search_internal (bufp, string, length, start, range, stop,
 455                               nregs, pmatch, eflags);
 456
 457  rval = 0;
 458
 459  /* I hope we needn't fill ther regs with -1's when no match was found.  */
 460  if (result != REG_NOERROR)
 461    rval = -1;
 462  else if (regs != NULL)
 463    {
 464      /* If caller wants register contents data back, copy them.  */
 465      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 466                                           bufp->regs_allocated);
 467      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 468        rval = -2;
 469    }
 470
 471  if (BE (rval == 0, 1))
 472    {
 473      if (ret_len)
 474        {
 475          assert (pmatch[0].rm_so == start);
 476          rval = pmatch[0].rm_eo - start;
 477        }
 478      else
 479        rval = pmatch[0].rm_so;
 480    }
 481  re_free (pmatch);
 482 out:
 483  __libc_lock_unlock (dfa->lock);
 484  return rval;
 485}
 486
 487static unsigned
 488re_copy_regs (struct re_registers *regs,
 489              regmatch_t *pmatch,
 490              int nregs, int regs_allocated)
 491{
 492  int rval = REGS_REALLOCATE;
 493  int i;
 494  int need_regs = nregs + 1;
 495  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 496     uses.  */
 497
 498  /* Have the register data arrays been allocated?  */
 499  if (regs_allocated == REGS_UNALLOCATED)
 500    { /* No.  So allocate them with malloc.  */
 501      regs->start = re_malloc (regoff_t, need_regs);
 502      if (BE (regs->start == NULL, 0))
 503        return REGS_UNALLOCATED;
 504      regs->end = re_malloc (regoff_t, need_regs);
 505      if (BE (regs->end == NULL, 0))
 506        {
 507          re_free (regs->start);
 508          return REGS_UNALLOCATED;
 509        }
 510      regs->num_regs = need_regs;
 511    }
 512  else if (regs_allocated == REGS_REALLOCATE)
 513    { /* Yes.  If we need more elements than were already
 514         allocated, reallocate them.  If we need fewer, just
 515         leave it alone.  */
 516      if (BE (need_regs > regs->num_regs, 0))
 517        {
 518          regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 519          regoff_t *new_end;
 520          if (BE (new_start == NULL, 0))
 521            return REGS_UNALLOCATED;
 522          new_end = re_realloc (regs->end, regoff_t, need_regs);
 523          if (BE (new_end == NULL, 0))
 524            {
 525              re_free (new_start);
 526              return REGS_UNALLOCATED;
 527            }
 528          regs->start = new_start;
 529          regs->end = new_end;
 530          regs->num_regs = need_regs;
 531        }
 532    }
 533  else
 534    {
 535      assert (regs_allocated == REGS_FIXED);
 536      /* This function may not be called with REGS_FIXED and nregs too big.  */
 537      assert (regs->num_regs >= nregs);
 538      rval = REGS_FIXED;
 539    }
 540
 541  /* Copy the regs.  */
 542  for (i = 0; i < nregs; ++i)
 543    {
 544      regs->start[i] = pmatch[i].rm_so;
 545      regs->end[i] = pmatch[i].rm_eo;
 546    }
 547  for ( ; i < regs->num_regs; ++i)
 548    regs->start[i] = regs->end[i] = -1;
 549
 550  return rval;
 551}
 552
 553/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 554   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 555   this memory for recording register information.  STARTS and ENDS
 556   must be allocated using the malloc library routine, and must each
 557   be at least NUM_REGS * sizeof (regoff_t) bytes long.
 558
 559   If NUM_REGS == 0, then subsequent matches should allocate their own
 560   register data.
 561
 562   Unless this function is called, the first search or match using
 563   PATTERN_BUFFER will allocate its own register data, without
 564   freeing the old data.  */
 565
 566void
 567re_set_registers (struct re_pattern_buffer *bufp,
 568                  struct re_registers *regs,
 569                  unsigned num_regs,
 570                  regoff_t *starts,
 571                  regoff_t *ends)
 572{
 573  if (num_regs)
 574    {
 575      bufp->regs_allocated = REGS_REALLOCATE;
 576      regs->num_regs = num_regs;
 577      regs->start = starts;
 578      regs->end = ends;
 579    }
 580  else
 581    {
 582      bufp->regs_allocated = REGS_UNALLOCATED;
 583      regs->num_regs = 0;
 584      regs->start = regs->end = (regoff_t *) 0;
 585    }
 586}
 587#ifdef _LIBC
 588weak_alias (__re_set_registers, re_set_registers)
 589#endif
 590\f
 591/* Entry points compatible with 4.2 BSD regex library.  We don't define
 592   them unless specifically requested.  */
 593
 594#if defined _REGEX_RE_COMP || defined _LIBC
 595int
 596# ifdef _LIBC
 597weak_function
 598# endif
 599re_exec (s)
 600     const char *s;
 601{
 602  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 603}
 604#endif /* _REGEX_RE_COMP */
 605\f
 606/* Internal entry point.  */
 607
 608/* Searches for a compiled pattern PREG in the string STRING, whose
 609   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 610   mingings with regexec.  START, and RANGE have the same meanings
 611   with re_search.
 612   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 613   otherwise return the error code.
 614   Note: We assume front end functions already check ranges.
 615   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 616
 617static reg_errcode_t
 618re_search_internal (const regex_t *preg,
 619                    const char *string,
 620                    int length, int start, int range, int stop,
 621                    size_t nmatch, regmatch_t pmatch[],
 622                    int eflags)
 623{
 624  reg_errcode_t err;
 625  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 626  int left_lim, right_lim, incr;
 627  int fl_longest_match, match_first, match_kind, match_last = -1;
 628  int extra_nmatch;
 629  int sb, ch;
 630#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 631  re_match_context_t mctx = { .dfa = dfa };
 632#else
 633  re_match_context_t mctx;
 634#endif
 635  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 636                   && range && !preg->can_be_null) ? preg->fastmap : NULL;
 637  RE_TRANSLATE_TYPE t = preg->translate;
 638
 639#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 640  memset (&mctx, '\0', sizeof (re_match_context_t));
 641  mctx.dfa = dfa;
 642#endif
 643
 644  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 645  nmatch -= extra_nmatch;
 646
 647  /* Check if the DFA haven't been compiled.  */
 648  if (BE (preg->used == 0 || dfa->init_state == NULL
 649          || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 650          || dfa->init_state_begbuf == NULL, 0))
 651    return REG_NOMATCH;
 652
 653#ifdef DEBUG
 654  /* We assume front-end functions already check them.  */
 655  assert (start + range >= 0 && start + range <= length);
 656#endif
 657
 658  /* If initial states with non-begbuf contexts have no elements,
 659     the regex must be anchored.  If preg->newline_anchor is set,
 660     we'll never use init_state_nl, so do not check it.  */
 661  if (dfa->init_state->nodes.nelem == 0
 662      && dfa->init_state_word->nodes.nelem == 0
 663      && (dfa->init_state_nl->nodes.nelem == 0
 664          || !preg->newline_anchor))
 665    {
 666      if (start != 0 && start + range != 0)
 667        return REG_NOMATCH;
 668      start = range = 0;
 669    }
 670
 671  /* We must check the longest matching, if nmatch > 0.  */
 672  fl_longest_match = (nmatch != 0 || dfa->nbackref);
 673
 674  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 675                            preg->translate, preg->syntax & RE_ICASE, dfa);
 676  if (BE (err != REG_NOERROR, 0))
 677    goto free_return;
 678  mctx.input.stop = stop;
 679  mctx.input.raw_stop = stop;
 680  mctx.input.newline_anchor = preg->newline_anchor;
 681
 682  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 683  if (BE (err != REG_NOERROR, 0))
 684    goto free_return;
 685
 686  /* We will log all the DFA states through which the dfa pass,
 687     if nmatch > 1, or this dfa has "multibyte node", which is a
 688     back-reference or a node which can accept multibyte character or
 689     multi character collating element.  */
 690  if (nmatch > 1 || dfa->has_mb_node)
 691    {
 692      /* Avoid overflow.  */
 693      if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 694        {
 695          err = REG_ESPACE;
 696          goto free_return;
 697        }
 698
 699      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 700      if (BE (mctx.state_log == NULL, 0))
 701        {
 702          err = REG_ESPACE;
 703          goto free_return;
 704        }
 705    }
 706  else
 707    mctx.state_log = NULL;
 708
 709  match_first = start;
 710  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 711                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 712
 713  /* Check incrementally whether of not the input string match.  */
 714  incr = (range < 0) ? -1 : 1;
 715  left_lim = (range < 0) ? start + range : start;
 716  right_lim = (range < 0) ? start : start + range;
 717  sb = dfa->mb_cur_max == 1;
 718  match_kind =
 719    (fastmap
 720     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 721        | (range >= 0 ? 2 : 0)
 722        | (t != NULL ? 1 : 0))
 723     : 8);
 724
 725  for (;; match_first += incr)
 726    {
 727      err = REG_NOMATCH;
 728      if (match_first < left_lim || right_lim < match_first)
 729        goto free_return;
 730
 731      /* Advance as rapidly as possible through the string, until we
 732         find a plausible place to start matching.  This may be done
 733         with varying efficiency, so there are various possibilities:
 734         only the most common of them are specialized, in order to
 735         save on code size.  We use a switch statement for speed.  */
 736      switch (match_kind)
 737        {
 738        case 8:
 739          /* No fastmap.  */
 740          break;
 741
 742        case 7:
 743          /* Fastmap with single-byte translation, match forward.  */
 744          while (BE (match_first < right_lim, 1)
 745                 && !fastmap[t[(unsigned char) string[match_first]]])
 746            ++match_first;
 747          goto forward_match_found_start_or_reached_end;
 748
 749        case 6:
 750          /* Fastmap without translation, match forward.  */
 751          while (BE (match_first < right_lim, 1)
 752                 && !fastmap[(unsigned char) string[match_first]])
 753            ++match_first;
 754
 755        forward_match_found_start_or_reached_end:
 756          if (BE (match_first == right_lim, 0))
 757            {
 758              ch = match_first >= length
 759                       ? 0 : (unsigned char) string[match_first];
 760              if (!fastmap[t ? t[ch] : ch])
 761                goto free_return;
 762            }
 763          break;
 764
 765        case 4:
 766        case 5:
 767          /* Fastmap without multi-byte translation, match backwards.  */
 768          while (match_first >= left_lim)
 769            {
 770              ch = match_first >= length
 771                       ? 0 : (unsigned char) string[match_first];
 772              if (fastmap[t ? t[ch] : ch])
 773                break;
 774              --match_first;
 775            }
 776          if (match_first < left_lim)
 777            goto free_return;
 778          break;
 779
 780        default:
 781          /* In this case, we can't determine easily the current byte,
 782             since it might be a component byte of a multibyte
 783             character.  Then we use the constructed buffer instead.  */
 784          for (;;)
 785            {
 786              /* If MATCH_FIRST is out of the valid range, reconstruct the
 787                 buffers.  */
 788              unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 789              if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 790                {
 791                  err = re_string_reconstruct (&mctx.input, match_first,
 792                                               eflags);
 793                  if (BE (err != REG_NOERROR, 0))
 794                    goto free_return;
 795
 796                  offset = match_first - mctx.input.raw_mbs_idx;
 797                }
 798              /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 799                 Note that MATCH_FIRST must not be smaller than 0.  */
 800              ch = (match_first >= length
 801                    ? 0 : re_string_byte_at (&mctx.input, offset));
 802              if (fastmap[ch])
 803                break;
 804              match_first += incr;
 805              if (match_first < left_lim || match_first > right_lim)
 806                {
 807                  err = REG_NOMATCH;
 808                  goto free_return;
 809                }
 810            }
 811          break;
 812        }
 813
 814      /* Reconstruct the buffers so that the matcher can assume that
 815         the matching starts from the beginning of the buffer.  */
 816      err = re_string_reconstruct (&mctx.input, match_first, eflags);
 817      if (BE (err != REG_NOERROR, 0))
 818        goto free_return;
 819
 820#ifdef RE_ENABLE_I18N
 821     /* Don't consider this char as a possible match start if it part,
 822        yet isn't the head, of a multibyte character.  */
 823      if (!sb && !re_string_first_byte (&mctx.input, 0))
 824        continue;
 825#endif
 826
 827      /* It seems to be appropriate one, then use the matcher.  */
 828      /* We assume that the matching starts from 0.  */
 829      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 830      match_last = check_matching (&mctx, fl_longest_match,
 831                                   range >= 0 ? &match_first : NULL);
 832      if (match_last != -1)
 833        {
 834          if (BE (match_last == -2, 0))
 835            {
 836              err = REG_ESPACE;
 837              goto free_return;
 838            }
 839          else
 840            {
 841              mctx.match_last = match_last;
 842              if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 843                {
 844                  re_dfastate_t *pstate = mctx.state_log[match_last];
 845                  mctx.last_node = check_halt_state_context (&mctx, pstate,
 846                                                             match_last);
 847                }
 848              if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 849                  || dfa->nbackref)
 850                {
 851                  err = prune_impossible_nodes (&mctx);
 852                  if (err == REG_NOERROR)
 853                    break;
 854                  if (BE (err != REG_NOMATCH, 0))
 855                    goto free_return;
 856                  match_last = -1;
 857                }
 858              else
 859                break; /* We found a match.  */
 860            }
 861        }
 862
 863      match_ctx_clean (&mctx);
 864    }
 865
 866#ifdef DEBUG
 867  assert (match_last != -1);
 868  assert (err == REG_NOERROR);
 869#endif
 870
 871  /* Set pmatch[] if we need.  */
 872  if (nmatch > 0)
 873    {
 874      int reg_idx;
 875
 876      /* Initialize registers.  */
 877      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 878        pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 879
 880      /* Set the points where matching start/end.  */
 881      pmatch[0].rm_so = 0;
 882      pmatch[0].rm_eo = mctx.match_last;
 883
 884      if (!preg->no_sub && nmatch > 1)
 885        {
 886          err = set_regs (preg, &mctx, nmatch, pmatch,
 887                          dfa->has_plural_match && dfa->nbackref > 0);
 888          if (BE (err != REG_NOERROR, 0))
 889            goto free_return;
 890        }
 891
 892      /* At last, add the offset to the each registers, since we slided
 893         the buffers so that we could assume that the matching starts
 894         from 0.  */
 895      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 896        if (pmatch[reg_idx].rm_so != -1)
 897          {
 898#ifdef RE_ENABLE_I18N
 899            if (BE (mctx.input.offsets_needed != 0, 0))
 900              {
 901                pmatch[reg_idx].rm_so =
 902                  (pmatch[reg_idx].rm_so == mctx.input.valid_len
 903                   ? mctx.input.valid_raw_len
 904                   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 905                pmatch[reg_idx].rm_eo =
 906                  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 907                   ? mctx.input.valid_raw_len
 908                   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 909              }
 910#else
 911            assert (mctx.input.offsets_needed == 0);
 912#endif
 913            pmatch[reg_idx].rm_so += match_first;
 914            pmatch[reg_idx].rm_eo += match_first;
 915          }
 916      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 917        {
 918          pmatch[nmatch + reg_idx].rm_so = -1;
 919          pmatch[nmatch + reg_idx].rm_eo = -1;
 920        }
 921
 922      if (dfa->subexp_map)
 923        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 924          if (dfa->subexp_map[reg_idx] != reg_idx)
 925            {
 926              pmatch[reg_idx + 1].rm_so
 927                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 928              pmatch[reg_idx + 1].rm_eo
 929                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 930            }
 931    }
 932
 933 free_return:
 934  re_free (mctx.state_log);
 935  if (dfa->nbackref)
 936    match_ctx_free (&mctx);
 937  re_string_destruct (&mctx.input);
 938  return err;
 939}
 940
 941static reg_errcode_t
 942prune_impossible_nodes (re_match_context_t *mctx)
 943{
 944  const re_dfa_t *const dfa = mctx->dfa;
 945  int halt_node, match_last;
 946  reg_errcode_t ret;
 947  re_dfastate_t **sifted_states;
 948  re_dfastate_t **lim_states = NULL;
 949  re_sift_context_t sctx;
 950#ifdef DEBUG
 951  assert (mctx->state_log != NULL);
 952#endif
 953  match_last = mctx->match_last;
 954  halt_node = mctx->last_node;
 955
 956  /* Avoid overflow.  */
 957  if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 958    return REG_ESPACE;
 959
 960  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 961  if (BE (sifted_states == NULL, 0))
 962    {
 963      ret = REG_ESPACE;
 964      goto free_return;
 965    }
 966  if (dfa->nbackref)
 967    {
 968      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 969      if (BE (lim_states == NULL, 0))
 970        {
 971          ret = REG_ESPACE;
 972          goto free_return;
 973        }
 974      while (1)
 975        {
 976          memset (lim_states, '\0',
 977                  sizeof (re_dfastate_t *) * (match_last + 1));
 978          sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 979                         match_last);
 980          ret = sift_states_backward (mctx, &sctx);
 981          re_node_set_free (&sctx.limits);
 982          if (BE (ret != REG_NOERROR, 0))
 983              goto free_return;
 984          if (sifted_states[0] != NULL || lim_states[0] != NULL)
 985            break;
 986          do
 987            {
 988              --match_last;
 989              if (match_last < 0)
 990                {
 991                  ret = REG_NOMATCH;
 992                  goto free_return;
 993                }
 994            } while (mctx->state_log[match_last] == NULL
 995                     || !mctx->state_log[match_last]->halt);
 996          halt_node = check_halt_state_context (mctx,
 997                                                mctx->state_log[match_last],
 998                                                match_last);
 999        }
1000      ret = merge_state_array (dfa, sifted_states, lim_states,
1001                               match_last + 1);
1002      re_free (lim_states);
1003      lim_states = NULL;
1004      if (BE (ret != REG_NOERROR, 0))
1005        goto free_return;
1006    }
1007  else
1008    {
1009      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1010      ret = sift_states_backward (mctx, &sctx);
1011      re_node_set_free (&sctx.limits);
1012      if (BE (ret != REG_NOERROR, 0))
1013        goto free_return;
1014      if (sifted_states[0] == NULL)
1015        {
1016          ret = REG_NOMATCH;
1017          goto free_return;
1018        }
1019    }
1020  re_free (mctx->state_log);
1021  mctx->state_log = sifted_states;
1022  sifted_states = NULL;
1023  mctx->last_node = halt_node;
1024  mctx->match_last = match_last;
1025  ret = REG_NOERROR;
1026 free_return:
1027  re_free (sifted_states);
1028  re_free (lim_states);
1029  return ret;
1030}
1031
1032/* Acquire an initial state and return it.
1033   We must select appropriate initial state depending on the context,
1034   since initial states may have constraints like "\<", "^", etc..  */
1035
1036static inline re_dfastate_t *
1037__attribute ((always_inline)) internal_function
1038acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1039                            int idx)
1040{
1041  const re_dfa_t *const dfa = mctx->dfa;
1042  if (dfa->init_state->has_constraint)
1043    {
1044      unsigned int context;
1045      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1046      if (IS_WORD_CONTEXT (context))
1047        return dfa->init_state_word;
1048      else if (IS_ORDINARY_CONTEXT (context))
1049        return dfa->init_state;
1050      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1051        return dfa->init_state_begbuf;
1052      else if (IS_NEWLINE_CONTEXT (context))
1053        return dfa->init_state_nl;
1054      else if (IS_BEGBUF_CONTEXT (context))
1055        {
1056          /* It is relatively rare case, then calculate on demand.  */
1057          return re_acquire_state_context (err, dfa,
1058                                           dfa->init_state->entrance_nodes,
1059                                           context);
1060        }
1061      else
1062        /* Must not happen?  */
1063        return dfa->init_state;
1064    }
1065  else
1066    return dfa->init_state;
1067}
1068
1069/* Check whether the regular expression match input string INPUT or not,
1070   and return the index where the matching end, return -1 if not match,
1071   or return -2 in case of an error.
1072   FL_LONGEST_MATCH means we want the POSIX longest matching.
1073   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1074   next place where we may want to try matching.
1075   Note that the matcher assume that the maching starts from the current
1076   index of the buffer.  */
1077
1078static int
1079internal_function
1080check_matching (re_match_context_t *mctx, int fl_longest_match,
1081                int *p_match_first)
1082{
1083  const re_dfa_t *const dfa = mctx->dfa;
1084  reg_errcode_t err;
1085  int match = 0;
1086  int match_last = -1;
1087  int cur_str_idx = re_string_cur_idx (&mctx->input);
1088  re_dfastate_t *cur_state;
1089  int at_init_state = p_match_first != NULL;
1090  int next_start_idx = cur_str_idx;
1091
1092  err = REG_NOERROR;
1093  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1094  /* An initial state must not be NULL (invalid).  */
1095  if (BE (cur_state == NULL, 0))
1096    {
1097      assert (err == REG_ESPACE);
1098      return -2;
1099    }
1100
1101  if (mctx->state_log != NULL)
1102    {
1103      mctx->state_log[cur_str_idx] = cur_state;
1104
1105      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1106         later.  E.g. Processing back references.  */
1107      if (BE (dfa->nbackref, 0))
1108        {
1109          at_init_state = 0;
1110          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1111          if (BE (err != REG_NOERROR, 0))
1112            return err;
1113
1114          if (cur_state->has_backref)
1115            {
1116              err = transit_state_bkref (mctx, &cur_state->nodes);
1117              if (BE (err != REG_NOERROR, 0))
1118                return err;
1119            }
1120        }
1121    }
1122
1123  /* If the RE accepts NULL string.  */
1124  if (BE (cur_state->halt, 0))
1125    {
1126      if (!cur_state->has_constraint
1127          || check_halt_state_context (mctx, cur_state, cur_str_idx))
1128        {
1129          if (!fl_longest_match)
1130            return cur_str_idx;
1131          else
1132            {
1133              match_last = cur_str_idx;
1134              match = 1;
1135            }
1136        }
1137    }
1138
1139  while (!re_string_eoi (&mctx->input))
1140    {
1141      re_dfastate_t *old_state = cur_state;
1142      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1143
1144      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1145          || (BE (next_char_idx >= mctx->input.valid_len, 0)
1146              && mctx->input.valid_len < mctx->input.len))
1147        {
1148          err = extend_buffers (mctx);
1149          if (BE (err != REG_NOERROR, 0))
1150            {
1151              assert (err == REG_ESPACE);
1152              return -2;
1153            }
1154        }
1155
1156      cur_state = transit_state (&err, mctx, cur_state);
1157      if (mctx->state_log != NULL)
1158        cur_state = merge_state_with_log (&err, mctx, cur_state);
1159
1160      if (cur_state == NULL)
1161        {
1162          /* Reached the invalid state or an error.  Try to recover a valid
1163             state using the state log, if available and if we have not
1164             already found a valid (even if not the longest) match.  */
1165          if (BE (err != REG_NOERROR, 0))
1166            return -2;
1167
1168          if (mctx->state_log == NULL
1169              || (match && !fl_longest_match)
1170              || (cur_state = find_recover_state (&err, mctx)) == NULL)
1171            break;
1172        }
1173
1174      if (BE (at_init_state, 0))
1175        {
1176          if (old_state == cur_state)
1177            next_start_idx = next_char_idx;
1178          else
1179            at_init_state = 0;
1180        }
1181
1182      if (cur_state->halt)
1183        {
1184          /* Reached a halt state.
1185             Check the halt state can satisfy the current context.  */
1186          if (!cur_state->has_constraint
1187              || check_halt_state_context (mctx, cur_state,
1188                                           re_string_cur_idx (&mctx->input)))
1189            {
1190              /* We found an appropriate halt state.  */
1191              match_last = re_string_cur_idx (&mctx->input);
1192              match = 1;
1193
1194              /* We found a match, do not modify match_first below.  */
1195              p_match_first = NULL;
1196              if (!fl_longest_match)
1197                break;
1198            }
1199        }
1200    }
1201
1202  if (p_match_first)
1203    *p_match_first += next_start_idx;
1204
1205  return match_last;
1206}
1207
1208/* Check NODE match the current context.  */
1209
1210static int
1211internal_function
1212check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1213{
1214  re_token_type_t type = dfa->nodes[node].type;
1215  unsigned int constraint = dfa->nodes[node].constraint;
1216  if (type != END_OF_RE)
1217    return 0;
1218  if (!constraint)
1219    return 1;
1220  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1221    return 0;
1222  return 1;
1223}
1224
1225/* Check the halt state STATE match the current context.
1226   Return 0 if not match, if the node, STATE has, is a halt node and
1227   match the context, return the node.  */
1228
1229static int
1230internal_function
1231check_halt_state_context (const re_match_context_t *mctx,
1232                          const re_dfastate_t *state, int idx)
1233{
1234  int i;
1235  unsigned int context;
1236#ifdef DEBUG
1237  assert (state->halt);
1238#endif
1239  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1240  for (i = 0; i < state->nodes.nelem; ++i)
1241    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1242      return state->nodes.elems[i];
1243  return 0;
1244}
1245
1246/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1247   corresponding to the DFA).
1248   Return the destination node, and update EPS_VIA_NODES, return -1 in case
1249   of errors.  */
1250
1251static int
1252internal_function
1253proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1254                   int *pidx, int node, re_node_set *eps_via_nodes,
1255                   struct re_fail_stack_t *fs)
1256{
1257  const re_dfa_t *const dfa = mctx->dfa;
1258  int i, err;
1259  if (IS_EPSILON_NODE (dfa->nodes[node].type))
1260    {
1261      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1262      re_node_set *edests = &dfa->edests[node];
1263      int dest_node;
1264      err = re_node_set_insert (eps_via_nodes, node);
1265      if (BE (err < 0, 0))
1266        return -2;
1267      /* Pick up a valid destination, or return -1 if none is found.  */
1268      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1269        {
1270          int candidate = edests->elems[i];
1271          if (!re_node_set_contains (cur_nodes, candidate))
1272            continue;
1273          if (dest_node == -1)
1274            dest_node = candidate;
1275
1276          else
1277            {
1278              /* In order to avoid infinite loop like "(a*)*", return the second
1279                 epsilon-transition if the first was already considered.  */
1280              if (re_node_set_contains (eps_via_nodes, dest_node))
1281                return candidate;
1282
1283              /* Otherwise, push the second epsilon-transition on the fail stack.  */
1284              else if (fs != NULL
1285                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1286                                           eps_via_nodes))
1287                return -2;
1288
1289              /* We know we are going to exit.  */
1290              break;
1291            }
1292        }
1293      return dest_node;
1294    }
1295  else
1296    {
1297      int naccepted = 0;
1298      re_token_type_t type = dfa->nodes[node].type;
1299
1300#ifdef RE_ENABLE_I18N
1301      if (dfa->nodes[node].accept_mb)
1302        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1303      else
1304#endif /* RE_ENABLE_I18N */
1305      if (type == OP_BACK_REF)
1306        {
1307          int subexp_idx = dfa->nodes[node].opr.idx + 1;
1308          naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1309          if (fs != NULL)
1310            {
1311              if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1312                return -1;
1313              else if (naccepted)
1314                {
1315                  char *buf = (char *) re_string_get_buffer (&mctx->input);
1316                  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1317                              naccepted) != 0)
1318                    return -1;
1319                }
1320            }
1321
1322          if (naccepted == 0)
1323            {
1324              int dest_node;
1325              err = re_node_set_insert (eps_via_nodes, node);
1326              if (BE (err < 0, 0))
1327                return -2;
1328              dest_node = dfa->edests[node].elems[0];
1329              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1330                                        dest_node))
1331                return dest_node;
1332            }
1333        }
1334
1335      if (naccepted != 0
1336          || check_node_accept (mctx, dfa->nodes + node, *pidx))
1337        {
1338          int dest_node = dfa->nexts[node];
1339          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1340          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1341                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1342                                               dest_node)))
1343            return -1;
1344          re_node_set_empty (eps_via_nodes);
1345          return dest_node;
1346        }
1347    }
1348  return -1;
1349}
1350
1351static reg_errcode_t
1352internal_function
1353push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1354                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1355{
1356  reg_errcode_t err;
1357  int num = fs->num++;
1358  if (fs->num == fs->alloc)
1359    {
1360      struct re_fail_stack_ent_t *new_array;
1361      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1362                                       * fs->alloc * 2));
1363      if (new_array == NULL)
1364        return REG_ESPACE;
1365      fs->alloc *= 2;
1366      fs->stack = new_array;
1367    }
1368  fs->stack[num].idx = str_idx;
1369  fs->stack[num].node = dest_node;
1370  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1371  if (fs->stack[num].regs == NULL)
1372    return REG_ESPACE;
1373  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1374  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1375  return err;
1376}
1377
1378static int
1379internal_function
1380pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1381                regmatch_t *regs, re_node_set *eps_via_nodes)
1382{
1383  int num = --fs->num;
1384  assert (num >= 0);
1385  *pidx = fs->stack[num].idx;
1386  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1387  re_node_set_free (eps_via_nodes);
1388  re_free (fs->stack[num].regs);
1389  *eps_via_nodes = fs->stack[num].eps_via_nodes;
1390  return fs->stack[num].node;
1391}
1392
1393/* Set the positions where the subexpressions are starts/ends to registers
1394   PMATCH.
1395   Note: We assume that pmatch[0] is already set, and
1396   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1397
1398static reg_errcode_t
1399internal_function
1400set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1401          regmatch_t *pmatch, int fl_backtrack)
1402{
1403  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1404  int idx, cur_node;
1405  re_node_set eps_via_nodes;
1406  struct re_fail_stack_t *fs;
1407  struct re_fail_stack_t fs_body = { 0, 2, NULL };
1408  regmatch_t *prev_idx_match;
1409  int prev_idx_match_malloced = 0;
1410
1411#ifdef DEBUG
1412  assert (nmatch > 1);
1413  assert (mctx->state_log != NULL);
1414#endif
1415  if (fl_backtrack)
1416    {
1417      fs = &fs_body;
1418      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1419      if (fs->stack == NULL)
1420        return REG_ESPACE;
1421    }
1422  else
1423    fs = NULL;
1424
1425  cur_node = dfa->init_node;
1426  re_node_set_init_empty (&eps_via_nodes);
1427
1428#ifdef HAVE_ALLOCA
1429  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1430    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1431  else
1432#endif
1433    {
1434      prev_idx_match = re_malloc (regmatch_t, nmatch);
1435      if (prev_idx_match == NULL)
1436        {
1437          free_fail_stack_return (fs);
1438          return REG_ESPACE;
1439        }
1440      prev_idx_match_malloced = 1;
1441    }
1442  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1443
1444  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1445    {
1446      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1447
1448      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1449        {
1450          int reg_idx;
1451          if (fs)
1452            {
1453              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1454                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1455                  break;
1456              if (reg_idx == nmatch)
1457                {
1458                  re_node_set_free (&eps_via_nodes);
1459                  if (prev_idx_match_malloced)
1460                    re_free (prev_idx_match);
1461                  return free_fail_stack_return (fs);
1462                }
1463              cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464                                         &eps_via_nodes);
1465            }
1466          else
1467            {
1468              re_node_set_free (&eps_via_nodes);
1469              if (prev_idx_match_malloced)
1470                re_free (prev_idx_match);
1471              return REG_NOERROR;
1472            }
1473        }
1474
1475      /* Proceed to next node.  */
1476      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1477                                    &eps_via_nodes, fs);
1478
1479      if (BE (cur_node < 0, 0))
1480        {
1481          if (BE (cur_node == -2, 0))
1482            {
1483              re_node_set_free (&eps_via_nodes);
1484              if (prev_idx_match_malloced)
1485                re_free (prev_idx_match);
1486              free_fail_stack_return (fs);
1487              return REG_ESPACE;
1488            }
1489          if (fs)
1490            cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1491                                       &eps_via_nodes);
1492          else
1493            {
1494              re_node_set_free (&eps_via_nodes);
1495              if (prev_idx_match_malloced)
1496                re_free (prev_idx_match);
1497              return REG_NOMATCH;
1498            }
1499        }
1500    }
1501  re_node_set_free (&eps_via_nodes);
1502  if (prev_idx_match_malloced)
1503    re_free (prev_idx_match);
1504  return free_fail_stack_return (fs);
1505}
1506
1507static reg_errcode_t
1508internal_function
1509free_fail_stack_return (struct re_fail_stack_t *fs)
1510{
1511  if (fs)
1512    {
1513      int fs_idx;
1514      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1515        {
1516          re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1517          re_free (fs->stack[fs_idx].regs);
1518        }
1519      re_free (fs->stack);
1520    }
1521  return REG_NOERROR;
1522}
1523
1524static void
1525internal_function
1526update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1527             regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1528{
1529  int type = dfa->nodes[cur_node].type;
1530  if (type == OP_OPEN_SUBEXP)
1531    {
1532      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1533
1534      /* We are at the first node of this sub expression.  */
1535      if (reg_num < nmatch)
1536        {
1537          pmatch[reg_num].rm_so = cur_idx;
1538          pmatch[reg_num].rm_eo = -1;
1539        }
1540    }
1541  else if (type == OP_CLOSE_SUBEXP)
1542    {
1543      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1544      if (reg_num < nmatch)
1545        {
1546          /* We are at the last node of this sub expression.  */
1547          if (pmatch[reg_num].rm_so < cur_idx)
1548            {
1549              pmatch[reg_num].rm_eo = cur_idx;
1550              /* This is a non-empty match or we are not inside an optional
1551                 subexpression.  Accept this right away.  */
1552              memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1553            }
1554          else
1555            {
1556              if (dfa->nodes[cur_node].opt_subexp
1557                  && prev_idx_match[reg_num].rm_so != -1)
1558                /* We transited through an empty match for an optional
1559                   subexpression, like (a?)*, and this is not the subexp's
1560                   first match.  Copy back the old content of the registers
1561                   so that matches of an inner subexpression are undone as
1562                   well, like in ((a?))*.  */
1563                memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1564              else
1565                /* We completed a subexpression, but it may be part of
1566                   an optional one, so do not update PREV_IDX_MATCH.  */
1567                pmatch[reg_num].rm_eo = cur_idx;
1568            }
1569        }
1570    }
1571}
1572
1573/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1574   and sift the nodes in each states according to the following rules.
1575   Updated state_log will be wrote to STATE_LOG.
1576
1577   Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1578     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1579        If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1580        the LAST_NODE, we throw away the node `a'.
1581     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1582        string `s' and transit to `b':
1583        i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1584           away the node `a'.
1585        ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1586            thrown away, we throw away the node `a'.
1587     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1588        i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1589           node `a'.
1590        ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1591            we throw away the node `a'.  */
1592
1593#define STATE_NODE_CONTAINS(state,node) \
1594  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1595
1596static reg_errcode_t
1597internal_function
1598sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1599{
1600  reg_errcode_t err;
1601  int null_cnt = 0;
1602  int str_idx = sctx->last_str_idx;
1603  re_node_set cur_dest;
1604
1605#ifdef DEBUG
1606  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1607#endif
1608
1609  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1610     transit to the last_node and the last_node itself.  */
1611  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1612  if (BE (err != REG_NOERROR, 0))
1613    return err;
1614  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615  if (BE (err != REG_NOERROR, 0))
1616    goto free_return;
1617
1618  /* Then check each states in the state_log.  */
1619  while (str_idx > 0)
1620    {
1621      /* Update counters.  */
1622      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1623      if (null_cnt > mctx->max_mb_elem_len)
1624        {
1625          memset (sctx->sifted_states, '\0',
1626                  sizeof (re_dfastate_t *) * str_idx);
1627          re_node_set_free (&cur_dest);
1628          return REG_NOERROR;
1629        }
1630      re_node_set_empty (&cur_dest);
1631      --str_idx;
1632
1633      if (mctx->state_log[str_idx])
1634        {
1635          err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1636          if (BE (err != REG_NOERROR, 0))
1637            goto free_return;
1638        }
1639
1640      /* Add all the nodes which satisfy the following conditions:
1641         - It can epsilon transit to a node in CUR_DEST.
1642         - It is in CUR_SRC.
1643         And update state_log.  */
1644      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1645      if (BE (err != REG_NOERROR, 0))
1646        goto free_return;
1647    }
1648  err = REG_NOERROR;
1649 free_return:
1650  re_node_set_free (&cur_dest);
1651  return err;
1652}
1653
1654static reg_errcode_t
1655internal_function
1656build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1657                     int str_idx, re_node_set *cur_dest)
1658{
1659  const re_dfa_t *const dfa = mctx->dfa;
1660  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1661  int i;
1662
1663  /* Then build the next sifted state.
1664     We build the next sifted state on `cur_dest', and update
1665     `sifted_states[str_idx]' with `cur_dest'.
1666     Note:
1667     `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1668     `cur_src' points the node_set of the old `state_log[str_idx]'
1669     (with the epsilon nodes pre-filtered out).  */
1670  for (i = 0; i < cur_src->nelem; i++)
1671    {
1672      int prev_node = cur_src->elems[i];
1673      int naccepted = 0;
1674      int ret;
1675
1676#ifdef DEBUG
1677      re_token_type_t type = dfa->nodes[prev_node].type;
1678      assert (!IS_EPSILON_NODE (type));
1679#endif
1680#ifdef RE_ENABLE_I18N
1681      /* If the node may accept `multi byte'.  */
1682      if (dfa->nodes[prev_node].accept_mb)
1683        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1684                                         str_idx, sctx->last_str_idx);
1685#endif /* RE_ENABLE_I18N */
1686
1687      /* We don't check backreferences here.
1688         See update_cur_sifted_state().  */
1689      if (!naccepted
1690          && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1691          && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1692                                  dfa->nexts[prev_node]))
1693        naccepted = 1;
1694
1695      if (naccepted == 0)
1696        continue;
1697
1698      if (sctx->limits.nelem)
1699        {
1700          int to_idx = str_idx + naccepted;
1701          if (check_dst_limits (mctx, &sctx->limits,
1702                                dfa->nexts[prev_node], to_idx,
1703                                prev_node, str_idx))
1704            continue;
1705        }
1706      ret = re_node_set_insert (cur_dest, prev_node);
1707      if (BE (ret == -1, 0))
1708        return REG_ESPACE;
1709    }
1710
1711  return REG_NOERROR;
1712}
1713
1714/* Helper functions.  */
1715
1716static reg_errcode_t
1717internal_function
1718clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1719{
1720  int top = mctx->state_log_top;
1721
1722  if (next_state_log_idx >= mctx->input.bufs_len
1723      || (next_state_log_idx >= mctx->input.valid_len
1724          && mctx->input.valid_len < mctx->input.len))
1725    {
1726      reg_errcode_t err;
1727      err = extend_buffers (mctx);
1728      if (BE (err != REG_NOERROR, 0))
1729        return err;
1730    }
1731
1732  if (top < next_state_log_idx)
1733    {
1734      memset (mctx->state_log + top + 1, '\0',
1735              sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1736      mctx->state_log_top = next_state_log_idx;
1737    }
1738  return REG_NOERROR;
1739}
1740
1741static reg_errcode_t
1742internal_function
1743merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1744                   re_dfastate_t **src, int num)
1745{
1746  int st_idx;
1747  reg_errcode_t err;
1748  for (st_idx = 0; st_idx < num; ++st_idx)
1749    {
1750      if (dst[st_idx] == NULL)
1751        dst[st_idx] = src[st_idx];
1752      else if (src[st_idx] != NULL)
1753        {
1754          re_node_set merged_set;
1755          err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1756                                        &src[st_idx]->nodes);
1757          if (BE (err != REG_NOERROR, 0))
1758            return err;
1759          dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1760          re_node_set_free (&merged_set);
1761          if (BE (err != REG_NOERROR, 0))
1762            return err;
1763        }
1764    }
1765  return REG_NOERROR;
1766}
1767
1768static reg_errcode_t
1769internal_function
1770update_cur_sifted_state (const re_match_context_t *mctx,
1771                         re_sift_context_t *sctx, int str_idx,
1772                         re_node_set *dest_nodes)
1773{
1774  const re_dfa_t *const dfa = mctx->dfa;
1775  reg_errcode_t err = REG_NOERROR;
1776  const re_node_set *candidates;
1777  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1778                : &mctx->state_log[str_idx]->nodes);
1779
1780  if (dest_nodes->nelem == 0)
1781    sctx->sifted_states[str_idx] = NULL;
1782  else
1783    {
1784      if (candidates)
1785        {
1786          /* At first, add the nodes which can epsilon transit to a node in
1787             DEST_NODE.  */
1788          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1789          if (BE (err != REG_NOERROR, 0))
1790            return err;
1791
1792          /* Then, check the limitations in the current sift_context.  */
1793          if (sctx->limits.nelem)
1794            {
1795              err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1796                                         mctx->bkref_ents, str_idx);
1797              if (BE (err != REG_NOERROR, 0))
1798                return err;
1799            }
1800        }
1801
1802      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1803      if (BE (err != REG_NOERROR, 0))
1804        return err;
1805    }
1806
1807  if (candidates && mctx->state_log[str_idx]->has_backref)
1808    {
1809      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1810      if (BE (err != REG_NOERROR, 0))
1811        return err;
1812    }
1813  return REG_NOERROR;
1814}
1815
1816static reg_errcode_t
1817internal_function
1818add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1819                       const re_node_set *candidates)
1820{
1821  reg_errcode_t err = REG_NOERROR;
1822  int i;
1823
1824  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1825  if (BE (err != REG_NOERROR, 0))
1826    return err;
1827
1828  if (!state->inveclosure.alloc)
1829    {
1830      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1831      if (BE (err != REG_NOERROR, 0))
1832        return REG_ESPACE;
1833      for (i = 0; i < dest_nodes->nelem; i++)
1834        {
1835          err = re_node_set_merge (&state->inveclosure,
1836                                   dfa->inveclosures + dest_nodes->elems[i]);
1837          if (BE (err != REG_NOERROR, 0))
1838            return REG_ESPACE;
1839        }
1840    }
1841  return re_node_set_add_intersect (dest_nodes, candidates,
1842                                    &state->inveclosure);
1843}
1844
1845static reg_errcode_t
1846internal_function
1847sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1848                       const re_node_set *candidates)
1849{
1850    int ecl_idx;
1851    reg_errcode_t err;
1852    re_node_set *inv_eclosure = dfa->inveclosures + node;
1853    re_node_set except_nodes;
1854    re_node_set_init_empty (&except_nodes);
1855    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1856      {
1857        int cur_node = inv_eclosure->elems[ecl_idx];
1858        if (cur_node == node)
1859          continue;
1860        if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1861          {
1862            int edst1 = dfa->edests[cur_node].elems[0];
1863            int edst2 = ((dfa->edests[cur_node].nelem > 1)
1864                         ? dfa->edests[cur_node].elems[1] : -1);
1865            if ((!re_node_set_contains (inv_eclosure, edst1)
1866                 && re_node_set_contains (dest_nodes, edst1))
1867                || (edst2 > 0
1868                    && !re_node_set_contains (inv_eclosure, edst2)
1869                    && re_node_set_contains (dest_nodes, edst2)))
1870              {
1871                err = re_node_set_add_intersect (&except_nodes, candidates,
1872                                                 dfa->inveclosures + cur_node);
1873                if (BE (err != REG_NOERROR, 0))
1874                  {
1875                    re_node_set_free (&except_nodes);
1876                    return err;
1877                  }
1878              }
1879          }
1880      }
1881    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1882      {
1883        int cur_node = inv_eclosure->elems[ecl_idx];
1884        if (!re_node_set_contains (&except_nodes, cur_node))
1885          {
1886            int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1887            re_node_set_remove_at (dest_nodes, idx);
1888          }
1889      }
1890    re_node_set_free (&except_nodes);
1891    return REG_NOERROR;
1892}
1893
1894static int
1895internal_function
1896check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1897                  int dst_node, int dst_idx, int src_node, int src_idx)
1898{
1899  const re_dfa_t *const dfa = mctx->dfa;
1900  int lim_idx, src_pos, dst_pos;
1901
1902  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1903  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1904  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1905    {
1906      int subexp_idx;
1907      struct re_backref_cache_entry *ent;
1908      ent = mctx->bkref_ents + limits->elems[lim_idx];
1909      subexp_idx = dfa->nodes[ent->node].opr.idx;
1910
1911      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1912                                           subexp_idx, dst_node, dst_idx,
1913                                           dst_bkref_idx);
1914      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1915                                           subexp_idx, src_node, src_idx,
1916                                           src_bkref_idx);
1917
1918      /* In case of:
1919         <src> <dst> ( <subexp> )
1920         ( <subexp> ) <src> <dst>
1921         ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1922      if (src_pos == dst_pos)
1923        continue; /* This is unrelated limitation.  */
1924      else
1925        return 1;
1926    }
1927  return 0;
1928}
1929
1930static int
1931internal_function
1932check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1933                             int subexp_idx, int from_node, int bkref_idx)
1934{
1935  const re_dfa_t *const dfa = mctx->dfa;
1936  const re_node_set *eclosures = dfa->eclosures + from_node;
1937  int node_idx;
1938
1939  /* Else, we are on the boundary: examine the nodes on the epsilon
1940     closure.  */
1941  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1942    {
1943      int node = eclosures->elems[node_idx];
1944      switch (dfa->nodes[node].type)
1945        {
1946        case OP_BACK_REF:
1947          if (bkref_idx != -1)
1948            {
1949              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1950              do
1951                {
1952                  int dst, cpos;
1953
1954                  if (ent->node != node)
1955                    continue;
1956
1957                  if (subexp_idx < BITSET_WORD_BITS
1958                      && !(ent->eps_reachable_subexps_map
1959                           & ((bitset_word_t) 1 << subexp_idx)))
1960                    continue;
1961
1962                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1963                     OP_CLOSE_SUBEXP cases below.  But, if the
1964                     destination node is the same node as the source
1965                     node, don't recurse because it would cause an
1966                     infinite loop: a regex that exhibits this behavior
1967                     is ()\1*\1*  */
1968                  dst = dfa->edests[node].elems[0];
1969                  if (dst == from_node)
1970                    {
1971                      if (boundaries & 1)
1972                        return -1;
1973                      else /* if (boundaries & 2) */
1974                        return 0;
1975                    }
1976
1977                  cpos =
1978                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1979                                                 dst, bkref_idx);
1980                  if (cpos == -1 /* && (boundaries & 1) */)
1981                    return -1;
1982                  if (cpos == 0 && (boundaries & 2))
1983                    return 0;
1984
1985                  if (subexp_idx < BITSET_WORD_BITS)
1986                    ent->eps_reachable_subexps_map
1987                      &= ~((bitset_word_t) 1 << subexp_idx);
1988                }
1989              while (ent++->more);
1990            }
1991          break;
1992
1993        case OP_OPEN_SUBEXP:
1994          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1995            return -1;
1996          break;
1997
1998        case OP_CLOSE_SUBEXP:
1999          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2000            return 0;
2001          break;
2002
2003        default:
2004            break;
2005        }
2006    }
2007
2008  return (boundaries & 2) ? 1 : 0;
2009}
2010
2011static int
2012internal_function
2013check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2014                           int subexp_idx, int from_node, int str_idx,
2015                           int bkref_idx)
2016{
2017  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2018  int boundaries;
2019
2020  /* If we are outside the range of the subexpression, return -1 or 1.  */
2021  if (str_idx < lim->subexp_from)
2022    return -1;
2023
2024  if (lim->subexp_to < str_idx)
2025    return 1;
2026
2027  /* If we are within the subexpression, return 0.  */
2028  boundaries = (str_idx == lim->subexp_from);
2029  boundaries |= (str_idx == lim->subexp_to) << 1;
2030  if (boundaries == 0)
2031    return 0;
2032
2033  /* Else, examine epsilon closure.  */
2034  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2035                                      from_node, bkref_idx);
2036}
2037
2038/* Check the limitations of sub expressions LIMITS, and remove the nodes
2039   which are against limitations from DEST_NODES. */
2040
2041static reg_errcode_t
2042internal_function
2043check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2044                     const re_node_set *candidates, re_node_set *limits,
2045                     struct re_backref_cache_entry *bkref_ents, int str_idx)
2046{
2047  reg_errcode_t err;
2048  int node_idx, lim_idx;
2049
2050  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2051    {
2052      int subexp_idx;
2053      struct re_backref_cache_entry *ent;
2054      ent = bkref_ents + limits->elems[lim_idx];
2055
2056      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2057        continue; /* This is unrelated limitation.  */
2058
2059      subexp_idx = dfa->nodes[ent->node].opr.idx;
2060      if (ent->subexp_to == str_idx)
2061        {
2062          int ops_node = -1;
2063          int cls_node = -1;
2064          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2065            {
2066              int node = dest_nodes->elems[node_idx];
2067              re_token_type_t type = dfa->nodes[node].type;
2068              if (type == OP_OPEN_SUBEXP
2069                  && subexp_idx == dfa->nodes[node].opr.idx)
2070                ops_node = node;
2071              else if (type == OP_CLOSE_SUBEXP
2072                       && subexp_idx == dfa->nodes[node].opr.idx)
2073                cls_node = node;
2074            }
2075
2076          /* Check the limitation of the open subexpression.  */
2077          /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2078          if (ops_node >= 0)
2079            {
2080              err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2081                                           candidates);
2082              if (BE (err != REG_NOERROR, 0))
2083                return err;
2084            }
2085
2086          /* Check the limitation of the close subexpression.  */
2087          if (cls_node >= 0)
2088            for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2089              {
2090                int node = dest_nodes->elems[node_idx];
2091                if (!re_node_set_contains (dfa->inveclosures + node,
2092                                           cls_node)
2093                    && !re_node_set_contains (dfa->eclosures + node,
2094                                              cls_node))
2095                  {
2096                    /* It is against this limitation.
2097                       Remove it form the current sifted state.  */
2098                    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2099                                                 candidates);
2100                    if (BE (err != REG_NOERROR, 0))
2101                      return err;
2102                    --node_idx;
2103                  }
2104              }
2105        }
2106      else /* (ent->subexp_to != str_idx)  */
2107        {
2108          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2109            {
2110              int node = dest_nodes->elems[node_idx];
2111              re_token_type_t type = dfa->nodes[node].type;
2112              if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2113                {
2114                  if (subexp_idx != dfa->nodes[node].opr.idx)
2115                    continue;
2116                  /* It is against this limitation.
2117                     Remove it form the current sifted state.  */
2118                  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2119                                               candidates);
2120                  if (BE (err != REG_NOERROR, 0))
2121                    return err;
2122                }
2123            }
2124        }
2125    }
2126  return REG_NOERROR;
2127}
2128
2129static reg_errcode_t
2130internal_function
2131sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2132                   int str_idx, const re_node_set *candidates)
2133{
2134  const re_dfa_t *const dfa = mctx->dfa;
2135  reg_errcode_t err;
2136  int node_idx, node;
2137  re_sift_context_t local_sctx;
2138  int first_idx = search_cur_bkref_entry (mctx, str_idx);
2139
2140  if (first_idx == -1)
2141    return REG_NOERROR;
2142
2143  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2144
2145  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2146    {
2147      int enabled_idx;
2148      re_token_type_t type;
2149      struct re_backref_cache_entry *entry;
2150      node = candidates->elems[node_idx];
2151      type = dfa->nodes[node].type;
2152      /* Avoid infinite loop for the REs like "()\1+".  */
2153      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2154        continue;
2155      if (type != OP_BACK_REF)
2156        continue;
2157
2158      entry = mctx->bkref_ents + first_idx;
2159      enabled_idx = first_idx;
2160      do
2161        {
2162          int subexp_len;
2163          int to_idx;
2164          int dst_node;
2165          int ret;
2166          re_dfastate_t *cur_state;
2167
2168          if (entry->node != node)
2169            continue;
2170          subexp_len = entry->subexp_to - entry->subexp_from;
2171          to_idx = str_idx + subexp_len;
2172          dst_node = (subexp_len ? dfa->nexts[node]
2173                      : dfa->edests[node].elems[0]);
2174
2175          if (to_idx > sctx->last_str_idx
2176              || sctx->sifted_states[to_idx] == NULL
2177              || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2178              || check_dst_limits (mctx, &sctx->limits, node,
2179                                   str_idx, dst_node, to_idx))
2180            continue;
2181
2182          if (local_sctx.sifted_states == NULL)
2183            {
2184              local_sctx = *sctx;
2185              err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2186              if (BE (err != REG_NOERROR, 0))
2187                goto free_return;
2188            }
2189          local_sctx.last_node = node;
2190          local_sctx.last_str_idx = str_idx;
2191          ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2192          if (BE (ret < 0, 0))
2193            {
2194              err = REG_ESPACE;
2195              goto free_return;
2196            }
2197          cur_state = local_sctx.sifted_states[str_idx];
2198          err = sift_states_backward (mctx, &local_sctx);
2199          if (BE (err != REG_NOERROR, 0))
2200            goto free_return;
2201          if (sctx->limited_states != NULL)
2202            {
2203              err = merge_state_array (dfa, sctx->limited_states,
2204                                       local_sctx.sifted_states,
2205                                       str_idx + 1);
2206              if (BE (err != REG_NOERROR, 0))
2207                goto free_return;
2208            }
2209          local_sctx.sifted_states[str_idx] = cur_state;
2210          re_node_set_remove (&local_sctx.limits, enabled_idx);
2211
2212          /* mctx->bkref_ents may have changed, reload the pointer.  */
2213          entry = mctx->bkref_ents + enabled_idx;
2214        }
2215      while (enabled_idx++, entry++->more);
2216    }
2217  err = REG_NOERROR;
2218 free_return:
2219  if (local_sctx.sifted_states != NULL)
2220    {
2221      re_node_set_free (&local_sctx.limits);
2222    }
2223
2224  return err;
2225}
2226
2227
2228#ifdef RE_ENABLE_I18N
2229static int
2230internal_function
2231sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2232                     int node_idx, int str_idx, int max_str_idx)
2233{
2234  const re_dfa_t *const dfa = mctx->dfa;
2235  int naccepted;
2236  /* Check the node can accept `multi byte'.  */
2237  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2238  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2239      !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2240                            dfa->nexts[node_idx]))
2241    /* The node can't accept the `multi byte', or the
2242       destination was already thrown away, then the node
2243       could't accept the current input `multi byte'.   */
2244    naccepted = 0;
2245  /* Otherwise, it is sure that the node could accept
2246     `naccepted' bytes input.  */
2247  return naccepted;
2248}
2249#endif /* RE_ENABLE_I18N */
2250
2251\f
2252/* Functions for state transition.  */
2253
2254/* Return the next state to which the current state STATE will transit by
2255   accepting the current input byte, and update STATE_LOG if necessary.
2256   If STATE can accept a multibyte char/collating element/back reference
2257   update the destination of STATE_LOG.  */
2258
2259static re_dfastate_t *
2260internal_function
2261transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2262               re_dfastate_t *state)
2263{
2264  re_dfastate_t **trtable;
2265  unsigned char ch;
2266
2267#ifdef RE_ENABLE_I18N
2268  /* If the current state can accept multibyte.  */
2269  if (BE (state->accept_mb, 0))
2270    {
2271      *err = transit_state_mb (mctx, state);
2272      if (BE (*err != REG_NOERROR, 0))
2273        return NULL;
2274    }
2275#endif /* RE_ENABLE_I18N */
2276
2277  /* Then decide the next state with the single byte.  */
2278#if 0
2279  if (0)
2280    /* don't use transition table  */
2281    return transit_state_sb (err, mctx, state);
2282#endif
2283
2284  /* Use transition table  */
2285  ch = re_string_fetch_byte (&mctx->input);
2286  for (;;)
2287    {
2288      trtable = state->trtable;
2289      if (BE (trtable != NULL, 1))
2290        return trtable[ch];
2291
2292      trtable = state->word_trtable;
2293      if (BE (trtable != NULL, 1))
2294        {
2295          unsigned int context;
2296          context
2297            = re_string_context_at (&mctx->input,
2298                                    re_string_cur_idx (&mctx->input) - 1,
2299                                    mctx->eflags);
2300          if (IS_WORD_CONTEXT (context))
2301            return trtable[ch + SBC_MAX];
2302          else
2303            return trtable[ch];
2304        }
2305
2306      if (!build_trtable (mctx->dfa, state))
2307        {
2308          *err = REG_ESPACE;
2309          return NULL;
2310        }
2311
2312      /* Retry, we now have a transition table.  */
2313    }
2314}
2315
2316/* Update the state_log if we need */
2317re_dfastate_t *
2318internal_function
2319merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2320                      re_dfastate_t *next_state)
2321{
2322  const re_dfa_t *const dfa = mctx->dfa;
2323  int cur_idx = re_string_cur_idx (&mctx->input);
2324
2325  if (cur_idx > mctx->state_log_top)
2326    {
2327      mctx->state_log[cur_idx] = next_state;
2328      mctx->state_log_top = cur_idx;
2329    }
2330  else if (mctx->state_log[cur_idx] == 0)
2331    {
2332      mctx->state_log[cur_idx] = next_state;
2333    }
2334  else
2335    {
2336      re_dfastate_t *pstate;
2337      unsigned int context;
2338      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2339      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2340         the destination of a multibyte char/collating element/
2341         back reference.  Then the next state is the union set of
2342         these destinations and the results of the transition table.  */
2343      pstate = mctx->state_log[cur_idx];
2344      log_nodes = pstate->entrance_nodes;
2345      if (next_state != NULL)
2346        {
2347          table_nodes = next_state->entrance_nodes;
2348          *err = re_node_set_init_union (&next_nodes, table_nodes,
2349                                             log_nodes);
2350          if (BE (*err != REG_NOERROR, 0))
2351            return NULL;
2352        }
2353      else
2354        next_nodes = *log_nodes;
2355      /* Note: We already add the nodes of the initial state,
2356         then we don't need to add them here.  */
2357
2358      context = re_string_context_at (&mctx->input,
2359                                      re_string_cur_idx (&mctx->input) - 1,
2360                                      mctx->eflags);
2361      next_state = mctx->state_log[cur_idx]
2362        = re_acquire_state_context (err, dfa, &next_nodes, context);
2363      /* We don't need to check errors here, since the return value of
2364         this function is next_state and ERR is already set.  */
2365
2366      if (table_nodes != NULL)
2367        re_node_set_free (&next_nodes);
2368    }
2369
2370  if (BE (dfa->nbackref, 0) && next_state != NULL)
2371    {
2372      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2373         later.  We must check them here, since the back references in the
2374         next state might use them.  */
2375      *err = check_subexp_matching_top (mctx, &next_state->nodes,
2376                                        cur_idx);
2377      if (BE (*err != REG_NOERROR, 0))
2378        return NULL;
2379
2380      /* If the next state has back references.  */
2381      if (next_state->has_backref)
2382        {
2383          *err = transit_state_bkref (mctx, &next_state->nodes);
2384          if (BE (*err != REG_NOERROR, 0))
2385            return NULL;
2386          next_state = mctx->state_log[cur_idx];
2387        }
2388    }
2389
2390  return next_state;
2391}
2392
2393/* Skip bytes in the input that correspond to part of a
2394   multi-byte match, then look in the log for a state
2395   from which to restart matching.  */
2396re_dfastate_t *
2397internal_function
2398find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2399{
2400  re_dfastate_t *cur_state;
2401  do
2402    {
2403      int max = mctx->state_log_top;
2404      int cur_str_idx = re_string_cur_idx (&mctx->input);
2405
2406      do
2407        {
2408          if (++cur_str_idx > max)
2409            return NULL;
2410          re_string_skip_bytes (&mctx->input, 1);
2411        }
2412      while (mctx->state_log[cur_str_idx] == NULL);
2413
2414      cur_state = merge_state_with_log (err, mctx, NULL);
2415    }
2416  while (*err == REG_NOERROR && cur_state == NULL);
2417  return cur_state;
2418}
2419
2420/* Helper functions for transit_state.  */
2421
2422/* From the node set CUR_NODES, pick up the nodes whose types are
2423   OP_OPEN_SUBEXP and which have corresponding back references in the regular
2424   expression. And register them to use them later for evaluating the
2425   correspoding back references.  */
2426
2427static reg_errcode_t
2428internal_function
2429check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2430                           int str_idx)
2431{
2432  const re_dfa_t *const dfa = mctx->dfa;
2433  int node_idx;
2434  reg_errcode_t err;
2435
2436  /* TODO: This isn't efficient.
2437           Because there might be more than one nodes whose types are
2438           OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2439           nodes.
2440           E.g. RE: (a){2}  */
2441  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2442    {
2443      int node = cur_nodes->elems[node_idx];
2444      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2445          && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2446          && (dfa->used_bkref_map
2447              & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2448        {
2449          err = match_ctx_add_subtop (mctx, node, str_idx);
2450          if (BE (err != REG_NOERROR, 0))
2451            return err;
2452        }
2453    }
2454  return REG_NOERROR;
2455}
2456
2457#if 0
2458/* Return the next state to which the current state STATE will transit by
2459   accepting the current input byte.  */
2460
2461static re_dfastate_t *
2462transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2463                  re_dfastate_t *state)
2464{
2465  const re_dfa_t *const dfa = mctx->dfa;
2466  re_node_set next_nodes;
2467  re_dfastate_t *next_state;
2468  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2469  unsigned int context;
2470
2471  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2472  if (BE (*err != REG_NOERROR, 0))
2473    return NULL;
2474  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2475    {
2476      int cur_node = state->nodes.elems[node_cnt];
2477      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2478        {
2479          *err = re_node_set_merge (&next_nodes,
2480                                    dfa->eclosures + dfa->nexts[cur_node]);
2481          if (BE (*err != REG_NOERROR, 0))
2482            {
2483              re_node_set_free (&next_nodes);
2484              return NULL;
2485            }
2486        }
2487    }
2488  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2489  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2490  /* We don't need to check errors here, since the return value of
2491     this function is next_state and ERR is already set.  */
2492
2493  re_node_set_free (&next_nodes);
2494  re_string_skip_bytes (&mctx->input, 1);
2495  return next_state;
2496}
2497#endif
2498
2499#ifdef RE_ENABLE_I18N
2500static reg_errcode_t
2501internal_function
2502transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2503{
2504  const re_dfa_t *const dfa = mctx->dfa;
2505  reg_errcode_t err;
2506  int i;
2507
2508  for (i = 0; i < pstate->nodes.nelem; ++i)
2509    {
2510      re_node_set dest_nodes, *new_nodes;
2511      int cur_node_idx = pstate->nodes.elems[i];
2512      int naccepted, dest_idx;
2513      unsigned int context;
2514      re_dfastate_t *dest_state;
2515
2516      if (!dfa->nodes[cur_node_idx].accept_mb)
2517        continue;
2518
2519      if (dfa->nodes[cur_node_idx].constraint)
2520        {
2521          context = re_string_context_at (&mctx->input,
2522                                          re_string_cur_idx (&mctx->input),
2523                                          mctx->eflags);
2524          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2525                                           context))
2526            continue;
2527        }
2528
2529      /* How many bytes the node can accept?  */
2530      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2531                                           re_string_cur_idx (&mctx->input));
2532      if (naccepted == 0)
2533        continue;
2534
2535      /* The node can accepts `naccepted' bytes.  */
2536      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2537      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2538                               : mctx->max_mb_elem_len);
2539      err = clean_state_log_if_needed (mctx, dest_idx);
2540      if (BE (err != REG_NOERROR, 0))
2541        return err;
2542#ifdef DEBUG
2543      assert (dfa->nexts[cur_node_idx] != -1);
2544#endif
2545      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2546
2547      dest_state = mctx->state_log[dest_idx];
2548      if (dest_state == NULL)
2549        dest_nodes = *new_nodes;
2550      else
2551        {
2552          err = re_node_set_init_union (&dest_nodes,
2553                                        dest_state->entrance_nodes, new_nodes);
2554          if (BE (err != REG_NOERROR, 0))
2555            return err;
2556        }
2557      context = re_string_context_at (&mctx->input, dest_idx - 1,
2558                                      mctx->eflags);
2559      mctx->state_log[dest_idx]
2560        = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2561      if (dest_state != NULL)
2562        re_node_set_free (&dest_nodes);
2563      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2564        return err;
2565    }
2566  return REG_NOERROR;
2567}
2568#endif /* RE_ENABLE_I18N */
2569
2570static reg_errcode_t
2571internal_function
2572transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2573{
2574  const re_dfa_t *const dfa = mctx->dfa;
2575  reg_errcode_t err;
2576  int i;
2577  int cur_str_idx = re_string_cur_idx (&mctx->input);
2578
2579  for (i = 0; i < nodes->nelem; ++i)
2580    {
2581      int dest_str_idx, prev_nelem, bkc_idx;
2582      int node_idx = nodes->elems[i];
2583      unsigned int context;
2584      const re_token_t *node = dfa->nodes + node_idx;
2585      re_node_set *new_dest_nodes;
2586
2587      /* Check whether `node' is a backreference or not.  */
2588      if (node->type != OP_BACK_REF)
2589        continue;
2590
2591      if (node->constraint)
2592        {
2593          context = re_string_context_at (&mctx->input, cur_str_idx,
2594                                          mctx->eflags);
2595          if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2596            continue;
2597        }
2598
2599      /* `node' is a backreference.
2600         Check the substring which the substring matched.  */
2601      bkc_idx = mctx->nbkref_ents;
2602      err = get_subexp (mctx, node_idx, cur_str_idx);
2603      if (BE (err != REG_NOERROR, 0))
2604        goto free_return;
2605
2606      /* And add the epsilon closures (which is `new_dest_nodes') of
2607         the backreference to appropriate state_log.  */
2608#ifdef DEBUG
2609      assert (dfa->nexts[node_idx] != -1);
2610#endif
2611      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2612        {
2613          int subexp_len;
2614          re_dfastate_t *dest_state;
2615          struct re_backref_cache_entry *bkref_ent;
2616          bkref_ent = mctx->bkref_ents + bkc_idx;
2617          if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2618            continue;
2619          subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2620          new_dest_nodes = (subexp_len == 0
2621                            ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2622                            : dfa->eclosures + dfa->nexts[node_idx]);
2623          dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2624                          - bkref_ent->subexp_from);
2625          context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2626                                          mctx->eflags);
2627          dest_state = mctx->state_log[dest_str_idx];
2628          prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2629                        : mctx->state_log[cur_str_idx]->nodes.nelem);
2630          /* Add `new_dest_node' to state_log.  */
2631          if (dest_state == NULL)
2632            {
2633              mctx->state_log[dest_str_idx]
2634                = re_acquire_state_context (&err, dfa, new_dest_nodes,
2635                                            context);
2636              if (BE (mctx->state_log[dest_str_idx] == NULL
2637                      && err != REG_NOERROR, 0))
2638                goto free_return;
2639            }
2640          else
2641            {
2642              re_node_set dest_nodes;
2643              err = re_node_set_init_union (&dest_nodes,
2644                                            dest_state->entrance_nodes,
2645                                            new_dest_nodes);
2646              if (BE (err != REG_NOERROR, 0))
2647                {
2648                  re_node_set_free (&dest_nodes);
2649                  goto free_return;
2650                }
2651              mctx->state_log[dest_str_idx]
2652                = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2653              re_node_set_free (&dest_nodes);
2654              if (BE (mctx->state_log[dest_str_idx] == NULL
2655                      && err != REG_NOERROR, 0))
2656                goto free_return;
2657            }
2658          /* We need to check recursively if the backreference can epsilon
2659             transit.  */
2660          if (subexp_len == 0
2661              && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2662            {
2663              err = check_subexp_matching_top (mctx, new_dest_nodes,
2664                                               cur_str_idx);
2665              if (BE (err != REG_NOERROR, 0))
2666                goto free_return;
2667              err = transit_state_bkref (mctx, new_dest_nodes);
2668              if (BE (err != REG_NOERROR, 0))
2669                goto free_return;
2670            }
2671        }
2672    }
2673  err = REG_NOERROR;
2674 free_return:
2675  return err;
2676}
2677
2678/* Enumerate all the candidates which the backreference BKREF_NODE can match
2679   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2680   Note that we might collect inappropriate candidates here.
2681   However, the cost of checking them strictly here is too high, then we
2682   delay these checking for prune_impossible_nodes().  */
2683
2684static reg_errcode_t
2685internal_function
2686get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2687{
2688  const re_dfa_t *const dfa = mctx->dfa;
2689  int subexp_num, sub_top_idx;
2690  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2691  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2692  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2693  if (cache_idx != -1)
2694    {
2695      const struct re_backref_cache_entry *entry
2696        = mctx->bkref_ents + cache_idx;
2697      do
2698        if (entry->node == bkref_node)
2699          return REG_NOERROR; /* We already checked it.  */
2700      while (entry++->more);
2701    }
2702
2703  subexp_num = dfa->nodes[bkref_node].opr.idx;
2704
2705  /* For each sub expression  */
2706  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2707    {
2708      reg_errcode_t err;
2709      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2710      re_sub_match_last_t *sub_last;
2711      int sub_last_idx, sl_str, bkref_str_off;
2712
2713      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2714        continue; /* It isn't related.  */
2715
2716      sl_str = sub_top->str_idx;
2717      bkref_str_off = bkref_str_idx;
2718      /* At first, check the last node of sub expressions we already
2719         evaluated.  */
2720      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2721        {
2722          int sl_str_diff;
2723          sub_last = sub_top->lasts[sub_last_idx];
2724          sl_str_diff = sub_last->str_idx - sl_str;
2725          /* The matched string by the sub expression match with the substring
2726             at the back reference?  */
2727          if (sl_str_diff > 0)
2728            {
2729              if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2730                {
2731                  /* Not enough chars for a successful match.  */
2732                  if (bkref_str_off + sl_str_diff > mctx->input.len)
2733                    break;
2734
2735                  err = clean_state_log_if_needed (mctx,
2736                                                   bkref_str_off
2737                                                   + sl_str_diff);
2738                  if (BE (err != REG_NOERROR, 0))
2739                    return err;
2740                  buf = (const char *) re_string_get_buffer (&mctx->input);
2741                }
2742              if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2743                /* We don't need to search this sub expression any more.  */
2744                break;
2745            }
2746          bkref_str_off += sl_str_diff;
2747          sl_str += sl_str_diff;
2748          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2749                                bkref_str_idx);
2750
2751          /* Reload buf, since the preceding call might have reallocated
2752             the buffer.  */
2753          buf = (const char *) re_string_get_buffer (&mctx->input);
2754
2755          if (err == REG_NOMATCH)
2756            continue;
2757          if (BE (err != REG_NOERROR, 0))
2758            return err;
2759        }
2760
2761      if (sub_last_idx < sub_top->nlasts)
2762        continue;
2763      if (sub_last_idx > 0)
2764        ++sl_str;
2765      /* Then, search for the other last nodes of the sub expression.  */
2766      for (; sl_str <= bkref_str_idx; ++sl_str)
2767        {
2768          int cls_node, sl_str_off;
2769          const re_node_set *nodes;
2770          sl_str_off = sl_str - sub_top->str_idx;
2771          /* The matched string by the sub expression match with the substring
2772             at the back reference?  */
2773          if (sl_str_off > 0)
2774            {
2775              if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2776                {
2777                  /* If we are at the end of the input, we cannot match.  */
2778                  if (bkref_str_off >= mctx->input.len)
2779                    break;
2780
2781                  err = extend_buffers (mctx);
2782                  if (BE (err != REG_NOERROR, 0))
2783                    return err;
2784
2785                  buf = (const char *) re_string_get_buffer (&mctx->input);
2786                }
2787              if (buf [bkref_str_off++] != buf[sl_str - 1])
2788                break; /* We don't need to search this sub expression
2789                          any more.  */
2790            }
2791          if (mctx->state_log[sl_str] == NULL)
2792            continue;
2793          /* Does this state have a ')' of the sub expression?  */
2794          nodes = &mctx->state_log[sl_str]->nodes;
2795          cls_node = find_subexp_node (dfa, nodes, subexp_num,
2796                                       OP_CLOSE_SUBEXP);
2797          if (cls_node == -1)
2798            continue; /* No.  */
2799          if (sub_top->path == NULL)
2800            {
2801              sub_top->path = calloc (sizeof (state_array_t),
2802                                      sl_str - sub_top->str_idx + 1);
2803              if (sub_top->path == NULL)
2804                return REG_ESPACE;
2805            }
2806          /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2807             in the current context?  */
2808          err = check_arrival (mctx, sub_top->path, sub_top->node,
2809                               sub_top->str_idx, cls_node, sl_str,
2810                               OP_CLOSE_SUBEXP);
2811          if (err == REG_NOMATCH)
2812              continue;
2813          if (BE (err != REG_NOERROR, 0))
2814              return err;
2815          sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2816          if (BE (sub_last == NULL, 0))
2817            return REG_ESPACE;
2818          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2819                                bkref_str_idx);
2820          if (err == REG_NOMATCH)
2821            continue;
2822        }
2823    }
2824  return REG_NOERROR;
2825}
2826
2827/* Helper functions for get_subexp().  */
2828
2829/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2830   If it can arrive, register the sub expression expressed with SUB_TOP
2831   and SUB_LAST.  */
2832
2833static reg_errcode_t
2834internal_function
2835get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2836                re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2837{
2838  reg_errcode_t err;
2839  int to_idx;
2840  /* Can the subexpression arrive the back reference?  */
2841  err = check_arrival (mctx, &sub_last->path, sub_last->node,
2842                       sub_last->str_idx, bkref_node, bkref_str,
2843                       OP_OPEN_SUBEXP);
2844  if (err != REG_NOERROR)
2845    return err;
2846  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2847                             sub_last->str_idx);
2848  if (BE (err != REG_NOERROR, 0))
2849    return err;
2850  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2851  return clean_state_log_if_needed (mctx, to_idx);
2852}
2853
2854/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2855   Search '(' if FL_OPEN, or search ')' otherwise.
2856   TODO: This function isn't efficient...
2857         Because there might be more than one nodes whose types are
2858         OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2859         nodes.
2860         E.g. RE: (a){2}  */
2861
2862static int
2863internal_function
2864find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2865                  int subexp_idx, int type)
2866{
2867  int cls_idx;
2868  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2869    {
2870      int cls_node = nodes->elems[cls_idx];
2871      const re_token_t *node = dfa->nodes + cls_node;
2872      if (node->type == type
2873          && node->opr.idx == subexp_idx)
2874        return cls_node;
2875    }
2876  return -1;
2877}
2878
2879/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2880   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2881   heavily reused.
2882   Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2883
2884static reg_errcode_t
2885internal_function
2886check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2887               int top_str, int last_node, int last_str, int type)
2888{
2889  const re_dfa_t *const dfa = mctx->dfa;
2890  reg_errcode_t err = REG_NOERROR;
2891  int subexp_num, backup_cur_idx, str_idx, null_cnt;
2892  re_dfastate_t *cur_state = NULL;
2893  re_node_set *cur_nodes, next_nodes;
2894  re_dfastate_t **backup_state_log;
2895  unsigned int context;
2896
2897  subexp_num = dfa->nodes[top_node].opr.idx;
2898  /* Extend the buffer if we need.  */
2899  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2900    {
2901      re_dfastate_t **new_array;
2902      int old_alloc = path->alloc;
2903      path->alloc += last_str + mctx->max_mb_elem_len + 1;
2904      new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2905      if (BE (new_array == NULL, 0))
2906        {
2907          path->alloc = old_alloc;
2908          return REG_ESPACE;
2909        }
2910      path->array = new_array;
2911      memset (new_array + old_alloc, '\0',
2912              sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2913    }
2914
2915  str_idx = path->next_idx ? path->next_idx : top_str;
2916
2917  /* Temporary modify MCTX.  */
2918  backup_state_log = mctx->state_log;
2919  backup_cur_idx = mctx->input.cur_idx;
2920  mctx->state_log = path->array;
2921  mctx->input.cur_idx = str_idx;
2922
2923  /* Setup initial node set.  */
2924  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2925  if (str_idx == top_str)
2926    {
2927      err = re_node_set_init_1 (&next_nodes, top_node);
2928      if (BE (err != REG_NOERROR, 0))
2929        return err;
2930      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2931      if (BE (err != REG_NOERROR, 0))
2932        {
2933          re_node_set_free (&next_nodes);
2934          return err;
2935        }
2936    }
2937  else
2938    {
2939      cur_state = mctx->state_log[str_idx];
2940      if (cur_state && cur_state->has_backref)
2941        {
2942          err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2943          if (BE (err != REG_NOERROR, 0))
2944            return err;
2945        }
2946      else
2947        re_node_set_init_empty (&next_nodes);
2948    }
2949  if (str_idx == top_str || (cur_state && cur_state->has_backref))
2950    {
2951      if (next_nodes.nelem)
2952        {
2953          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2954                                    subexp_num, type);
2955          if (BE (err != REG_NOERROR, 0))
2956            {
2957              re_node_set_free (&next_nodes);
2958              return err;
2959            }
2960        }
2961      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2962      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2963        {
2964          re_node_set_free (&next_nodes);
2965          return err;
2966        }
2967      mctx->state_log[str_idx] = cur_state;
2968    }
2969
2970  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2971    {
2972      re_node_set_empty (&next_nodes);
2973      if (mctx->state_log[str_idx + 1])
2974        {
2975          err = re_node_set_merge (&next_nodes,
2976                                   &mctx->state_log[str_idx + 1]->nodes);
2977          if (BE (err != REG_NOERROR, 0))
2978            {
2979              re_node_set_free (&next_nodes);
2980              return err;
2981            }
2982        }
2983      if (cur_state)
2984        {
2985          err = check_arrival_add_next_nodes (mctx, str_idx,
2986                                              &cur_state->non_eps_nodes,
2987                                              &next_nodes);
2988          if (BE (err != REG_NOERROR, 0))
2989            {
2990              re_node_set_free (&next_nodes);
2991              return err;
2992            }
2993        }
2994      ++str_idx;
2995      if (next_nodes.nelem)
2996        {
2997          err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2998          if (BE (err != REG_NOERROR, 0))
2999            {
3000              re_node_set_free (&next_nodes);
3001              return err;
3002            }
3003          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3004                                    subexp_num, type);
3005          if (BE (err != REG_NOERROR, 0))
3006            {
3007              re_node_set_free (&next_nodes);
3008              return err;
3009            }
3010        }
3011      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3012      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3013      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3014        {
3015          re_node_set_free (&next_nodes);
3016          return err;
3017        }
3018      mctx->state_log[str_idx] = cur_state;
3019      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3020    }
3021  re_node_set_free (&next_nodes);
3022  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3023               : &mctx->state_log[last_str]->nodes);
3024  path->next_idx = str_idx;
3025
3026  /* Fix MCTX.  */
3027  mctx->state_log = backup_state_log;
3028  mctx->input.cur_idx = backup_cur_idx;
3029
3030  /* Then check the current node set has the node LAST_NODE.  */
3031  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3032    return REG_NOERROR;
3033
3034  return REG_NOMATCH;
3035}
3036
3037/* Helper functions for check_arrival.  */
3038
3039/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3040   to NEXT_NODES.
3041   TODO: This function is similar to the functions transit_state*(),
3042         however this function has many additional works.
3043         Can't we unify them?  */
3044
3045static reg_errcode_t
3046internal_function
3047check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3048                              re_node_set *cur_nodes, re_node_set *next_nodes)
3049{
3050  const re_dfa_t *const dfa = mctx->dfa;
3051  int result;
3052  int cur_idx;
3053#ifdef RE_ENABLE_I18N
3054  reg_errcode_t err = REG_NOERROR;
3055#endif
3056  re_node_set union_set;
3057  re_node_set_init_empty (&union_set);
3058  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3059    {
3060      int naccepted = 0;
3061      int cur_node = cur_nodes->elems[cur_idx];
3062#ifdef DEBUG
3063      re_token_type_t type = dfa->nodes[cur_node].type;
3064      assert (!IS_EPSILON_NODE (type));
3065#endif
3066#ifdef RE_ENABLE_I18N
3067      /* If the node may accept `multi byte'.  */
3068      if (dfa->nodes[cur_node].accept_mb)
3069        {
3070          naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3071                                               str_idx);
3072          if (naccepted > 1)
3073            {
3074              re_dfastate_t *dest_state;
3075              int next_node = dfa->nexts[cur_node];
3076              int next_idx = str_idx + naccepted;
3077              dest_state = mctx->state_log[next_idx];
3078              re_node_set_empty (&union_set);
3079              if (dest_state)
3080                {
3081                  err = re_node_set_merge (&union_set, &dest_state->nodes);
3082                  if (BE (err != REG_NOERROR, 0))
3083                    {
3084                      re_node_set_free (&union_set);
3085                      return err;
3086                    }
3087                }
3088              result = re_node_set_insert (&union_set, next_node);
3089              if (BE (result < 0, 0))
3090                {
3091                  re_node_set_free (&union_set);
3092                  return REG_ESPACE;
3093                }
3094              mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3095                                                            &union_set);
3096              if (BE (mctx->state_log[next_idx] == NULL
3097                      && err != REG_NOERROR, 0))
3098                {
3099                  re_node_set_free (&union_set);
3100                  return err;
3101                }
3102            }
3103        }
3104#endif /* RE_ENABLE_I18N */
3105      if (naccepted
3106          || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3107        {
3108          result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3109          if (BE (result < 0, 0))
3110            {
3111              re_node_set_free (&union_set);
3112              return REG_ESPACE;
3113            }
3114        }
3115    }
3116  re_node_set_free (&union_set);
3117  return REG_NOERROR;
3118}
3119
3120/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3121   CUR_NODES, however exclude the nodes which are:
3122    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3123    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3124*/
3125
3126static reg_errcode_t
3127internal_function
3128check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3129                          int ex_subexp, int type)
3130{
3131  reg_errcode_t err;
3132  int idx, outside_node;
3133  re_node_set new_nodes;
3134#ifdef DEBUG
3135  assert (cur_nodes->nelem);
3136#endif
3137  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3138  if (BE (err != REG_NOERROR, 0))
3139    return err;
3140  /* Create a new node set NEW_NODES with the nodes which are epsilon
3141     closures of the node in CUR_NODES.  */
3142
3143  for (idx = 0; idx < cur_nodes->nelem; ++idx)
3144    {
3145      int cur_node = cur_nodes->elems[idx];
3146      const re_node_set *eclosure = dfa->eclosures + cur_node;
3147      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3148      if (outside_node == -1)
3149        {
3150          /* There are no problematic nodes, just merge them.  */
3151          err = re_node_set_merge (&new_nodes, eclosure);
3152          if (BE (err != REG_NOERROR, 0))
3153            {
3154              re_node_set_free (&new_nodes);
3155              return err;
3156            }
3157        }
3158      else
3159        {
3160          /* There are problematic nodes, re-calculate incrementally.  */
3161          err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3162                                              ex_subexp, type);
3163          if (BE (err != REG_NOERROR, 0))
3164            {
3165              re_node_set_free (&new_nodes);
3166              return err;
3167            }
3168        }
3169    }
3170  re_node_set_free (cur_nodes);
3171  *cur_nodes = new_nodes;
3172  return REG_NOERROR;
3173}
3174
3175/* Helper function for check_arrival_expand_ecl.
3176   Check incrementally the epsilon closure of TARGET, and if it isn't
3177   problematic append it to DST_NODES.  */
3178
3179static reg_errcode_t
3180internal_function
3181check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3182                              int target, int ex_subexp, int type)
3183{
3184  int cur_node;
3185  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3186    {
3187      int err;
3188
3189      if (dfa->nodes[cur_node].type == type
3190          && dfa->nodes[cur_node].opr.idx == ex_subexp)
3191        {
3192          if (type == OP_CLOSE_SUBEXP)
3193            {
3194              err = re_node_set_insert (dst_nodes, cur_node);
3195              if (BE (err == -1, 0))
3196                return REG_ESPACE;
3197            }
3198          break;
3199        }
3200      err = re_node_set_insert (dst_nodes, cur_node);
3201      if (BE (err == -1, 0))
3202        return REG_ESPACE;
3203      if (dfa->edests[cur_node].nelem == 0)
3204        break;
3205      if (dfa->edests[cur_node].nelem == 2)
3206        {
3207          err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3208                                              dfa->edests[cur_node].elems[1],
3209                                              ex_subexp, type);
3210          if (BE (err != REG_NOERROR, 0))
3211            return err;
3212        }
3213      cur_node = dfa->edests[cur_node].elems[0];
3214    }
3215  return REG_NOERROR;
3216}
3217
3218
3219/* For all the back references in the current state, calculate the
3220   destination of the back references by the appropriate entry
3221   in MCTX->BKREF_ENTS.  */
3222
3223static reg_errcode_t
3224internal_function
3225expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3226                    int cur_str, int subexp_num, int type)
3227{
3228  const re_dfa_t *const dfa = mctx->dfa;
3229  reg_errcode_t err;
3230  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3231  struct re_backref_cache_entry *ent;
3232
3233  if (cache_idx_start == -1)
3234    return REG_NOERROR;
3235
3236 restart:
3237  ent = mctx->bkref_ents + cache_idx_start;
3238  do
3239    {
3240      int to_idx, next_node;
3241
3242      /* Is this entry ENT is appropriate?  */
3243      if (!re_node_set_contains (cur_nodes, ent->node))
3244        continue; /* No.  */
3245
3246      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3247      /* Calculate the destination of the back reference, and append it
3248         to MCTX->STATE_LOG.  */
3249      if (to_idx == cur_str)
3250        {
3251          /* The backreference did epsilon transit, we must re-check all the
3252             node in the current state.  */
3253          re_node_set new_dests;
3254          reg_errcode_t err2, err3;
3255          next_node = dfa->edests[ent->node].elems[0];
3256          if (re_node_set_contains (cur_nodes, next_node))
3257            continue;
3258          err = re_node_set_init_1 (&new_dests, next_node);
3259          err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3260          err3 = re_node_set_merge (cur_nodes, &new_dests);
3261          re_node_set_free (&new_dests);
3262          if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3263                  || err3 != REG_NOERROR, 0))
3264            {
3265              err = (err != REG_NOERROR ? err
3266                     : (err2 != REG_NOERROR ? err2 : err3));
3267              return err;
3268            }
3269          /* TODO: It is still inefficient...  */
3270          goto restart;
3271        }
3272      else
3273        {
3274          re_node_set union_set;
3275          next_node = dfa->nexts[ent->node];
3276          if (mctx->state_log[to_idx])
3277            {
3278              int ret;
3279              if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3280                                        next_node))
3281                continue;
3282              err = re_node_set_init_copy (&union_set,
3283                                           &mctx->state_log[to_idx]->nodes);
3284              ret = re_node_set_insert (&union_set, next_node);
3285              if (BE (err != REG_NOERROR || ret < 0, 0))
3286                {
3287                  re_node_set_free (&union_set);
3288                  err = err != REG_NOERROR ? err : REG_ESPACE;
3289                  return err;
3290                }
3291            }
3292          else
3293            {
3294              err = re_node_set_init_1 (&union_set, next_node);
3295              if (BE (err != REG_NOERROR, 0))
3296                return err;
3297            }
3298          mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3299          re_node_set_free (&union_set);
3300          if (BE (mctx->state_log[to_idx] == NULL
3301                  && err != REG_NOERROR, 0))
3302            return err;
3303        }
3304    }
3305  while (ent++->more);
3306  return REG_NOERROR;
3307}
3308
3309/* Build transition table for the state.
3310   Return 1 if succeeded, otherwise return NULL.  */
3311
3312static int
3313internal_function
3314build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3315{
3316  reg_errcode_t err;
3317  int i, j, ch, need_word_trtable = 0;
3318  bitset_word_t elem, mask;
3319  bool dests_node_malloced = false;
3320  bool dest_states_malloced = false;
3321  int ndests; /* Number of the destination states from `state'.  */
3322  re_dfastate_t **trtable;
3323  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3324  re_node_set follows, *dests_node;
3325  bitset_t *dests_ch;
3326  bitset_t acceptable;
3327
3328  struct dests_alloc
3329  {
3330    re_node_set dests_node[SBC_MAX];
3331    bitset_t dests_ch[SBC_MAX];
3332  } *dests_alloc;
3333
3334  /* We build DFA states which corresponds to the destination nodes
3335     from `state'.  `dests_node[i]' represents the nodes which i-th
3336     destination state contains, and `dests_ch[i]' represents the
3337     characters which i-th destination state accepts.  */
3338#ifdef HAVE_ALLOCA
3339  if (__libc_use_alloca (sizeof (struct dests_alloc)))
3340    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3341  else
3342#endif
3343    {
3344      dests_alloc = re_malloc (struct dests_alloc, 1);
3345      if (BE (dests_alloc == NULL, 0))
3346        return 0;
3347      dests_node_malloced = true;
3348    }
3349  dests_node = dests_alloc->dests_node;
3350  dests_ch = dests_alloc->dests_ch;
3351
3352  /* Initialize transiton table.  */
3353  state->word_trtable = state->trtable = NULL;
3354
3355  /* At first, group all nodes belonging to `state' into several
3356     destinations.  */
3357  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3358  if (BE (ndests <= 0, 0))
3359    {
3360      if (dests_node_malloced)
3361        free (dests_alloc);
3362      /* Return 0 in case of an error, 1 otherwise.  */
3363      if (ndests == 0)
3364        {
3365          state->trtable = (re_dfastate_t **)
3366            calloc (sizeof (re_dfastate_t *), SBC_MAX);
3367          return 1;
3368        }
3369      return 0;
3370    }
3371
3372  err = re_node_set_alloc (&follows, ndests + 1);
3373  if (BE (err != REG_NOERROR, 0))
3374    goto out_free;
3375
3376  /* Avoid arithmetic overflow in size calculation.  */
3377  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3378            / (3 * sizeof (re_dfastate_t *)))
3379           < ndests),
3380          0))
3381    goto out_free;
3382
3383#ifdef HAVE_ALLOCA
3384  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3385                         + ndests * 3 * sizeof (re_dfastate_t *)))
3386    dest_states = (re_dfastate_t **)
3387      alloca (ndests * 3 * sizeof (re_dfastate_t *));
3388  else
3389#endif
3390    {
3391      dest_states = (re_dfastate_t **)
3392        malloc (ndests * 3 * sizeof (re_dfastate_t *));
3393      if (BE (dest_states == NULL, 0))
3394        {
3395out_free:
3396          if (dest_states_malloced)
3397            free (dest_states);
3398          re_node_set_free (&follows);
3399          for (i = 0; i < ndests; ++i)
3400            re_node_set_free (dests_node + i);
3401          if (dests_node_malloced)
3402            free (dests_alloc);
3403          return 0;
3404        }
3405      dest_states_malloced = true;
3406    }
3407  dest_states_word = dest_states + ndests;
3408  dest_states_nl = dest_states_word + ndests;
3409  bitset_empty (acceptable);
3410
3411  /* Then build the states for all destinations.  */
3412  for (i = 0; i < ndests; ++i)
3413    {
3414      int next_node;
3415      re_node_set_empty (&follows);
3416      /* Merge the follows of this destination states.  */
3417      for (j = 0; j < dests_node[i].nelem; ++j)
3418        {
3419          next_node = dfa->nexts[dests_node[i].elems[j]];
3420          if (next_node != -1)
3421            {
3422              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3423              if (BE (err != REG_NOERROR, 0))
3424                goto out_free;
3425            }
3426        }
3427      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3428      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3429        goto out_free;
3430      /* If the new state has context constraint,
3431         build appropriate states for these contexts.  */
3432      if (dest_states[i]->has_constraint)
3433        {
3434          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3435                                                          CONTEXT_WORD);
3436          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3437            goto out_free;
3438
3439          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3440            need_word_trtable = 1;
3441
3442          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3443                                                        CONTEXT_NEWLINE);
3444          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3445            goto out_free;
3446        }
3447      else
3448        {
3449          dest_states_word[i] = dest_states[i];
3450          dest_states_nl[i] = dest_states[i];
3451        }
3452      bitset_merge (acceptable, dests_ch[i]);
3453    }
3454
3455  if (!BE (need_word_trtable, 0))
3456    {
3457      /* We don't care about whether the following character is a word
3458         character, or we are in a single-byte character set so we can
3459         discern by looking at the character code: allocate a
3460         256-entry transition table.  */
3461      trtable = state->trtable =
3462        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3463      if (BE (trtable == NULL, 0))
3464        goto out_free;
3465
3466      /* For all characters ch...:  */
3467      for (i = 0; i < BITSET_WORDS; ++i)
3468        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3469             elem;
3470             mask <<= 1, elem >>= 1, ++ch)
3471          if (BE (elem & 1, 0))
3472            {
3473              /* There must be exactly one destination which accepts
3474                 character ch.  See group_nodes_into_DFAstates.  */
3475              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3476                ;
3477
3478              /* j-th destination accepts the word character ch.  */
3479              if (dfa->word_char[i] & mask)
3480                trtable[ch] = dest_states_word[j];
3481              else
3482                trtable[ch] = dest_states[j];
3483            }
3484    }
3485  else
3486    {
3487      /* We care about whether the following character is a word
3488         character, and we are in a multi-byte character set: discern
3489         by looking at the character code: build two 256-entry
3490         transition tables, one starting at trtable[0] and one
3491         starting at trtable[SBC_MAX].  */
3492      trtable = state->word_trtable =
3493        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3494      if (BE (trtable == NULL, 0))
3495        goto out_free;
3496
3497      /* For all characters ch...:  */
3498      for (i = 0; i < BITSET_WORDS; ++i)
3499        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3500             elem;
3501             mask <<= 1, elem >>= 1, ++ch)
3502          if (BE (elem & 1, 0))
3503            {
3504              /* There must be exactly one destination which accepts
3505                 character ch.  See group_nodes_into_DFAstates.  */
3506              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3507                ;
3508
3509              /* j-th destination accepts the word character ch.  */
3510              trtable[ch] = dest_states[j];
3511              trtable[ch + SBC_MAX] = dest_states_word[j];
3512            }
3513    }
3514
3515  /* new line */
3516  if (bitset_contain (acceptable, NEWLINE_CHAR))
3517    {
3518      /* The current state accepts newline character.  */
3519      for (j = 0; j < ndests; ++j)
3520        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3521          {
3522            /* k-th destination accepts newline character.  */
3523            trtable[NEWLINE_CHAR] = dest_states_nl[j];
3524            if (need_word_trtable)
3525              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3526            /* There must be only one destination which accepts
3527               newline.  See group_nodes_into_DFAstates.  */
3528            break;
3529          }
3530    }
3531
3532  if (dest_states_malloced)
3533    free (dest_states);
3534
3535  re_node_set_free (&follows);
3536  for (i = 0; i < ndests; ++i)
3537    re_node_set_free (dests_node + i);
3538
3539  if (dests_node_malloced)
3540    free (dests_alloc);
3541
3542  return 1;
3543}
3544
3545/* Group all nodes belonging to STATE into several destinations.
3546   Then for all destinations, set the nodes belonging to the destination
3547   to DESTS_NODE[i] and set the characters accepted by the destination
3548   to DEST_CH[i].  This function return the number of destinations.  */
3549
3550static int
3551internal_function
3552group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3553                            re_node_set *dests_node, bitset_t *dests_ch)
3554{
3555  reg_errcode_t err;
3556  int result;
3557  int i, j, k;
3558  int ndests; /* Number of the destinations from `state'.  */
3559  bitset_t accepts; /* Characters a node can accept.  */
3560  const re_node_set *cur_nodes = &state->nodes;
3561  bitset_empty (accepts);
3562  ndests = 0;
3563
3564  /* For all the nodes belonging to `state',  */
3565  for (i = 0; i < cur_nodes->nelem; ++i)
3566    {
3567      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3568      re_token_type_t type = node->type;
3569      unsigned int constraint = node->constraint;
3570
3571      /* Enumerate all single byte character this node can accept.  */
3572      if (type == CHARACTER)
3573        bitset_set (accepts, node->opr.c);
3574      else if (type == SIMPLE_BRACKET)
3575        {
3576          bitset_merge (accepts, node->opr.sbcset);
3577        }
3578      else if (type == OP_PERIOD)
3579        {
3580#ifdef RE_ENABLE_I18N
3581          if (dfa->mb_cur_max > 1)
3582            bitset_merge (accepts, dfa->sb_char);
3583          else
3584#endif
3585            bitset_set_all (accepts);
3586          if (!(dfa->syntax & RE_DOT_NEWLINE))
3587            bitset_clear (accepts, '\n');
3588          if (dfa->syntax & RE_DOT_NOT_NULL)
3589            bitset_clear (accepts, '\0');
3590        }
3591#ifdef RE_ENABLE_I18N
3592      else if (type == OP_UTF8_PERIOD)
3593        {
3594          memset (accepts, '\xff', sizeof (bitset_t) / 2);
3595          if (!(dfa->syntax & RE_DOT_NEWLINE))
3596            bitset_clear (accepts, '\n');
3597          if (dfa->syntax & RE_DOT_NOT_NULL)
3598            bitset_clear (accepts, '\0');
3599        }
3600#endif
3601      else
3602        continue;
3603
3604      /* Check the `accepts' and sift the characters which are not
3605         match it the context.  */
3606      if (constraint)
3607        {
3608          if (constraint & NEXT_NEWLINE_CONSTRAINT)
3609            {
3610              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3611              bitset_empty (accepts);
3612              if (accepts_newline)
3613                bitset_set (accepts, NEWLINE_CHAR);
3614              else
3615                continue;
3616            }
3617          if (constraint & NEXT_ENDBUF_CONSTRAINT)
3618            {
3619              bitset_empty (accepts);
3620              continue;
3621            }
3622
3623          if (constraint & NEXT_WORD_CONSTRAINT)
3624            {
3625              bitset_word_t any_set = 0;
3626              if (type == CHARACTER && !node->word_char)
3627                {
3628                  bitset_empty (accepts);
3629                  continue;
3630                }
3631#ifdef RE_ENABLE_I18N
3632              if (dfa->mb_cur_max > 1)
3633                for (j = 0; j < BITSET_WORDS; ++j)
3634                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3635              else
3636#endif
3637                for (j = 0; j < BITSET_WORDS; ++j)
3638                  any_set |= (accepts[j] &= dfa->word_char[j]);
3639              if (!any_set)
3640                continue;
3641            }
3642          if (constraint & NEXT_NOTWORD_CONSTRAINT)
3643            {
3644              bitset_word_t any_set = 0;
3645              if (type == CHARACTER && node->word_char)
3646                {
3647                  bitset_empty (accepts);
3648                  continue;
3649                }
3650#ifdef RE_ENABLE_I18N
3651              if (dfa->mb_cur_max > 1)
3652                for (j = 0; j < BITSET_WORDS; ++j)
3653                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3654              else
3655#endif
3656                for (j = 0; j < BITSET_WORDS; ++j)
3657                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3658              if (!any_set)
3659                continue;
3660            }
3661        }
3662
3663      /* Then divide `accepts' into DFA states, or create a new
3664         state.  Above, we make sure that accepts is not empty.  */
3665      for (j = 0; j < ndests; ++j)
3666        {
3667          bitset_t intersec; /* Intersection sets, see below.  */
3668          bitset_t remains;
3669          /* Flags, see below.  */
3670          bitset_word_t has_intersec, not_subset, not_consumed;
3671
3672          /* Optimization, skip if this state doesn't accept the character.  */
3673          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3674            continue;
3675
3676          /* Enumerate the intersection set of this state and `accepts'.  */
3677          has_intersec = 0;
3678          for (k = 0; k < BITSET_WORDS; ++k)
3679            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3680          /* And skip if the intersection set is empty.  */
3681          if (!has_intersec)
3682            continue;
3683
3684          /* Then check if this state is a subset of `accepts'.  */
3685          not_subset = not_consumed = 0;
3686          for (k = 0; k < BITSET_WORDS; ++k)
3687            {
3688              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3689              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3690            }
3691
3692          /* If this state isn't a subset of `accepts', create a
3693             new group state, which has the `remains'. */
3694          if (not_subset)
3695            {
3696              bitset_copy (dests_ch[ndests], remains);
3697              bitset_copy (dests_ch[j], intersec);
3698              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3699              if (BE (err != REG_NOERROR, 0))
3700                goto error_return;
3701              ++ndests;
3702            }
3703
3704          /* Put the position in the current group. */
3705          result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3706          if (BE (result < 0, 0))
3707            goto error_return;
3708
3709          /* If all characters are consumed, go to next node. */
3710          if (!not_consumed)
3711            break;
3712        }
3713      /* Some characters remain, create a new group. */
3714      if (j == ndests)
3715        {
3716          bitset_copy (dests_ch[ndests], accepts);
3717          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3718          if (BE (err != REG_NOERROR, 0))
3719            goto error_return;
3720          ++ndests;
3721          bitset_empty (accepts);
3722        }
3723    }
3724  return ndests;
3725 error_return:
3726  for (j = 0; j < ndests; ++j)
3727    re_node_set_free (dests_node + j);
3728  return -1;
3729}
3730
3731#ifdef RE_ENABLE_I18N
3732/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3733   Return the number of the bytes the node accepts.
3734   STR_IDX is the current index of the input string.
3735
3736   This function handles the nodes which can accept one character, or
3737   one collating element like '.', '[a-z]', opposite to the other nodes
3738   can only accept one byte.  */
3739
3740static int
3741internal_function
3742check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3743                         const re_string_t *input, int str_idx)
3744{
3745  const re_token_t *node = dfa->nodes + node_idx;
3746  int char_len, elem_len;
3747  int i;
3748  wint_t wc;
3749
3750  if (BE (node->type == OP_UTF8_PERIOD, 0))
3751    {
3752      unsigned char c = re_string_byte_at (input, str_idx), d;
3753      if (BE (c < 0xc2, 1))
3754        return 0;
3755
3756      if (str_idx + 2 > input->len)
3757        return 0;
3758
3759      d = re_string_byte_at (input, str_idx + 1);
3760      if (c < 0xe0)
3761        return (d < 0x80 || d > 0xbf) ? 0 : 2;
3762      else if (c < 0xf0)
3763        {
3764          char_len = 3;
3765          if (c == 0xe0 && d < 0xa0)
3766            return 0;
3767        }
3768      else if (c < 0xf8)
3769        {
3770          char_len = 4;
3771          if (c == 0xf0 && d < 0x90)
3772            return 0;
3773        }
3774      else if (c < 0xfc)
3775        {
3776          char_len = 5;
3777          if (c == 0xf8 && d < 0x88)
3778            return 0;
3779        }
3780      else if (c < 0xfe)
3781        {
3782          char_len = 6;
3783          if (c == 0xfc && d < 0x84)
3784            return 0;
3785        }
3786      else
3787        return 0;
3788
3789      if (str_idx + char_len > input->len)
3790        return 0;
3791
3792      for (i = 1; i < char_len; ++i)
3793        {
3794          d = re_string_byte_at (input, str_idx + i);
3795          if (d < 0x80 || d > 0xbf)
3796            return 0;
3797        }
3798      return char_len;
3799    }
3800
3801  char_len = re_string_char_size_at (input, str_idx);
3802  if (node->type == OP_PERIOD)
3803    {
3804      if (char_len <= 1)
3805        return 0;
3806      /* FIXME: I don't think this if is needed, as both '\n'
3807         and '\0' are char_len == 1.  */
3808      /* '.' accepts any one character except the following two cases.  */
3809      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3810           re_string_byte_at (input, str_idx) == '\n') ||
3811          ((dfa->syntax & RE_DOT_NOT_NULL) &&
3812           re_string_byte_at (input, str_idx) == '\0'))
3813        return 0;
3814      return char_len;
3815    }
3816
3817  elem_len = re_string_elem_size_at (input, str_idx);
3818  wc = __btowc(*(input->mbs+str_idx));
3819  if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3820    return 0;
3821
3822  if (node->type == COMPLEX_BRACKET)
3823    {
3824      const re_charset_t *cset = node->opr.mbcset;
3825# ifdef _LIBC
3826      const unsigned char *pin
3827        = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3828      int j;
3829      uint32_t nrules;
3830# endif /* _LIBC */
3831      int match_len = 0;
3832      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3833                    ? re_string_wchar_at (input, str_idx) : 0);
3834
3835      /* match with multibyte character?  */
3836      for (i = 0; i < cset->nmbchars; ++i)
3837        if (wc == cset->mbchars[i])
3838          {
3839            match_len = char_len;
3840            goto check_node_accept_bytes_match;
3841          }
3842      /* match with character_class?  */
3843      for (i = 0; i < cset->nchar_classes; ++i)
3844        {
3845          wctype_t wt = cset->char_classes[i];
3846          if (__iswctype (wc, wt))
3847            {
3848              match_len = char_len;
3849              goto check_node_accept_bytes_match;
3850            }
3851        }
3852
3853# ifdef _LIBC
3854      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3855      if (nrules != 0)
3856        {
3857          unsigned int in_collseq = 0;
3858          const int32_t *table, *indirect;
3859          const unsigned char *weights, *extra;
3860          const char *collseqwc;
3861          /* This #include defines a local function!  */
3862#  include <locale/weight.h>
3863
3864          /* match with collating_symbol?  */
3865          if (cset->ncoll_syms)
3866            extra = (const unsigned char *)
3867              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3868          for (i = 0; i < cset->ncoll_syms; ++i)
3869            {
3870              const unsigned char *coll_sym = extra + cset->coll_syms[i];
3871              /* Compare the length of input collating element and
3872                 the length of current collating element.  */
3873              if (*coll_sym != elem_len)
3874                continue;
3875              /* Compare each bytes.  */
3876              for (j = 0; j < *coll_sym; j++)
3877                if (pin[j] != coll_sym[1 + j])
3878                  break;
3879              if (j == *coll_sym)
3880                {
3881                  /* Match if every bytes is equal.  */
3882                  match_len = j;
3883                  goto check_node_accept_bytes_match;
3884                }
3885            }
3886
3887          if (cset->nranges)
3888            {
3889              if (elem_len <= char_len)
3890                {
3891                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3892                  in_collseq = __collseq_table_lookup (collseqwc, wc);
3893                }
3894              else
3895                in_collseq = find_collation_sequence_value (pin, elem_len);
3896            }
3897          /* match with range expression?  */
3898          for (i = 0; i < cset->nranges; ++i)
3899            if (cset->range_starts[i] <= in_collseq
3900                && in_collseq <= cset->range_ends[i])
3901              {
3902                match_len = elem_len;
3903                goto check_node_accept_bytes_match;
3904              }
3905
3906          /* match with equivalence_class?  */
3907          if (cset->nequiv_classes)
3908            {
3909              const unsigned char *cp = pin;
3910              table = (const int32_t *)
3911                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3912              weights = (const unsigned char *)
3913                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3914              extra = (const unsigned char *)
3915                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3916              indirect = (const int32_t *)
3917                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3918              int32_t idx = findidx (&cp);
3919              if (idx > 0)
3920                for (i = 0; i < cset->nequiv_classes; ++i)
3921                  {
3922                    int32_t equiv_class_idx = cset->equiv_classes[i];
3923                    size_t weight_len = weights[idx & 0xffffff];
3924                    if (weight_len == weights[equiv_class_idx & 0xffffff]
3925                        && (idx >> 24) == (equiv_class_idx >> 24))
3926                      {
3927                        int cnt = 0;
3928
3929                        idx &= 0xffffff;
3930                        equiv_class_idx &= 0xffffff;
3931
3932                        while (cnt <= weight_len
3933                               && (weights[equiv_class_idx + 1 + cnt]
3934                                   == weights[idx + 1 + cnt]))
3935                          ++cnt;
3936                        if (cnt > weight_len)
3937                          {
3938                            match_len = elem_len;
3939                            goto check_node_accept_bytes_match;
3940                          }
3941                      }
3942                  }
3943            }
3944        }
3945      else
3946# endif /* _LIBC */
3947        {
3948          /* match with range expression?  */
3949#if __GNUC__ >= 2
3950          wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3951#else
3952          wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3953          cmp_buf[2] = wc;
3954#endif
3955          for (i = 0; i < cset->nranges; ++i)
3956            {
3957              cmp_buf[0] = cset->range_starts[i];
3958              cmp_buf[4] = cset->range_ends[i];
3959              if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3960                  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3961                {
3962                  match_len = char_len;
3963                  goto check_node_accept_bytes_match;
3964                }
3965            }
3966        }
3967    check_node_accept_bytes_match:
3968      if (!cset->non_match)
3969        return match_len;
3970      else
3971        {
3972          if (match_len > 0)
3973            return 0;
3974          else
3975            return (elem_len > char_len) ? elem_len : char_len;
3976        }
3977    }
3978  return 0;
3979}
3980
3981# ifdef _LIBC
3982static unsigned int
3983internal_function
3984find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3985{
3986  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3987  if (nrules == 0)
3988    {
3989      if (mbs_len == 1)
3990        {
3991          /* No valid character.  Match it as a single byte character.  */
3992          const unsigned char *collseq = (const unsigned char *)
3993            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3994          return collseq[mbs[0]];
3995        }
3996      return UINT_MAX;
3997    }
3998  else
3999    {
4000      int32_t idx;
4001      const unsigned char *extra = (const unsigned char *)
4002        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4003      int32_t extrasize = (const unsigned char *)
4004        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4005
4006      for (idx = 0; idx < extrasize;)
4007        {
4008          int mbs_cnt, found = 0;
4009          int32_t elem_mbs_len;
4010          /* Skip the name of collating element name.  */
4011          idx = idx + extra[idx] + 1;
4012          elem_mbs_len = extra[idx++];
4013          if (mbs_len == elem_mbs_len)
4014            {
4015              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4016                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4017                  break;
4018              if (mbs_cnt == elem_mbs_len)
4019                /* Found the entry.  */
4020                found = 1;
4021            }
4022          /* Skip the byte sequence of the collating element.  */
4023          idx += elem_mbs_len;
4024          /* Adjust for the alignment.  */
4025          idx = (idx + 3) & ~3;
4026          /* Skip the collation sequence value.  */
4027          idx += sizeof (uint32_t);
4028          /* Skip the wide char sequence of the collating element.  */
4029          idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4030          /* If we found the entry, return the sequence value.  */
4031          if (found)
4032            return *(uint32_t *) (extra + idx);
4033          /* Skip the collation sequence value.  */
4034          idx += sizeof (uint32_t);
4035        }
4036      return UINT_MAX;
4037    }
4038}
4039# endif /* _LIBC */
4040#endif /* RE_ENABLE_I18N */
4041
4042/* Check whether the node accepts the byte which is IDX-th
4043   byte of the INPUT.  */
4044
4045static int
4046internal_function
4047check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4048                   int idx)
4049{
4050  unsigned char ch;
4051  ch = re_string_byte_at (&mctx->input, idx);
4052  switch (node->type)
4053    {
4054    case CHARACTER:
4055      if (node->opr.c != ch)
4056        return 0;
4057      break;
4058
4059    case SIMPLE_BRACKET:
4060      if (!bitset_contain (node->opr.sbcset, ch))
4061        return 0;
4062      break;
4063
4064#ifdef RE_ENABLE_I18N
4065    case OP_UTF8_PERIOD:
4066      if (ch >= 0x80)
4067        return 0;
4068      /* FALLTHROUGH */
4069#endif
4070    case OP_PERIOD:
4071      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4072          || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4073        return 0;
4074      break;
4075
4076    default:
4077      return 0;
4078    }
4079
4080  if (node->constraint)
4081    {
4082      /* The node has constraints.  Check whether the current context
4083         satisfies the constraints.  */
4084      unsigned int context = re_string_context_at (&mctx->input, idx,
4085                                                   mctx->eflags);
4086      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4087        return 0;
4088    }
4089
4090  return 1;
4091}
4092
4093/* Extend the buffers, if the buffers have run out.  */
4094
4095static reg_errcode_t
4096internal_function
4097extend_buffers (re_match_context_t *mctx)
4098{
4099  reg_errcode_t ret;
4100  re_string_t *pstr = &mctx->input;
4101
4102  /* Avoid overflow.  */
4103  if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4104    return REG_ESPACE;
4105
4106  /* Double the lengthes of the buffers.  */
4107  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4108  if (BE (ret != REG_NOERROR, 0))
4109    return ret;
4110
4111  if (mctx->state_log != NULL)
4112    {
4113      /* And double the length of state_log.  */
4114      /* XXX We have no indication of the size of this buffer.  If this
4115         allocation fail we have no indication that the state_log array
4116         does not have the right size.  */
4117      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4118                                              pstr->bufs_len + 1);
4119      if (BE (new_array == NULL, 0))
4120        return REG_ESPACE;
4121      mctx->state_log = new_array;
4122    }
4123
4124  /* Then reconstruct the buffers.  */
4125  if (pstr->icase)
4126    {
4127#ifdef RE_ENABLE_I18N
4128      if (pstr->mb_cur_max > 1)
4129        {
4130          ret = build_wcs_upper_buffer (pstr);
4131          if (BE (ret != REG_NOERROR, 0))
4132            return ret;
4133        }
4134      else
4135#endif /* RE_ENABLE_I18N  */
4136        build_upper_buffer (pstr);
4137    }
4138  else
4139    {
4140#ifdef RE_ENABLE_I18N
4141      if (pstr->mb_cur_max > 1)
4142        build_wcs_buffer (pstr);
4143      else
4144#endif /* RE_ENABLE_I18N  */
4145        {
4146          if (pstr->trans != NULL)
4147            re_string_translate_buffer (pstr);
4148        }
4149    }
4150  return REG_NOERROR;
4151}
4152
4153\f
4154/* Functions for matching context.  */
4155
4156/* Initialize MCTX.  */
4157
4158static reg_errcode_t
4159internal_function
4160match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4161{
4162  mctx->eflags = eflags;
4163  mctx->match_last = -1;
4164  if (n > 0)
4165    {
4166      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4167      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4168      if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4169        return REG_ESPACE;
4170    }
4171  /* Already zero-ed by the caller.
4172     else
4173       mctx->bkref_ents = NULL;
4174     mctx->nbkref_ents = 0;
4175     mctx->nsub_tops = 0;  */
4176  mctx->abkref_ents = n;
4177  mctx->max_mb_elem_len = 1;
4178  mctx->asub_tops = n;
4179  return REG_NOERROR;
4180}
4181
4182/* Clean the entries which depend on the current input in MCTX.
4183   This function must be invoked when the matcher changes the start index
4184   of the input, or changes the input string.  */
4185
4186static void
4187internal_function
4188match_ctx_clean (re_match_context_t *mctx)
4189{
4190  int st_idx;
4191  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4192    {
4193      int sl_idx;
4194      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4195      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4196        {
4197          re_sub_match_last_t *last = top->lasts[sl_idx];
4198          re_free (last->path.array);
4199          re_free (last);
4200        }
4201      re_free (top->lasts);
4202      if (top->path)
4203        {
4204          re_free (top->path->array);
4205          re_free (top->path);
4206        }
4207      free (top);
4208    }
4209
4210  mctx->nsub_tops = 0;
4211  mctx->nbkref_ents = 0;
4212}
4213
4214/* Free all the memory associated with MCTX.  */
4215
4216static void
4217internal_function
4218match_ctx_free (re_match_context_t *mctx)
4219{
4220  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4221  match_ctx_clean (mctx);
4222  re_free (mctx->sub_tops);
4223  re_free (mctx->bkref_ents);
4224}
4225
4226/* Add a new backreference entry to MCTX.
4227   Note that we assume that caller never call this function with duplicate
4228   entry, and call with STR_IDX which isn't smaller than any existing entry.
4229*/
4230
4231static reg_errcode_t
4232internal_function
4233match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4234                     int to)
4235{
4236  if (mctx->nbkref_ents >= mctx->abkref_ents)
4237    {
4238      struct re_backref_cache_entry* new_entry;
4239      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4240                              mctx->abkref_ents * 2);
4241      if (BE (new_entry == NULL, 0))
4242        {
4243          re_free (mctx->bkref_ents);
4244          return REG_ESPACE;
4245        }
4246      mctx->bkref_ents = new_entry;
4247      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4248              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4249      mctx->abkref_ents *= 2;
4250    }
4251  if (mctx->nbkref_ents > 0
4252      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4253    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4254
4255  mctx->bkref_ents[mctx->nbkref_ents].node = node;
4256  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4257  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4258  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4259
4260  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4261     If bit N is clear, means that this entry won't epsilon-transition to
4262     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4263     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4264     such node.
4265
4266     A backreference does not epsilon-transition unless it is empty, so set
4267     to all zeros if FROM != TO.  */
4268  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4269    = (from == to ? ~0 : 0);
4270
4271  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4272  if (mctx->max_mb_elem_len < to - from)
4273    mctx->max_mb_elem_len = to - from;
4274  return REG_NOERROR;
4275}
4276
4277/* Search for the first entry which has the same str_idx, or -1 if none is
4278   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4279
4280static int
4281internal_function
4282search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4283{
4284  int left, right, mid, last;
4285  last = right = mctx->nbkref_ents;
4286  for (left = 0; left < right;)
4287    {
4288      mid = (left + right) / 2;
4289      if (mctx->bkref_ents[mid].str_idx < str_idx)
4290        left = mid + 1;
4291      else
4292        right = mid;
4293    }
4294  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4295    return left;
4296  else
4297    return -1;
4298}
4299
4300/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4301   at STR_IDX.  */
4302
4303static reg_errcode_t
4304internal_function
4305match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4306{
4307#ifdef DEBUG
4308  assert (mctx->sub_tops != NULL);
4309  assert (mctx->asub_tops > 0);
4310#endif
4311  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4312    {
4313      int new_asub_tops = mctx->asub_tops * 2;
4314      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4315                                                   re_sub_match_top_t *,
4316                                                   new_asub_tops);
4317      if (BE (new_array == NULL, 0))
4318        return REG_ESPACE;
4319      mctx->sub_tops = new_array;
4320      mctx->asub_tops = new_asub_tops;
4321    }
4322  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4323  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4324    return REG_ESPACE;
4325  mctx->sub_tops[mctx->nsub_tops]->node = node;
4326  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4327  return REG_NOERROR;
4328}
4329
4330/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4331   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4332
4333static re_sub_match_last_t *
4334internal_function
4335match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4336{
4337  re_sub_match_last_t *new_entry;
4338  if (BE (subtop->nlasts == subtop->alasts, 0))
4339    {
4340      int new_alasts = 2 * subtop->alasts + 1;
4341      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4342                                                    re_sub_match_last_t *,
4343                                                    new_alasts);
4344      if (BE (new_array == NULL, 0))
4345        return NULL;
4346      subtop->lasts = new_array;
4347      subtop->alasts = new_alasts;
4348    }
4349  new_entry = calloc (1, sizeof (re_sub_match_last_t));
4350  if (BE (new_entry != NULL, 1))
4351    {
4352      subtop->lasts[subtop->nlasts] = new_entry;
4353      new_entry->node = node;
4354      new_entry->str_idx = str_idx;
4355      ++subtop->nlasts;
4356    }
4357  return new_entry;
4358}
4359
4360static void
4361internal_function
4362sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4363               re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4364{
4365  sctx->sifted_states = sifted_sts;
4366  sctx->limited_states = limited_sts;
4367  sctx->last_node = last_node;
4368  sctx->last_str_idx = last_str_idx;
4369  re_node_set_init_empty (&sctx->limits);
4370}