compat / regex / regexec.con commit Merge branch 'sb/histogram-less-memory' (57fbd8e)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, see
  18   <http://www.gnu.org/licenses/>.  */
  19
  20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                     int n) internal_function;
  22static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23static void match_ctx_free (re_match_context_t *cache) internal_function;
  24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                          int str_idx, int from, int to)
  26     internal_function;
  27static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  28     internal_function;
  29static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                           int str_idx) internal_function;
  31static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                   int node, int str_idx)
  33     internal_function;
  34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                           re_dfastate_t **limited_sts, int last_node,
  36                           int last_str_idx)
  37     internal_function;
  38static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                         const char *string, int length,
  40                                         int start, int range, int stop,
  41                                         size_t nmatch, regmatch_t pmatch[],
  42                                         int eflags);
  43static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                             const char *string1, int length1,
  45                             const char *string2, int length2,
  46                             int start, int range, struct re_registers *regs,
  47                             int stop, int ret_len);
  48static int re_search_stub (struct re_pattern_buffer *bufp,
  49                           const char *string, int length, int start,
  50                           int range, int stop, struct re_registers *regs,
  51                           int ret_len);
  52static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                              int nregs, int regs_allocated);
  54static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
  55static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  56                           int *p_match_first) internal_function;
  57static int check_halt_state_context (const re_match_context_t *mctx,
  58                                     const re_dfastate_t *state, int idx)
  59     internal_function;
  60static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  61                         regmatch_t *prev_idx_match, int cur_node,
  62                         int cur_idx, int nmatch) internal_function;
  63static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  64                                      int str_idx, int dest_node, int nregs,
  65                                      regmatch_t *regs,
  66                                      re_node_set *eps_via_nodes)
  67     internal_function;
  68static reg_errcode_t set_regs (const regex_t *preg,
  69                               const re_match_context_t *mctx,
  70                               size_t nmatch, regmatch_t *pmatch,
  71                               int fl_backtrack) internal_function;
  72static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  73     internal_function;
  74
  75#ifdef RE_ENABLE_I18N
  76static int sift_states_iter_mb (const re_match_context_t *mctx,
  77                                re_sift_context_t *sctx,
  78                                int node_idx, int str_idx, int max_str_idx)
  79     internal_function;
  80#endif /* RE_ENABLE_I18N */
  81static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  82                                           re_sift_context_t *sctx)
  83     internal_function;
  84static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  85                                          re_sift_context_t *sctx, int str_idx,
  86                                          re_node_set *cur_dest)
  87     internal_function;
  88static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  89                                              re_sift_context_t *sctx,
  90                                              int str_idx,
  91                                              re_node_set *dest_nodes)
  92     internal_function;
  93static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  94                                            re_node_set *dest_nodes,
  95                                            const re_node_set *candidates)
  96     internal_function;
  97static int check_dst_limits (const re_match_context_t *mctx,
  98                             re_node_set *limits,
  99                             int dst_node, int dst_idx, int src_node,
 100                             int src_idx) internal_function;
 101static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 102                                        int boundaries, int subexp_idx,
 103                                        int from_node, int bkref_idx)
 104     internal_function;
 105static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 106                                      int limit, int subexp_idx,
 107                                      int node, int str_idx,
 108                                      int bkref_idx) internal_function;
 109static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 110                                          re_node_set *dest_nodes,
 111                                          const re_node_set *candidates,
 112                                          re_node_set *limits,
 113                                          struct re_backref_cache_entry *bkref_ents,
 114                                          int str_idx) internal_function;
 115static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 116                                        re_sift_context_t *sctx,
 117                                        int str_idx, const re_node_set *candidates)
 118     internal_function;
 119static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 120                                        re_dfastate_t **dst,
 121                                        re_dfastate_t **src, int num)
 122     internal_function;
 123static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 124                                         re_match_context_t *mctx) internal_function;
 125static re_dfastate_t *transit_state (reg_errcode_t *err,
 126                                     re_match_context_t *mctx,
 127                                     re_dfastate_t *state) internal_function;
 128static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 129                                            re_match_context_t *mctx,
 130                                            re_dfastate_t *next_state)
 131     internal_function;
 132static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 133                                                re_node_set *cur_nodes,
 134                                                int str_idx) internal_function;
 135#if 0
 136static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 137                                        re_match_context_t *mctx,
 138                                        re_dfastate_t *pstate)
 139     internal_function;
 140#endif
 141#ifdef RE_ENABLE_I18N
 142static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 143                                       re_dfastate_t *pstate)
 144     internal_function;
 145#endif /* RE_ENABLE_I18N */
 146static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 147                                          const re_node_set *nodes)
 148     internal_function;
 149static reg_errcode_t get_subexp (re_match_context_t *mctx,
 150                                 int bkref_node, int bkref_str_idx)
 151     internal_function;
 152static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 153                                     const re_sub_match_top_t *sub_top,
 154                                     re_sub_match_last_t *sub_last,
 155                                     int bkref_node, int bkref_str)
 156     internal_function;
 157static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 158                             int subexp_idx, int type) internal_function;
 159static reg_errcode_t check_arrival (re_match_context_t *mctx,
 160                                    state_array_t *path, int top_node,
 161                                    int top_str, int last_node, int last_str,
 162                                    int type) internal_function;
 163static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 164                                                   int str_idx,
 165                                                   re_node_set *cur_nodes,
 166                                                   re_node_set *next_nodes)
 167     internal_function;
 168static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 169                                               re_node_set *cur_nodes,
 170                                               int ex_subexp, int type)
 171     internal_function;
 172static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 173                                                   re_node_set *dst_nodes,
 174                                                   int target, int ex_subexp,
 175                                                   int type) internal_function;
 176static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 177                                         re_node_set *cur_nodes, int cur_str,
 178                                         int subexp_num, int type)
 179     internal_function;
 180static int build_trtable (const re_dfa_t *dfa,
 181                          re_dfastate_t *state) internal_function;
 182#ifdef RE_ENABLE_I18N
 183static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 184                                    const re_string_t *input, int idx)
 185     internal_function;
 186# ifdef _LIBC
 187static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 188                                                   size_t name_len)
 189     internal_function;
 190# endif /* _LIBC */
 191#endif /* RE_ENABLE_I18N */
 192static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 193                                       const re_dfastate_t *state,
 194                                       re_node_set *states_node,
 195                                       bitset_t *states_ch) internal_function;
 196static int check_node_accept (const re_match_context_t *mctx,
 197                              const re_token_t *node, int idx)
 198     internal_function;
 199static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 200     internal_function;
 201\f
 202/* Entry point for POSIX code.  */
 203
 204/* regexec searches for a given pattern, specified by PREG, in the
 205   string STRING.
 206
 207   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 208   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 209   least NMATCH elements, and we set them to the offsets of the
 210   corresponding matched substrings.
 211
 212   EFLAGS specifies `execution flags' which affect matching: if
 213   REG_NOTBOL is set, then ^ does not match at the beginning of the
 214   string; if REG_NOTEOL is set, then $ does not match at the end.
 215
 216   We return 0 if we find a match and REG_NOMATCH if not.  */
 217
 218int
 219regexec (
 220        const regex_t *__restrict preg,
 221        const char *__restrict string,
 222        size_t nmatch,
 223        regmatch_t pmatch[],
 224        int eflags)
 225{
 226  reg_errcode_t err;
 227  int start, length;
 228
 229  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 230    return REG_BADPAT;
 231
 232  if (eflags & REG_STARTEND)
 233    {
 234      start = pmatch[0].rm_so;
 235      length = pmatch[0].rm_eo;
 236    }
 237  else
 238    {
 239      start = 0;
 240      length = strlen (string);
 241    }
 242
 243  __libc_lock_lock (dfa->lock);
 244  if (preg->no_sub)
 245    err = re_search_internal (preg, string, length, start, length - start,
 246                              length, 0, NULL, eflags);
 247  else
 248    err = re_search_internal (preg, string, length, start, length - start,
 249                              length, nmatch, pmatch, eflags);
 250  __libc_lock_unlock (dfa->lock);
 251  return err != REG_NOERROR;
 252}
 253
 254#ifdef _LIBC
 255# include <shlib-compat.h>
 256versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 257
 258# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 259__typeof__ (__regexec) __compat_regexec;
 260
 261int
 262attribute_compat_text_section
 263__compat_regexec (const regex_t *__restrict preg,
 264                  const char *__restrict string, size_t nmatch,
 265                  regmatch_t pmatch[], int eflags)
 266{
 267  return regexec (preg, string, nmatch, pmatch,
 268                  eflags & (REG_NOTBOL | REG_NOTEOL));
 269}
 270compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 271# endif
 272#endif
 273
 274/* Entry points for GNU code.  */
 275
 276/* re_match, re_search, re_match_2, re_search_2
 277
 278   The former two functions operate on STRING with length LENGTH,
 279   while the later two operate on concatenation of STRING1 and STRING2
 280   with lengths LENGTH1 and LENGTH2, respectively.
 281
 282   re_match() matches the compiled pattern in BUFP against the string,
 283   starting at index START.
 284
 285   re_search() first tries matching at index START, then it tries to match
 286   starting from index START + 1, and so on.  The last start position tried
 287   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 288   way as re_match().)
 289
 290   The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 291   the first STOP characters of the concatenation of the strings should be
 292   concerned.
 293
 294   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 295   and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 296   computed relative to the concatenation, not relative to the individual
 297   strings.)
 298
 299   On success, re_match* functions return the length of the match, re_search*
 300   return the position of the start of the match.  Return value -1 means no
 301   match was found and -2 indicates an internal error.  */
 302
 303int
 304re_match (struct re_pattern_buffer *bufp,
 305          const char *string,
 306          int length,
 307          int start,
 308          struct re_registers *regs)
 309{
 310  return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 311}
 312#ifdef _LIBC
 313weak_alias (__re_match, re_match)
 314#endif
 315
 316int
 317re_search (struct re_pattern_buffer *bufp,
 318           const char *string,
 319           int length, int start, int range,
 320           struct re_registers *regs)
 321{
 322  return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 323}
 324#ifdef _LIBC
 325weak_alias (__re_search, re_search)
 326#endif
 327
 328int
 329re_match_2 (struct re_pattern_buffer *bufp,
 330            const char *string1, int length1,
 331            const char *string2, int length2, int start,
 332            struct re_registers *regs, int stop)
 333{
 334  return re_search_2_stub (bufp, string1, length1, string2, length2,
 335                           start, 0, regs, stop, 1);
 336}
 337#ifdef _LIBC
 338weak_alias (__re_match_2, re_match_2)
 339#endif
 340
 341int
 342re_search_2 (struct re_pattern_buffer *bufp,
 343             const char *string1, int length1,
 344             const char *string2, int length2, int start,
 345             int range, struct re_registers *regs,  int stop)
 346{
 347  return re_search_2_stub (bufp, string1, length1, string2, length2,
 348                           start, range, regs, stop, 0);
 349}
 350#ifdef _LIBC
 351weak_alias (__re_search_2, re_search_2)
 352#endif
 353
 354static int
 355re_search_2_stub (struct re_pattern_buffer *bufp,
 356                  const char *string1, int length1,
 357                  const char *string2, int length2, int start,
 358                  int range, struct re_registers *regs,
 359                  int stop, int ret_len)
 360{
 361  const char *str;
 362  int rval;
 363  int len = length1 + length2;
 364  int free_str = 0;
 365
 366  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 367    return -2;
 368
 369  /* Concatenate the strings.  */
 370  if (length2 > 0)
 371    if (length1 > 0)
 372      {
 373        char *s = re_malloc (char, len);
 374
 375        if (BE (s == NULL, 0))
 376          return -2;
 377        memcpy (s, string1, length1);
 378        memcpy (s + length1, string2, length2);
 379        str = s;
 380        free_str = 1;
 381      }
 382    else
 383      str = string2;
 384  else
 385    str = string1;
 386
 387  rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 388  if (free_str)
 389    re_free ((char *) str);
 390  return rval;
 391}
 392
 393/* The parameters have the same meaning as those of re_search.
 394   Additional parameters:
 395   If RET_LEN is nonzero the length of the match is returned (re_match style);
 396   otherwise the position of the match is returned.  */
 397
 398static int
 399re_search_stub (struct re_pattern_buffer *bufp,
 400                const char *string, int length, int start,
 401                int range, int stop,
 402                struct re_registers *regs, int ret_len)
 403{
 404  reg_errcode_t result;
 405  regmatch_t *pmatch;
 406  int nregs, rval;
 407  int eflags = 0;
 408
 409  /* Check for out-of-range.  */
 410  if (BE (start < 0 || start > length, 0))
 411    return -1;
 412  if (BE (start + range > length, 0))
 413    range = length - start;
 414  else if (BE (start + range < 0, 0))
 415    range = -start;
 416
 417  __libc_lock_lock (dfa->lock);
 418
 419  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 420  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 421
 422  /* Compile fastmap if we haven't yet.  */
 423  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 424    re_compile_fastmap (bufp);
 425
 426  if (BE (bufp->no_sub, 0))
 427    regs = NULL;
 428
 429  /* We need at least 1 register.  */
 430  if (regs == NULL)
 431    nregs = 1;
 432  else if (BE (bufp->regs_allocated == REGS_FIXED &&
 433               regs->num_regs < bufp->re_nsub + 1, 0))
 434    {
 435      nregs = regs->num_regs;
 436      if (BE (nregs < 1, 0))
 437        {
 438          /* Nothing can be copied to regs.  */
 439          regs = NULL;
 440          nregs = 1;
 441        }
 442    }
 443  else
 444    nregs = bufp->re_nsub + 1;
 445  pmatch = re_malloc (regmatch_t, nregs);
 446  if (BE (pmatch == NULL, 0))
 447    {
 448      rval = -2;
 449      goto out;
 450    }
 451
 452  result = re_search_internal (bufp, string, length, start, range, stop,
 453                               nregs, pmatch, eflags);
 454
 455  rval = 0;
 456
 457  /* I hope we needn't fill their regs with -1's when no match was found.  */
 458  if (result != REG_NOERROR)
 459    rval = -1;
 460  else if (regs != NULL)
 461    {
 462      /* If caller wants register contents data back, copy them.  */
 463      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 464                                           bufp->regs_allocated);
 465      if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 466        rval = -2;
 467    }
 468
 469  if (BE (rval == 0, 1))
 470    {
 471      if (ret_len)
 472        {
 473          assert (pmatch[0].rm_so == start);
 474          rval = pmatch[0].rm_eo - start;
 475        }
 476      else
 477        rval = pmatch[0].rm_so;
 478    }
 479  re_free (pmatch);
 480 out:
 481  __libc_lock_unlock (dfa->lock);
 482  return rval;
 483}
 484
 485static unsigned
 486re_copy_regs (struct re_registers *regs,
 487              regmatch_t *pmatch,
 488              int nregs, int regs_allocated)
 489{
 490  int rval = REGS_REALLOCATE;
 491  int i;
 492  int need_regs = nregs + 1;
 493  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 494     uses.  */
 495
 496  /* Have the register data arrays been allocated?  */
 497  if (regs_allocated == REGS_UNALLOCATED)
 498    { /* No.  So allocate them with malloc.  */
 499      regs->start = re_malloc (regoff_t, need_regs);
 500      if (BE (regs->start == NULL, 0))
 501        return REGS_UNALLOCATED;
 502      regs->end = re_malloc (regoff_t, need_regs);
 503      if (BE (regs->end == NULL, 0))
 504        {
 505          re_free (regs->start);
 506          return REGS_UNALLOCATED;
 507        }
 508      regs->num_regs = need_regs;
 509    }
 510  else if (regs_allocated == REGS_REALLOCATE)
 511    { /* Yes.  If we need more elements than were already
 512         allocated, reallocate them.  If we need fewer, just
 513         leave it alone.  */
 514      if (BE (need_regs > regs->num_regs, 0))
 515        {
 516          regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 517          regoff_t *new_end;
 518          if (BE (new_start == NULL, 0))
 519            return REGS_UNALLOCATED;
 520          new_end = re_realloc (regs->end, regoff_t, need_regs);
 521          if (BE (new_end == NULL, 0))
 522            {
 523              re_free (new_start);
 524              return REGS_UNALLOCATED;
 525            }
 526          regs->start = new_start;
 527          regs->end = new_end;
 528          regs->num_regs = need_regs;
 529        }
 530    }
 531  else
 532    {
 533      assert (regs_allocated == REGS_FIXED);
 534      /* This function may not be called with REGS_FIXED and nregs too big.  */
 535      assert (regs->num_regs >= nregs);
 536      rval = REGS_FIXED;
 537    }
 538
 539  /* Copy the regs.  */
 540  for (i = 0; i < nregs; ++i)
 541    {
 542      regs->start[i] = pmatch[i].rm_so;
 543      regs->end[i] = pmatch[i].rm_eo;
 544    }
 545  for ( ; i < regs->num_regs; ++i)
 546    regs->start[i] = regs->end[i] = -1;
 547
 548  return rval;
 549}
 550
 551/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 552   ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 553   this memory for recording register information.  STARTS and ENDS
 554   must be allocated using the malloc library routine, and must each
 555   be at least NUM_REGS * sizeof (regoff_t) bytes long.
 556
 557   If NUM_REGS == 0, then subsequent matches should allocate their own
 558   register data.
 559
 560   Unless this function is called, the first search or match using
 561   PATTERN_BUFFER will allocate its own register data, without
 562   freeing the old data.  */
 563
 564void
 565re_set_registers (struct re_pattern_buffer *bufp,
 566                  struct re_registers *regs,
 567                  unsigned num_regs,
 568                  regoff_t *starts,
 569                  regoff_t *ends)
 570{
 571  if (num_regs)
 572    {
 573      bufp->regs_allocated = REGS_REALLOCATE;
 574      regs->num_regs = num_regs;
 575      regs->start = starts;
 576      regs->end = ends;
 577    }
 578  else
 579    {
 580      bufp->regs_allocated = REGS_UNALLOCATED;
 581      regs->num_regs = 0;
 582      regs->start = regs->end = (regoff_t *) 0;
 583    }
 584}
 585#ifdef _LIBC
 586weak_alias (__re_set_registers, re_set_registers)
 587#endif
 588\f
 589/* Entry points compatible with 4.2 BSD regex library.  We don't define
 590   them unless specifically requested.  */
 591
 592#if defined _REGEX_RE_COMP || defined _LIBC
 593int
 594# ifdef _LIBC
 595weak_function
 596# endif
 597re_exec (s)
 598     const char *s;
 599{
 600  return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 601}
 602#endif /* _REGEX_RE_COMP */
 603\f
 604/* Internal entry point.  */
 605
 606/* Searches for a compiled pattern PREG in the string STRING, whose
 607   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 608   mingings with regexec.  START, and RANGE have the same meanings
 609   with re_search.
 610   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 611   otherwise return the error code.
 612   Note: We assume front end functions already check ranges.
 613   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 614
 615static reg_errcode_t
 616re_search_internal (const regex_t *preg,
 617                    const char *string,
 618                    int length, int start, int range, int stop,
 619                    size_t nmatch, regmatch_t pmatch[],
 620                    int eflags)
 621{
 622  reg_errcode_t err;
 623  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 624  int left_lim, right_lim, incr;
 625  int fl_longest_match, match_first, match_kind, match_last = -1;
 626  int extra_nmatch;
 627  int sb, ch;
 628#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 629  re_match_context_t mctx = { .dfa = dfa };
 630#else
 631  re_match_context_t mctx;
 632#endif
 633  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 634                   && range && !preg->can_be_null) ? preg->fastmap : NULL;
 635  RE_TRANSLATE_TYPE t = preg->translate;
 636
 637#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 638  memset (&mctx, '\0', sizeof (re_match_context_t));
 639  mctx.dfa = dfa;
 640#endif
 641
 642  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 643  nmatch -= extra_nmatch;
 644
 645  /* Check if the DFA haven't been compiled.  */
 646  if (BE (preg->used == 0 || dfa->init_state == NULL
 647          || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 648          || dfa->init_state_begbuf == NULL, 0))
 649    return REG_NOMATCH;
 650
 651#ifdef DEBUG
 652  /* We assume front-end functions already check them.  */
 653  assert (start + range >= 0 && start + range <= length);
 654#endif
 655
 656  /* If initial states with non-begbuf contexts have no elements,
 657     the regex must be anchored.  If preg->newline_anchor is set,
 658     we'll never use init_state_nl, so do not check it.  */
 659  if (dfa->init_state->nodes.nelem == 0
 660      && dfa->init_state_word->nodes.nelem == 0
 661      && (dfa->init_state_nl->nodes.nelem == 0
 662          || !preg->newline_anchor))
 663    {
 664      if (start != 0 && start + range != 0)
 665        return REG_NOMATCH;
 666      start = range = 0;
 667    }
 668
 669  /* We must check the longest matching, if nmatch > 0.  */
 670  fl_longest_match = (nmatch != 0 || dfa->nbackref);
 671
 672  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 673                            preg->translate, preg->syntax & RE_ICASE, dfa);
 674  if (BE (err != REG_NOERROR, 0))
 675    goto free_return;
 676  mctx.input.stop = stop;
 677  mctx.input.raw_stop = stop;
 678  mctx.input.newline_anchor = preg->newline_anchor;
 679
 680  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 681  if (BE (err != REG_NOERROR, 0))
 682    goto free_return;
 683
 684  /* We will log all the DFA states through which the dfa pass,
 685     if nmatch > 1, or this dfa has "multibyte node", which is a
 686     back-reference or a node which can accept multibyte character or
 687     multi character collating element.  */
 688  if (nmatch > 1 || dfa->has_mb_node)
 689    {
 690      /* Avoid overflow.  */
 691      if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 692        {
 693          err = REG_ESPACE;
 694          goto free_return;
 695        }
 696
 697      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 698      if (BE (mctx.state_log == NULL, 0))
 699        {
 700          err = REG_ESPACE;
 701          goto free_return;
 702        }
 703    }
 704  else
 705    mctx.state_log = NULL;
 706
 707  match_first = start;
 708  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 709                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 710
 711  /* Check incrementally whether of not the input string match.  */
 712  incr = (range < 0) ? -1 : 1;
 713  left_lim = (range < 0) ? start + range : start;
 714  right_lim = (range < 0) ? start : start + range;
 715  sb = dfa->mb_cur_max == 1;
 716  match_kind =
 717    (fastmap
 718     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 719        | (range >= 0 ? 2 : 0)
 720        | (t != NULL ? 1 : 0))
 721     : 8);
 722
 723  for (;; match_first += incr)
 724    {
 725      err = REG_NOMATCH;
 726      if (match_first < left_lim || right_lim < match_first)
 727        goto free_return;
 728
 729      /* Advance as rapidly as possible through the string, until we
 730         find a plausible place to start matching.  This may be done
 731         with varying efficiency, so there are various possibilities:
 732         only the most common of them are specialized, in order to
 733         save on code size.  We use a switch statement for speed.  */
 734      switch (match_kind)
 735        {
 736        case 8:
 737          /* No fastmap.  */
 738          break;
 739
 740        case 7:
 741          /* Fastmap with single-byte translation, match forward.  */
 742          while (BE (match_first < right_lim, 1)
 743                 && !fastmap[t[(unsigned char) string[match_first]]])
 744            ++match_first;
 745          goto forward_match_found_start_or_reached_end;
 746
 747        case 6:
 748          /* Fastmap without translation, match forward.  */
 749          while (BE (match_first < right_lim, 1)
 750                 && !fastmap[(unsigned char) string[match_first]])
 751            ++match_first;
 752
 753        forward_match_found_start_or_reached_end:
 754          if (BE (match_first == right_lim, 0))
 755            {
 756              ch = match_first >= length
 757                       ? 0 : (unsigned char) string[match_first];
 758              if (!fastmap[t ? t[ch] : ch])
 759                goto free_return;
 760            }
 761          break;
 762
 763        case 4:
 764        case 5:
 765          /* Fastmap without multi-byte translation, match backwards.  */
 766          while (match_first >= left_lim)
 767            {
 768              ch = match_first >= length
 769                       ? 0 : (unsigned char) string[match_first];
 770              if (fastmap[t ? t[ch] : ch])
 771                break;
 772              --match_first;
 773            }
 774          if (match_first < left_lim)
 775            goto free_return;
 776          break;
 777
 778        default:
 779          /* In this case, we can't determine easily the current byte,
 780             since it might be a component byte of a multibyte
 781             character.  Then we use the constructed buffer instead.  */
 782          for (;;)
 783            {
 784              /* If MATCH_FIRST is out of the valid range, reconstruct the
 785                 buffers.  */
 786              unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 787              if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 788                {
 789                  err = re_string_reconstruct (&mctx.input, match_first,
 790                                               eflags);
 791                  if (BE (err != REG_NOERROR, 0))
 792                    goto free_return;
 793
 794                  offset = match_first - mctx.input.raw_mbs_idx;
 795                }
 796              /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 797                 Note that MATCH_FIRST must not be smaller than 0.  */
 798              ch = (match_first >= length
 799                    ? 0 : re_string_byte_at (&mctx.input, offset));
 800              if (fastmap[ch])
 801                break;
 802              match_first += incr;
 803              if (match_first < left_lim || match_first > right_lim)
 804                {
 805                  err = REG_NOMATCH;
 806                  goto free_return;
 807                }
 808            }
 809          break;
 810        }
 811
 812      /* Reconstruct the buffers so that the matcher can assume that
 813         the matching starts from the beginning of the buffer.  */
 814      err = re_string_reconstruct (&mctx.input, match_first, eflags);
 815      if (BE (err != REG_NOERROR, 0))
 816        goto free_return;
 817
 818#ifdef RE_ENABLE_I18N
 819     /* Don't consider this char as a possible match start if it part,
 820        yet isn't the head, of a multibyte character.  */
 821      if (!sb && !re_string_first_byte (&mctx.input, 0))
 822        continue;
 823#endif
 824
 825      /* It seems to be appropriate one, then use the matcher.  */
 826      /* We assume that the matching starts from 0.  */
 827      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 828      match_last = check_matching (&mctx, fl_longest_match,
 829                                   range >= 0 ? &match_first : NULL);
 830      if (match_last != -1)
 831        {
 832          if (BE (match_last == -2, 0))
 833            {
 834              err = REG_ESPACE;
 835              goto free_return;
 836            }
 837          else
 838            {
 839              mctx.match_last = match_last;
 840              if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 841                {
 842                  re_dfastate_t *pstate = mctx.state_log[match_last];
 843                  mctx.last_node = check_halt_state_context (&mctx, pstate,
 844                                                             match_last);
 845                }
 846              if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 847                  || dfa->nbackref)
 848                {
 849                  err = prune_impossible_nodes (&mctx);
 850                  if (err == REG_NOERROR)
 851                    break;
 852                  if (BE (err != REG_NOMATCH, 0))
 853                    goto free_return;
 854                  match_last = -1;
 855                }
 856              else
 857                break; /* We found a match.  */
 858            }
 859        }
 860
 861      match_ctx_clean (&mctx);
 862    }
 863
 864#ifdef DEBUG
 865  assert (match_last != -1);
 866  assert (err == REG_NOERROR);
 867#endif
 868
 869  /* Set pmatch[] if we need.  */
 870  if (nmatch > 0)
 871    {
 872      int reg_idx;
 873
 874      /* Initialize registers.  */
 875      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 876        pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 877
 878      /* Set the points where matching start/end.  */
 879      pmatch[0].rm_so = 0;
 880      pmatch[0].rm_eo = mctx.match_last;
 881
 882      if (!preg->no_sub && nmatch > 1)
 883        {
 884          err = set_regs (preg, &mctx, nmatch, pmatch,
 885                          dfa->has_plural_match && dfa->nbackref > 0);
 886          if (BE (err != REG_NOERROR, 0))
 887            goto free_return;
 888        }
 889
 890      /* At last, add the offset to the each registers, since we slided
 891         the buffers so that we could assume that the matching starts
 892         from 0.  */
 893      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 894        if (pmatch[reg_idx].rm_so != -1)
 895          {
 896#ifdef RE_ENABLE_I18N
 897            if (BE (mctx.input.offsets_needed != 0, 0))
 898              {
 899                pmatch[reg_idx].rm_so =
 900                  (pmatch[reg_idx].rm_so == mctx.input.valid_len
 901                   ? mctx.input.valid_raw_len
 902                   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 903                pmatch[reg_idx].rm_eo =
 904                  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 905                   ? mctx.input.valid_raw_len
 906                   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 907              }
 908#else
 909            assert (mctx.input.offsets_needed == 0);
 910#endif
 911            pmatch[reg_idx].rm_so += match_first;
 912            pmatch[reg_idx].rm_eo += match_first;
 913          }
 914      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 915        {
 916          pmatch[nmatch + reg_idx].rm_so = -1;
 917          pmatch[nmatch + reg_idx].rm_eo = -1;
 918        }
 919
 920      if (dfa->subexp_map)
 921        for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 922          if (dfa->subexp_map[reg_idx] != reg_idx)
 923            {
 924              pmatch[reg_idx + 1].rm_so
 925                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 926              pmatch[reg_idx + 1].rm_eo
 927                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 928            }
 929    }
 930
 931 free_return:
 932  re_free (mctx.state_log);
 933  if (dfa->nbackref)
 934    match_ctx_free (&mctx);
 935  re_string_destruct (&mctx.input);
 936  return err;
 937}
 938
 939static reg_errcode_t
 940prune_impossible_nodes (re_match_context_t *mctx)
 941{
 942  const re_dfa_t *const dfa = mctx->dfa;
 943  int halt_node, match_last;
 944  reg_errcode_t ret;
 945  re_dfastate_t **sifted_states;
 946  re_dfastate_t **lim_states = NULL;
 947  re_sift_context_t sctx;
 948#ifdef DEBUG
 949  assert (mctx->state_log != NULL);
 950#endif
 951  match_last = mctx->match_last;
 952  halt_node = mctx->last_node;
 953
 954  /* Avoid overflow.  */
 955  if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 956    return REG_ESPACE;
 957
 958  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 959  if (BE (sifted_states == NULL, 0))
 960    {
 961      ret = REG_ESPACE;
 962      goto free_return;
 963    }
 964  if (dfa->nbackref)
 965    {
 966      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 967      if (BE (lim_states == NULL, 0))
 968        {
 969          ret = REG_ESPACE;
 970          goto free_return;
 971        }
 972      while (1)
 973        {
 974          memset (lim_states, '\0',
 975                  sizeof (re_dfastate_t *) * (match_last + 1));
 976          sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 977                         match_last);
 978          ret = sift_states_backward (mctx, &sctx);
 979          re_node_set_free (&sctx.limits);
 980          if (BE (ret != REG_NOERROR, 0))
 981              goto free_return;
 982          if (sifted_states[0] != NULL || lim_states[0] != NULL)
 983            break;
 984          do
 985            {
 986              --match_last;
 987              if (match_last < 0)
 988                {
 989                  ret = REG_NOMATCH;
 990                  goto free_return;
 991                }
 992            } while (mctx->state_log[match_last] == NULL
 993                     || !mctx->state_log[match_last]->halt);
 994          halt_node = check_halt_state_context (mctx,
 995                                                mctx->state_log[match_last],
 996                                                match_last);
 997        }
 998      ret = merge_state_array (dfa, sifted_states, lim_states,
 999                               match_last + 1);
1000      re_free (lim_states);
1001      lim_states = NULL;
1002      if (BE (ret != REG_NOERROR, 0))
1003        goto free_return;
1004    }
1005  else
1006    {
1007      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1008      ret = sift_states_backward (mctx, &sctx);
1009      re_node_set_free (&sctx.limits);
1010      if (BE (ret != REG_NOERROR, 0))
1011        goto free_return;
1012      if (sifted_states[0] == NULL)
1013        {
1014          ret = REG_NOMATCH;
1015          goto free_return;
1016        }
1017    }
1018  re_free (mctx->state_log);
1019  mctx->state_log = sifted_states;
1020  sifted_states = NULL;
1021  mctx->last_node = halt_node;
1022  mctx->match_last = match_last;
1023  ret = REG_NOERROR;
1024 free_return:
1025  re_free (sifted_states);
1026  re_free (lim_states);
1027  return ret;
1028}
1029
1030/* Acquire an initial state and return it.
1031   We must select appropriate initial state depending on the context,
1032   since initial states may have constraints like "\<", "^", etc..  */
1033
1034static inline re_dfastate_t *
1035__attribute ((always_inline)) internal_function
1036acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1037                            int idx)
1038{
1039  const re_dfa_t *const dfa = mctx->dfa;
1040  if (dfa->init_state->has_constraint)
1041    {
1042      unsigned int context;
1043      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1044      if (IS_WORD_CONTEXT (context))
1045        return dfa->init_state_word;
1046      else if (IS_ORDINARY_CONTEXT (context))
1047        return dfa->init_state;
1048      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1049        return dfa->init_state_begbuf;
1050      else if (IS_NEWLINE_CONTEXT (context))
1051        return dfa->init_state_nl;
1052      else if (IS_BEGBUF_CONTEXT (context))
1053        {
1054          /* It is relatively rare case, then calculate on demand.  */
1055          return re_acquire_state_context (err, dfa,
1056                                           dfa->init_state->entrance_nodes,
1057                                           context);
1058        }
1059      else
1060        /* Must not happen?  */
1061        return dfa->init_state;
1062    }
1063  else
1064    return dfa->init_state;
1065}
1066
1067/* Check whether the regular expression match input string INPUT or not,
1068   and return the index where the matching end, return -1 if not match,
1069   or return -2 in case of an error.
1070   FL_LONGEST_MATCH means we want the POSIX longest matching.
1071   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1072   next place where we may want to try matching.
1073   Note that the matcher assume that the matching starts from the current
1074   index of the buffer.  */
1075
1076static int
1077internal_function
1078check_matching (re_match_context_t *mctx, int fl_longest_match,
1079                int *p_match_first)
1080{
1081  const re_dfa_t *const dfa = mctx->dfa;
1082  reg_errcode_t err;
1083  int match = 0;
1084  int match_last = -1;
1085  int cur_str_idx = re_string_cur_idx (&mctx->input);
1086  re_dfastate_t *cur_state;
1087  int at_init_state = p_match_first != NULL;
1088  int next_start_idx = cur_str_idx;
1089
1090  err = REG_NOERROR;
1091  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1092  /* An initial state must not be NULL (invalid).  */
1093  if (BE (cur_state == NULL, 0))
1094    {
1095      assert (err == REG_ESPACE);
1096      return -2;
1097    }
1098
1099  if (mctx->state_log != NULL)
1100    {
1101      mctx->state_log[cur_str_idx] = cur_state;
1102
1103      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1104         later.  E.g. Processing back references.  */
1105      if (BE (dfa->nbackref, 0))
1106        {
1107          at_init_state = 0;
1108          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1109          if (BE (err != REG_NOERROR, 0))
1110            return err;
1111
1112          if (cur_state->has_backref)
1113            {
1114              err = transit_state_bkref (mctx, &cur_state->nodes);
1115              if (BE (err != REG_NOERROR, 0))
1116                return err;
1117            }
1118        }
1119    }
1120
1121  /* If the RE accepts NULL string.  */
1122  if (BE (cur_state->halt, 0))
1123    {
1124      if (!cur_state->has_constraint
1125          || check_halt_state_context (mctx, cur_state, cur_str_idx))
1126        {
1127          if (!fl_longest_match)
1128            return cur_str_idx;
1129          else
1130            {
1131              match_last = cur_str_idx;
1132              match = 1;
1133            }
1134        }
1135    }
1136
1137  while (!re_string_eoi (&mctx->input))
1138    {
1139      re_dfastate_t *old_state = cur_state;
1140      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1141
1142      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1143          || (BE (next_char_idx >= mctx->input.valid_len, 0)
1144              && mctx->input.valid_len < mctx->input.len))
1145        {
1146          err = extend_buffers (mctx);
1147          if (BE (err != REG_NOERROR, 0))
1148            {
1149              assert (err == REG_ESPACE);
1150              return -2;
1151            }
1152        }
1153
1154      cur_state = transit_state (&err, mctx, cur_state);
1155      if (mctx->state_log != NULL)
1156        cur_state = merge_state_with_log (&err, mctx, cur_state);
1157
1158      if (cur_state == NULL)
1159        {
1160          /* Reached the invalid state or an error.  Try to recover a valid
1161             state using the state log, if available and if we have not
1162             already found a valid (even if not the longest) match.  */
1163          if (BE (err != REG_NOERROR, 0))
1164            return -2;
1165
1166          if (mctx->state_log == NULL
1167              || (match && !fl_longest_match)
1168              || (cur_state = find_recover_state (&err, mctx)) == NULL)
1169            break;
1170        }
1171
1172      if (BE (at_init_state, 0))
1173        {
1174          if (old_state == cur_state)
1175            next_start_idx = next_char_idx;
1176          else
1177            at_init_state = 0;
1178        }
1179
1180      if (cur_state->halt)
1181        {
1182          /* Reached a halt state.
1183             Check the halt state can satisfy the current context.  */
1184          if (!cur_state->has_constraint
1185              || check_halt_state_context (mctx, cur_state,
1186                                           re_string_cur_idx (&mctx->input)))
1187            {
1188              /* We found an appropriate halt state.  */
1189              match_last = re_string_cur_idx (&mctx->input);
1190              match = 1;
1191
1192              /* We found a match, do not modify match_first below.  */
1193              p_match_first = NULL;
1194              if (!fl_longest_match)
1195                break;
1196            }
1197        }
1198    }
1199
1200  if (p_match_first)
1201    *p_match_first += next_start_idx;
1202
1203  return match_last;
1204}
1205
1206/* Check NODE match the current context.  */
1207
1208static int
1209internal_function
1210check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1211{
1212  re_token_type_t type = dfa->nodes[node].type;
1213  unsigned int constraint = dfa->nodes[node].constraint;
1214  if (type != END_OF_RE)
1215    return 0;
1216  if (!constraint)
1217    return 1;
1218  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1219    return 0;
1220  return 1;
1221}
1222
1223/* Check the halt state STATE match the current context.
1224   Return 0 if not match, if the node, STATE has, is a halt node and
1225   match the context, return the node.  */
1226
1227static int
1228internal_function
1229check_halt_state_context (const re_match_context_t *mctx,
1230                          const re_dfastate_t *state, int idx)
1231{
1232  int i;
1233  unsigned int context;
1234#ifdef DEBUG
1235  assert (state->halt);
1236#endif
1237  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1238  for (i = 0; i < state->nodes.nelem; ++i)
1239    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1240      return state->nodes.elems[i];
1241  return 0;
1242}
1243
1244/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1245   corresponding to the DFA).
1246   Return the destination node, and update EPS_VIA_NODES, return -1 in case
1247   of errors.  */
1248
1249static int
1250internal_function
1251proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1252                   int *pidx, int node, re_node_set *eps_via_nodes,
1253                   struct re_fail_stack_t *fs)
1254{
1255  const re_dfa_t *const dfa = mctx->dfa;
1256  int i, err;
1257  if (IS_EPSILON_NODE (dfa->nodes[node].type))
1258    {
1259      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1260      re_node_set *edests = &dfa->edests[node];
1261      int dest_node;
1262      err = re_node_set_insert (eps_via_nodes, node);
1263      if (BE (err < 0, 0))
1264        return -2;
1265      /* Pick up a valid destination, or return -1 if none is found.  */
1266      for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1267        {
1268          int candidate = edests->elems[i];
1269          if (!re_node_set_contains (cur_nodes, candidate))
1270            continue;
1271          if (dest_node == -1)
1272            dest_node = candidate;
1273
1274          else
1275            {
1276              /* In order to avoid infinite loop like "(a*)*", return the second
1277                 epsilon-transition if the first was already considered.  */
1278              if (re_node_set_contains (eps_via_nodes, dest_node))
1279                return candidate;
1280
1281              /* Otherwise, push the second epsilon-transition on the fail stack.  */
1282              else if (fs != NULL
1283                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1284                                           eps_via_nodes))
1285                return -2;
1286
1287              /* We know we are going to exit.  */
1288              break;
1289            }
1290        }
1291      return dest_node;
1292    }
1293  else
1294    {
1295      int naccepted = 0;
1296      re_token_type_t type = dfa->nodes[node].type;
1297
1298#ifdef RE_ENABLE_I18N
1299      if (dfa->nodes[node].accept_mb)
1300        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1301      else
1302#endif /* RE_ENABLE_I18N */
1303      if (type == OP_BACK_REF)
1304        {
1305          int subexp_idx = dfa->nodes[node].opr.idx + 1;
1306          naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1307          if (fs != NULL)
1308            {
1309              if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1310                return -1;
1311              else if (naccepted)
1312                {
1313                  char *buf = (char *) re_string_get_buffer (&mctx->input);
1314                  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1315                              naccepted) != 0)
1316                    return -1;
1317                }
1318            }
1319
1320          if (naccepted == 0)
1321            {
1322              int dest_node;
1323              err = re_node_set_insert (eps_via_nodes, node);
1324              if (BE (err < 0, 0))
1325                return -2;
1326              dest_node = dfa->edests[node].elems[0];
1327              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1328                                        dest_node))
1329                return dest_node;
1330            }
1331        }
1332
1333      if (naccepted != 0
1334          || check_node_accept (mctx, dfa->nodes + node, *pidx))
1335        {
1336          int dest_node = dfa->nexts[node];
1337          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1338          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1339                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1340                                               dest_node)))
1341            return -1;
1342          re_node_set_empty (eps_via_nodes);
1343          return dest_node;
1344        }
1345    }
1346  return -1;
1347}
1348
1349static reg_errcode_t
1350internal_function
1351push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1352                 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1353{
1354  reg_errcode_t err;
1355  int num = fs->num++;
1356  if (fs->num == fs->alloc)
1357    {
1358      struct re_fail_stack_ent_t *new_array;
1359      new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1360                                       * fs->alloc * 2));
1361      if (new_array == NULL)
1362        return REG_ESPACE;
1363      fs->alloc *= 2;
1364      fs->stack = new_array;
1365    }
1366  fs->stack[num].idx = str_idx;
1367  fs->stack[num].node = dest_node;
1368  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1369  if (fs->stack[num].regs == NULL)
1370    return REG_ESPACE;
1371  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1372  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1373  return err;
1374}
1375
1376static int
1377internal_function
1378pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1379                regmatch_t *regs, re_node_set *eps_via_nodes)
1380{
1381  int num = --fs->num;
1382  assert (num >= 0);
1383  *pidx = fs->stack[num].idx;
1384  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1385  re_node_set_free (eps_via_nodes);
1386  re_free (fs->stack[num].regs);
1387  *eps_via_nodes = fs->stack[num].eps_via_nodes;
1388  return fs->stack[num].node;
1389}
1390
1391/* Set the positions where the subexpressions are starts/ends to registers
1392   PMATCH.
1393   Note: We assume that pmatch[0] is already set, and
1394   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1395
1396static reg_errcode_t
1397internal_function
1398set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1399          regmatch_t *pmatch, int fl_backtrack)
1400{
1401  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1402  int idx, cur_node;
1403  re_node_set eps_via_nodes;
1404  struct re_fail_stack_t *fs;
1405  struct re_fail_stack_t fs_body = { 0, 2, NULL };
1406  regmatch_t *prev_idx_match;
1407  int prev_idx_match_malloced = 0;
1408
1409#ifdef DEBUG
1410  assert (nmatch > 1);
1411  assert (mctx->state_log != NULL);
1412#endif
1413  if (fl_backtrack)
1414    {
1415      fs = &fs_body;
1416      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1417      if (fs->stack == NULL)
1418        return REG_ESPACE;
1419    }
1420  else
1421    fs = NULL;
1422
1423  cur_node = dfa->init_node;
1424  re_node_set_init_empty (&eps_via_nodes);
1425
1426#ifdef HAVE_ALLOCA
1427  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1428    prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1429  else
1430#endif
1431    {
1432      prev_idx_match = re_malloc (regmatch_t, nmatch);
1433      if (prev_idx_match == NULL)
1434        {
1435          free_fail_stack_return (fs);
1436          return REG_ESPACE;
1437        }
1438      prev_idx_match_malloced = 1;
1439    }
1440  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1441
1442  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1443    {
1444      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1445
1446      if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1447        {
1448          int reg_idx;
1449          if (fs)
1450            {
1451              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1452                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1453                  break;
1454              if (reg_idx == nmatch)
1455                {
1456                  re_node_set_free (&eps_via_nodes);
1457                  if (prev_idx_match_malloced)
1458                    re_free (prev_idx_match);
1459                  return free_fail_stack_return (fs);
1460                }
1461              cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1462                                         &eps_via_nodes);
1463            }
1464          else
1465            {
1466              re_node_set_free (&eps_via_nodes);
1467              if (prev_idx_match_malloced)
1468                re_free (prev_idx_match);
1469              return REG_NOERROR;
1470            }
1471        }
1472
1473      /* Proceed to next node.  */
1474      cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1475                                    &eps_via_nodes, fs);
1476
1477      if (BE (cur_node < 0, 0))
1478        {
1479          if (BE (cur_node == -2, 0))
1480            {
1481              re_node_set_free (&eps_via_nodes);
1482              if (prev_idx_match_malloced)
1483                re_free (prev_idx_match);
1484              free_fail_stack_return (fs);
1485              return REG_ESPACE;
1486            }
1487          if (fs)
1488            cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1489                                       &eps_via_nodes);
1490          else
1491            {
1492              re_node_set_free (&eps_via_nodes);
1493              if (prev_idx_match_malloced)
1494                re_free (prev_idx_match);
1495              return REG_NOMATCH;
1496            }
1497        }
1498    }
1499  re_node_set_free (&eps_via_nodes);
1500  if (prev_idx_match_malloced)
1501    re_free (prev_idx_match);
1502  return free_fail_stack_return (fs);
1503}
1504
1505static reg_errcode_t
1506internal_function
1507free_fail_stack_return (struct re_fail_stack_t *fs)
1508{
1509  if (fs)
1510    {
1511      int fs_idx;
1512      for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1513        {
1514          re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1515          re_free (fs->stack[fs_idx].regs);
1516        }
1517      re_free (fs->stack);
1518    }
1519  return REG_NOERROR;
1520}
1521
1522static void
1523internal_function
1524update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1525             regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1526{
1527  int type = dfa->nodes[cur_node].type;
1528  if (type == OP_OPEN_SUBEXP)
1529    {
1530      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1531
1532      /* We are at the first node of this sub expression.  */
1533      if (reg_num < nmatch)
1534        {
1535          pmatch[reg_num].rm_so = cur_idx;
1536          pmatch[reg_num].rm_eo = -1;
1537        }
1538    }
1539  else if (type == OP_CLOSE_SUBEXP)
1540    {
1541      int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542      if (reg_num < nmatch)
1543        {
1544          /* We are at the last node of this sub expression.  */
1545          if (pmatch[reg_num].rm_so < cur_idx)
1546            {
1547              pmatch[reg_num].rm_eo = cur_idx;
1548              /* This is a non-empty match or we are not inside an optional
1549                 subexpression.  Accept this right away.  */
1550              memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1551            }
1552          else
1553            {
1554              if (dfa->nodes[cur_node].opt_subexp
1555                  && prev_idx_match[reg_num].rm_so != -1)
1556                /* We transited through an empty match for an optional
1557                   subexpression, like (a?)*, and this is not the subexp's
1558                   first match.  Copy back the old content of the registers
1559                   so that matches of an inner subexpression are undone as
1560                   well, like in ((a?))*.  */
1561                memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1562              else
1563                /* We completed a subexpression, but it may be part of
1564                   an optional one, so do not update PREV_IDX_MATCH.  */
1565                pmatch[reg_num].rm_eo = cur_idx;
1566            }
1567        }
1568    }
1569}
1570
1571/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1572   and sift the nodes in each states according to the following rules.
1573   Updated state_log will be wrote to STATE_LOG.
1574
1575   Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1576     1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1577        If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1578        the LAST_NODE, we throw away the node `a'.
1579     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1580        string `s' and transit to `b':
1581        i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1582           away the node `a'.
1583        ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1584            thrown away, we throw away the node `a'.
1585     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1586        i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1587           node `a'.
1588        ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1589            we throw away the node `a'.  */
1590
1591#define STATE_NODE_CONTAINS(state,node) \
1592  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1593
1594static reg_errcode_t
1595internal_function
1596sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1597{
1598  reg_errcode_t err;
1599  int null_cnt = 0;
1600  int str_idx = sctx->last_str_idx;
1601  re_node_set cur_dest;
1602
1603#ifdef DEBUG
1604  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1605#endif
1606
1607  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1608     transit to the last_node and the last_node itself.  */
1609  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1610  if (BE (err != REG_NOERROR, 0))
1611    return err;
1612  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1613  if (BE (err != REG_NOERROR, 0))
1614    goto free_return;
1615
1616  /* Then check each states in the state_log.  */
1617  while (str_idx > 0)
1618    {
1619      /* Update counters.  */
1620      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1621      if (null_cnt > mctx->max_mb_elem_len)
1622        {
1623          memset (sctx->sifted_states, '\0',
1624                  sizeof (re_dfastate_t *) * str_idx);
1625          re_node_set_free (&cur_dest);
1626          return REG_NOERROR;
1627        }
1628      re_node_set_empty (&cur_dest);
1629      --str_idx;
1630
1631      if (mctx->state_log[str_idx])
1632        {
1633          err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1634          if (BE (err != REG_NOERROR, 0))
1635            goto free_return;
1636        }
1637
1638      /* Add all the nodes which satisfy the following conditions:
1639         - It can epsilon transit to a node in CUR_DEST.
1640         - It is in CUR_SRC.
1641         And update state_log.  */
1642      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1643      if (BE (err != REG_NOERROR, 0))
1644        goto free_return;
1645    }
1646  err = REG_NOERROR;
1647 free_return:
1648  re_node_set_free (&cur_dest);
1649  return err;
1650}
1651
1652static reg_errcode_t
1653internal_function
1654build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1655                     int str_idx, re_node_set *cur_dest)
1656{
1657  const re_dfa_t *const dfa = mctx->dfa;
1658  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1659  int i;
1660
1661  /* Then build the next sifted state.
1662     We build the next sifted state on `cur_dest', and update
1663     `sifted_states[str_idx]' with `cur_dest'.
1664     Note:
1665     `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1666     `cur_src' points the node_set of the old `state_log[str_idx]'
1667     (with the epsilon nodes pre-filtered out).  */
1668  for (i = 0; i < cur_src->nelem; i++)
1669    {
1670      int prev_node = cur_src->elems[i];
1671      int naccepted = 0;
1672      int ret;
1673
1674#ifdef DEBUG
1675      re_token_type_t type = dfa->nodes[prev_node].type;
1676      assert (!IS_EPSILON_NODE (type));
1677#endif
1678#ifdef RE_ENABLE_I18N
1679      /* If the node may accept `multi byte'.  */
1680      if (dfa->nodes[prev_node].accept_mb)
1681        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1682                                         str_idx, sctx->last_str_idx);
1683#endif /* RE_ENABLE_I18N */
1684
1685      /* We don't check backreferences here.
1686         See update_cur_sifted_state().  */
1687      if (!naccepted
1688          && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1689          && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1690                                  dfa->nexts[prev_node]))
1691        naccepted = 1;
1692
1693      if (naccepted == 0)
1694        continue;
1695
1696      if (sctx->limits.nelem)
1697        {
1698          int to_idx = str_idx + naccepted;
1699          if (check_dst_limits (mctx, &sctx->limits,
1700                                dfa->nexts[prev_node], to_idx,
1701                                prev_node, str_idx))
1702            continue;
1703        }
1704      ret = re_node_set_insert (cur_dest, prev_node);
1705      if (BE (ret == -1, 0))
1706        return REG_ESPACE;
1707    }
1708
1709  return REG_NOERROR;
1710}
1711
1712/* Helper functions.  */
1713
1714static reg_errcode_t
1715internal_function
1716clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1717{
1718  int top = mctx->state_log_top;
1719
1720  if (next_state_log_idx >= mctx->input.bufs_len
1721      || (next_state_log_idx >= mctx->input.valid_len
1722          && mctx->input.valid_len < mctx->input.len))
1723    {
1724      reg_errcode_t err;
1725      err = extend_buffers (mctx);
1726      if (BE (err != REG_NOERROR, 0))
1727        return err;
1728    }
1729
1730  if (top < next_state_log_idx)
1731    {
1732      memset (mctx->state_log + top + 1, '\0',
1733              sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1734      mctx->state_log_top = next_state_log_idx;
1735    }
1736  return REG_NOERROR;
1737}
1738
1739static reg_errcode_t
1740internal_function
1741merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1742                   re_dfastate_t **src, int num)
1743{
1744  int st_idx;
1745  reg_errcode_t err;
1746  for (st_idx = 0; st_idx < num; ++st_idx)
1747    {
1748      if (dst[st_idx] == NULL)
1749        dst[st_idx] = src[st_idx];
1750      else if (src[st_idx] != NULL)
1751        {
1752          re_node_set merged_set;
1753          err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1754                                        &src[st_idx]->nodes);
1755          if (BE (err != REG_NOERROR, 0))
1756            return err;
1757          dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1758          re_node_set_free (&merged_set);
1759          if (BE (err != REG_NOERROR, 0))
1760            return err;
1761        }
1762    }
1763  return REG_NOERROR;
1764}
1765
1766static reg_errcode_t
1767internal_function
1768update_cur_sifted_state (const re_match_context_t *mctx,
1769                         re_sift_context_t *sctx, int str_idx,
1770                         re_node_set *dest_nodes)
1771{
1772  const re_dfa_t *const dfa = mctx->dfa;
1773  reg_errcode_t err = REG_NOERROR;
1774  const re_node_set *candidates;
1775  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1776                : &mctx->state_log[str_idx]->nodes);
1777
1778  if (dest_nodes->nelem == 0)
1779    sctx->sifted_states[str_idx] = NULL;
1780  else
1781    {
1782      if (candidates)
1783        {
1784          /* At first, add the nodes which can epsilon transit to a node in
1785             DEST_NODE.  */
1786          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1787          if (BE (err != REG_NOERROR, 0))
1788            return err;
1789
1790          /* Then, check the limitations in the current sift_context.  */
1791          if (sctx->limits.nelem)
1792            {
1793              err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1794                                         mctx->bkref_ents, str_idx);
1795              if (BE (err != REG_NOERROR, 0))
1796                return err;
1797            }
1798        }
1799
1800      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1801      if (BE (err != REG_NOERROR, 0))
1802        return err;
1803    }
1804
1805  if (candidates && mctx->state_log[str_idx]->has_backref)
1806    {
1807      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1808      if (BE (err != REG_NOERROR, 0))
1809        return err;
1810    }
1811  return REG_NOERROR;
1812}
1813
1814static reg_errcode_t
1815internal_function
1816add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1817                       const re_node_set *candidates)
1818{
1819  reg_errcode_t err = REG_NOERROR;
1820  int i;
1821
1822  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1823  if (BE (err != REG_NOERROR, 0))
1824    return err;
1825
1826  if (!state->inveclosure.alloc)
1827    {
1828      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1829      if (BE (err != REG_NOERROR, 0))
1830        return REG_ESPACE;
1831      for (i = 0; i < dest_nodes->nelem; i++)
1832        {
1833          err = re_node_set_merge (&state->inveclosure,
1834                                   dfa->inveclosures + dest_nodes->elems[i]);
1835          if (BE (err != REG_NOERROR, 0))
1836            return REG_ESPACE;
1837        }
1838    }
1839  return re_node_set_add_intersect (dest_nodes, candidates,
1840                                    &state->inveclosure);
1841}
1842
1843static reg_errcode_t
1844internal_function
1845sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1846                       const re_node_set *candidates)
1847{
1848    int ecl_idx;
1849    reg_errcode_t err;
1850    re_node_set *inv_eclosure = dfa->inveclosures + node;
1851    re_node_set except_nodes;
1852    re_node_set_init_empty (&except_nodes);
1853    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854      {
1855        int cur_node = inv_eclosure->elems[ecl_idx];
1856        if (cur_node == node)
1857          continue;
1858        if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1859          {
1860            int edst1 = dfa->edests[cur_node].elems[0];
1861            int edst2 = ((dfa->edests[cur_node].nelem > 1)
1862                         ? dfa->edests[cur_node].elems[1] : -1);
1863            if ((!re_node_set_contains (inv_eclosure, edst1)
1864                 && re_node_set_contains (dest_nodes, edst1))
1865                || (edst2 > 0
1866                    && !re_node_set_contains (inv_eclosure, edst2)
1867                    && re_node_set_contains (dest_nodes, edst2)))
1868              {
1869                err = re_node_set_add_intersect (&except_nodes, candidates,
1870                                                 dfa->inveclosures + cur_node);
1871                if (BE (err != REG_NOERROR, 0))
1872                  {
1873                    re_node_set_free (&except_nodes);
1874                    return err;
1875                  }
1876              }
1877          }
1878      }
1879    for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880      {
1881        int cur_node = inv_eclosure->elems[ecl_idx];
1882        if (!re_node_set_contains (&except_nodes, cur_node))
1883          {
1884            int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1885            re_node_set_remove_at (dest_nodes, idx);
1886          }
1887      }
1888    re_node_set_free (&except_nodes);
1889    return REG_NOERROR;
1890}
1891
1892static int
1893internal_function
1894check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1895                  int dst_node, int dst_idx, int src_node, int src_idx)
1896{
1897  const re_dfa_t *const dfa = mctx->dfa;
1898  int lim_idx, src_pos, dst_pos;
1899
1900  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1901  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1902  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1903    {
1904      int subexp_idx;
1905      struct re_backref_cache_entry *ent;
1906      ent = mctx->bkref_ents + limits->elems[lim_idx];
1907      subexp_idx = dfa->nodes[ent->node].opr.idx;
1908
1909      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1910                                           subexp_idx, dst_node, dst_idx,
1911                                           dst_bkref_idx);
1912      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1913                                           subexp_idx, src_node, src_idx,
1914                                           src_bkref_idx);
1915
1916      /* In case of:
1917         <src> <dst> ( <subexp> )
1918         ( <subexp> ) <src> <dst>
1919         ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1920      if (src_pos == dst_pos)
1921        continue; /* This is unrelated limitation.  */
1922      else
1923        return 1;
1924    }
1925  return 0;
1926}
1927
1928static int
1929internal_function
1930check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1931                             int subexp_idx, int from_node, int bkref_idx)
1932{
1933  const re_dfa_t *const dfa = mctx->dfa;
1934  const re_node_set *eclosures = dfa->eclosures + from_node;
1935  int node_idx;
1936
1937  /* Else, we are on the boundary: examine the nodes on the epsilon
1938     closure.  */
1939  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1940    {
1941      int node = eclosures->elems[node_idx];
1942      switch (dfa->nodes[node].type)
1943        {
1944        case OP_BACK_REF:
1945          if (bkref_idx != -1)
1946            {
1947              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1948              do
1949                {
1950                  int dst, cpos;
1951
1952                  if (ent->node != node)
1953                    continue;
1954
1955                  if (subexp_idx < BITSET_WORD_BITS
1956                      && !(ent->eps_reachable_subexps_map
1957                           & ((bitset_word_t) 1 << subexp_idx)))
1958                    continue;
1959
1960                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961                     OP_CLOSE_SUBEXP cases below.  But, if the
1962                     destination node is the same node as the source
1963                     node, don't recurse because it would cause an
1964                     infinite loop: a regex that exhibits this behavior
1965                     is ()\1*\1*  */
1966                  dst = dfa->edests[node].elems[0];
1967                  if (dst == from_node)
1968                    {
1969                      if (boundaries & 1)
1970                        return -1;
1971                      else /* if (boundaries & 2) */
1972                        return 0;
1973                    }
1974
1975                  cpos =
1976                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1977                                                 dst, bkref_idx);
1978                  if (cpos == -1 /* && (boundaries & 1) */)
1979                    return -1;
1980                  if (cpos == 0 && (boundaries & 2))
1981                    return 0;
1982
1983                  if (subexp_idx < BITSET_WORD_BITS)
1984                    ent->eps_reachable_subexps_map
1985                      &= ~((bitset_word_t) 1 << subexp_idx);
1986                }
1987              while (ent++->more);
1988            }
1989          break;
1990
1991        case OP_OPEN_SUBEXP:
1992          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1993            return -1;
1994          break;
1995
1996        case OP_CLOSE_SUBEXP:
1997          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1998            return 0;
1999          break;
2000
2001        default:
2002            break;
2003        }
2004    }
2005
2006  return (boundaries & 2) ? 1 : 0;
2007}
2008
2009static int
2010internal_function
2011check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2012                           int subexp_idx, int from_node, int str_idx,
2013                           int bkref_idx)
2014{
2015  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2016  int boundaries;
2017
2018  /* If we are outside the range of the subexpression, return -1 or 1.  */
2019  if (str_idx < lim->subexp_from)
2020    return -1;
2021
2022  if (lim->subexp_to < str_idx)
2023    return 1;
2024
2025  /* If we are within the subexpression, return 0.  */
2026  boundaries = (str_idx == lim->subexp_from);
2027  boundaries |= (str_idx == lim->subexp_to) << 1;
2028  if (boundaries == 0)
2029    return 0;
2030
2031  /* Else, examine epsilon closure.  */
2032  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2033                                      from_node, bkref_idx);
2034}
2035
2036/* Check the limitations of sub expressions LIMITS, and remove the nodes
2037   which are against limitations from DEST_NODES. */
2038
2039static reg_errcode_t
2040internal_function
2041check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2042                     const re_node_set *candidates, re_node_set *limits,
2043                     struct re_backref_cache_entry *bkref_ents, int str_idx)
2044{
2045  reg_errcode_t err;
2046  int node_idx, lim_idx;
2047
2048  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2049    {
2050      int subexp_idx;
2051      struct re_backref_cache_entry *ent;
2052      ent = bkref_ents + limits->elems[lim_idx];
2053
2054      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2055        continue; /* This is unrelated limitation.  */
2056
2057      subexp_idx = dfa->nodes[ent->node].opr.idx;
2058      if (ent->subexp_to == str_idx)
2059        {
2060          int ops_node = -1;
2061          int cls_node = -1;
2062          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2063            {
2064              int node = dest_nodes->elems[node_idx];
2065              re_token_type_t type = dfa->nodes[node].type;
2066              if (type == OP_OPEN_SUBEXP
2067                  && subexp_idx == dfa->nodes[node].opr.idx)
2068                ops_node = node;
2069              else if (type == OP_CLOSE_SUBEXP
2070                       && subexp_idx == dfa->nodes[node].opr.idx)
2071                cls_node = node;
2072            }
2073
2074          /* Check the limitation of the open subexpression.  */
2075          /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2076          if (ops_node >= 0)
2077            {
2078              err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2079                                           candidates);
2080              if (BE (err != REG_NOERROR, 0))
2081                return err;
2082            }
2083
2084          /* Check the limitation of the close subexpression.  */
2085          if (cls_node >= 0)
2086            for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2087              {
2088                int node = dest_nodes->elems[node_idx];
2089                if (!re_node_set_contains (dfa->inveclosures + node,
2090                                           cls_node)
2091                    && !re_node_set_contains (dfa->eclosures + node,
2092                                              cls_node))
2093                  {
2094                    /* It is against this limitation.
2095                       Remove it form the current sifted state.  */
2096                    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2097                                                 candidates);
2098                    if (BE (err != REG_NOERROR, 0))
2099                      return err;
2100                    --node_idx;
2101                  }
2102              }
2103        }
2104      else /* (ent->subexp_to != str_idx)  */
2105        {
2106          for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2107            {
2108              int node = dest_nodes->elems[node_idx];
2109              re_token_type_t type = dfa->nodes[node].type;
2110              if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2111                {
2112                  if (subexp_idx != dfa->nodes[node].opr.idx)
2113                    continue;
2114                  /* It is against this limitation.
2115                     Remove it form the current sifted state.  */
2116                  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2117                                               candidates);
2118                  if (BE (err != REG_NOERROR, 0))
2119                    return err;
2120                }
2121            }
2122        }
2123    }
2124  return REG_NOERROR;
2125}
2126
2127static reg_errcode_t
2128internal_function
2129sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2130                   int str_idx, const re_node_set *candidates)
2131{
2132  const re_dfa_t *const dfa = mctx->dfa;
2133  reg_errcode_t err;
2134  int node_idx, node;
2135  re_sift_context_t local_sctx;
2136  int first_idx = search_cur_bkref_entry (mctx, str_idx);
2137
2138  if (first_idx == -1)
2139    return REG_NOERROR;
2140
2141  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2142
2143  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2144    {
2145      int enabled_idx;
2146      re_token_type_t type;
2147      struct re_backref_cache_entry *entry;
2148      node = candidates->elems[node_idx];
2149      type = dfa->nodes[node].type;
2150      /* Avoid infinite loop for the REs like "()\1+".  */
2151      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2152        continue;
2153      if (type != OP_BACK_REF)
2154        continue;
2155
2156      entry = mctx->bkref_ents + first_idx;
2157      enabled_idx = first_idx;
2158      do
2159        {
2160          int subexp_len;
2161          int to_idx;
2162          int dst_node;
2163          int ret;
2164          re_dfastate_t *cur_state;
2165
2166          if (entry->node != node)
2167            continue;
2168          subexp_len = entry->subexp_to - entry->subexp_from;
2169          to_idx = str_idx + subexp_len;
2170          dst_node = (subexp_len ? dfa->nexts[node]
2171                      : dfa->edests[node].elems[0]);
2172
2173          if (to_idx > sctx->last_str_idx
2174              || sctx->sifted_states[to_idx] == NULL
2175              || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2176              || check_dst_limits (mctx, &sctx->limits, node,
2177                                   str_idx, dst_node, to_idx))
2178            continue;
2179
2180          if (local_sctx.sifted_states == NULL)
2181            {
2182              local_sctx = *sctx;
2183              err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2184              if (BE (err != REG_NOERROR, 0))
2185                goto free_return;
2186            }
2187          local_sctx.last_node = node;
2188          local_sctx.last_str_idx = str_idx;
2189          ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2190          if (BE (ret < 0, 0))
2191            {
2192              err = REG_ESPACE;
2193              goto free_return;
2194            }
2195          cur_state = local_sctx.sifted_states[str_idx];
2196          err = sift_states_backward (mctx, &local_sctx);
2197          if (BE (err != REG_NOERROR, 0))
2198            goto free_return;
2199          if (sctx->limited_states != NULL)
2200            {
2201              err = merge_state_array (dfa, sctx->limited_states,
2202                                       local_sctx.sifted_states,
2203                                       str_idx + 1);
2204              if (BE (err != REG_NOERROR, 0))
2205                goto free_return;
2206            }
2207          local_sctx.sifted_states[str_idx] = cur_state;
2208          re_node_set_remove (&local_sctx.limits, enabled_idx);
2209
2210          /* mctx->bkref_ents may have changed, reload the pointer.  */
2211          entry = mctx->bkref_ents + enabled_idx;
2212        }
2213      while (enabled_idx++, entry++->more);
2214    }
2215  err = REG_NOERROR;
2216 free_return:
2217  if (local_sctx.sifted_states != NULL)
2218    {
2219      re_node_set_free (&local_sctx.limits);
2220    }
2221
2222  return err;
2223}
2224
2225
2226#ifdef RE_ENABLE_I18N
2227static int
2228internal_function
2229sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2230                     int node_idx, int str_idx, int max_str_idx)
2231{
2232  const re_dfa_t *const dfa = mctx->dfa;
2233  int naccepted;
2234  /* Check the node can accept `multi byte'.  */
2235  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2236  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2237      !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2238                            dfa->nexts[node_idx]))
2239    /* The node can't accept the `multi byte', or the
2240       destination was already thrown away, then the node
2241       couldn't accept the current input `multi byte'.   */
2242    naccepted = 0;
2243  /* Otherwise, it is sure that the node could accept
2244     `naccepted' bytes input.  */
2245  return naccepted;
2246}
2247#endif /* RE_ENABLE_I18N */
2248
2249\f
2250/* Functions for state transition.  */
2251
2252/* Return the next state to which the current state STATE will transit by
2253   accepting the current input byte, and update STATE_LOG if necessary.
2254   If STATE can accept a multibyte char/collating element/back reference
2255   update the destination of STATE_LOG.  */
2256
2257static re_dfastate_t *
2258internal_function
2259transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2260               re_dfastate_t *state)
2261{
2262  re_dfastate_t **trtable;
2263  unsigned char ch;
2264
2265#ifdef RE_ENABLE_I18N
2266  /* If the current state can accept multibyte.  */
2267  if (BE (state->accept_mb, 0))
2268    {
2269      *err = transit_state_mb (mctx, state);
2270      if (BE (*err != REG_NOERROR, 0))
2271        return NULL;
2272    }
2273#endif /* RE_ENABLE_I18N */
2274
2275  /* Then decide the next state with the single byte.  */
2276#if 0
2277  if (0)
2278    /* don't use transition table  */
2279    return transit_state_sb (err, mctx, state);
2280#endif
2281
2282  /* Use transition table  */
2283  ch = re_string_fetch_byte (&mctx->input);
2284  for (;;)
2285    {
2286      trtable = state->trtable;
2287      if (BE (trtable != NULL, 1))
2288        return trtable[ch];
2289
2290      trtable = state->word_trtable;
2291      if (BE (trtable != NULL, 1))
2292        {
2293          unsigned int context;
2294          context
2295            = re_string_context_at (&mctx->input,
2296                                    re_string_cur_idx (&mctx->input) - 1,
2297                                    mctx->eflags);
2298          if (IS_WORD_CONTEXT (context))
2299            return trtable[ch + SBC_MAX];
2300          else
2301            return trtable[ch];
2302        }
2303
2304      if (!build_trtable (mctx->dfa, state))
2305        {
2306          *err = REG_ESPACE;
2307          return NULL;
2308        }
2309
2310      /* Retry, we now have a transition table.  */
2311    }
2312}
2313
2314/* Update the state_log if we need */
2315static re_dfastate_t *
2316internal_function
2317merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2318                      re_dfastate_t *next_state)
2319{
2320  const re_dfa_t *const dfa = mctx->dfa;
2321  int cur_idx = re_string_cur_idx (&mctx->input);
2322
2323  if (cur_idx > mctx->state_log_top)
2324    {
2325      mctx->state_log[cur_idx] = next_state;
2326      mctx->state_log_top = cur_idx;
2327    }
2328  else if (mctx->state_log[cur_idx] == NULL)
2329    {
2330      mctx->state_log[cur_idx] = next_state;
2331    }
2332  else
2333    {
2334      re_dfastate_t *pstate;
2335      unsigned int context;
2336      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2337      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338         the destination of a multibyte char/collating element/
2339         back reference.  Then the next state is the union set of
2340         these destinations and the results of the transition table.  */
2341      pstate = mctx->state_log[cur_idx];
2342      log_nodes = pstate->entrance_nodes;
2343      if (next_state != NULL)
2344        {
2345          table_nodes = next_state->entrance_nodes;
2346          *err = re_node_set_init_union (&next_nodes, table_nodes,
2347                                             log_nodes);
2348          if (BE (*err != REG_NOERROR, 0))
2349            return NULL;
2350        }
2351      else
2352        next_nodes = *log_nodes;
2353      /* Note: We already add the nodes of the initial state,
2354         then we don't need to add them here.  */
2355
2356      context = re_string_context_at (&mctx->input,
2357                                      re_string_cur_idx (&mctx->input) - 1,
2358                                      mctx->eflags);
2359      next_state = mctx->state_log[cur_idx]
2360        = re_acquire_state_context (err, dfa, &next_nodes, context);
2361      /* We don't need to check errors here, since the return value of
2362         this function is next_state and ERR is already set.  */
2363
2364      if (table_nodes != NULL)
2365        re_node_set_free (&next_nodes);
2366    }
2367
2368  if (BE (dfa->nbackref, 0) && next_state != NULL)
2369    {
2370      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371         later.  We must check them here, since the back references in the
2372         next state might use them.  */
2373      *err = check_subexp_matching_top (mctx, &next_state->nodes,
2374                                        cur_idx);
2375      if (BE (*err != REG_NOERROR, 0))
2376        return NULL;
2377
2378      /* If the next state has back references.  */
2379      if (next_state->has_backref)
2380        {
2381          *err = transit_state_bkref (mctx, &next_state->nodes);
2382          if (BE (*err != REG_NOERROR, 0))
2383            return NULL;
2384          next_state = mctx->state_log[cur_idx];
2385        }
2386    }
2387
2388  return next_state;
2389}
2390
2391/* Skip bytes in the input that correspond to part of a
2392   multi-byte match, then look in the log for a state
2393   from which to restart matching.  */
2394static re_dfastate_t *
2395internal_function
2396find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2397{
2398  re_dfastate_t *cur_state;
2399  do
2400    {
2401      int max = mctx->state_log_top;
2402      int cur_str_idx = re_string_cur_idx (&mctx->input);
2403
2404      do
2405        {
2406          if (++cur_str_idx > max)
2407            return NULL;
2408          re_string_skip_bytes (&mctx->input, 1);
2409        }
2410      while (mctx->state_log[cur_str_idx] == NULL);
2411
2412      cur_state = merge_state_with_log (err, mctx, NULL);
2413    }
2414  while (*err == REG_NOERROR && cur_state == NULL);
2415  return cur_state;
2416}
2417
2418/* Helper functions for transit_state.  */
2419
2420/* From the node set CUR_NODES, pick up the nodes whose types are
2421   OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422   expression. And register them to use them later for evaluating the
2423   correspoding back references.  */
2424
2425static reg_errcode_t
2426internal_function
2427check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2428                           int str_idx)
2429{
2430  const re_dfa_t *const dfa = mctx->dfa;
2431  int node_idx;
2432  reg_errcode_t err;
2433
2434  /* TODO: This isn't efficient.
2435           Because there might be more than one nodes whose types are
2436           OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2437           nodes.
2438           E.g. RE: (a){2}  */
2439  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2440    {
2441      int node = cur_nodes->elems[node_idx];
2442      if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2443          && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2444          && (dfa->used_bkref_map
2445              & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2446        {
2447          err = match_ctx_add_subtop (mctx, node, str_idx);
2448          if (BE (err != REG_NOERROR, 0))
2449            return err;
2450        }
2451    }
2452  return REG_NOERROR;
2453}
2454
2455#if 0
2456/* Return the next state to which the current state STATE will transit by
2457   accepting the current input byte.  */
2458
2459static re_dfastate_t *
2460transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2461                  re_dfastate_t *state)
2462{
2463  const re_dfa_t *const dfa = mctx->dfa;
2464  re_node_set next_nodes;
2465  re_dfastate_t *next_state;
2466  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2467  unsigned int context;
2468
2469  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2470  if (BE (*err != REG_NOERROR, 0))
2471    return NULL;
2472  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2473    {
2474      int cur_node = state->nodes.elems[node_cnt];
2475      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2476        {
2477          *err = re_node_set_merge (&next_nodes,
2478                                    dfa->eclosures + dfa->nexts[cur_node]);
2479          if (BE (*err != REG_NOERROR, 0))
2480            {
2481              re_node_set_free (&next_nodes);
2482              return NULL;
2483            }
2484        }
2485    }
2486  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2487  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2488  /* We don't need to check errors here, since the return value of
2489     this function is next_state and ERR is already set.  */
2490
2491  re_node_set_free (&next_nodes);
2492  re_string_skip_bytes (&mctx->input, 1);
2493  return next_state;
2494}
2495#endif
2496
2497#ifdef RE_ENABLE_I18N
2498static reg_errcode_t
2499internal_function
2500transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2501{
2502  const re_dfa_t *const dfa = mctx->dfa;
2503  reg_errcode_t err;
2504  int i;
2505
2506  for (i = 0; i < pstate->nodes.nelem; ++i)
2507    {
2508      re_node_set dest_nodes, *new_nodes;
2509      int cur_node_idx = pstate->nodes.elems[i];
2510      int naccepted, dest_idx;
2511      unsigned int context;
2512      re_dfastate_t *dest_state;
2513
2514      if (!dfa->nodes[cur_node_idx].accept_mb)
2515        continue;
2516
2517      if (dfa->nodes[cur_node_idx].constraint)
2518        {
2519          context = re_string_context_at (&mctx->input,
2520                                          re_string_cur_idx (&mctx->input),
2521                                          mctx->eflags);
2522          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2523                                           context))
2524            continue;
2525        }
2526
2527      /* How many bytes the node can accept?  */
2528      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2529                                           re_string_cur_idx (&mctx->input));
2530      if (naccepted == 0)
2531        continue;
2532
2533      /* The node can accepts `naccepted' bytes.  */
2534      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2535      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2536                               : mctx->max_mb_elem_len);
2537      err = clean_state_log_if_needed (mctx, dest_idx);
2538      if (BE (err != REG_NOERROR, 0))
2539        return err;
2540#ifdef DEBUG
2541      assert (dfa->nexts[cur_node_idx] != -1);
2542#endif
2543      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2544
2545      dest_state = mctx->state_log[dest_idx];
2546      if (dest_state == NULL)
2547        dest_nodes = *new_nodes;
2548      else
2549        {
2550          err = re_node_set_init_union (&dest_nodes,
2551                                        dest_state->entrance_nodes, new_nodes);
2552          if (BE (err != REG_NOERROR, 0))
2553            return err;
2554        }
2555      context = re_string_context_at (&mctx->input, dest_idx - 1,
2556                                      mctx->eflags);
2557      mctx->state_log[dest_idx]
2558        = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2559      if (dest_state != NULL)
2560        re_node_set_free (&dest_nodes);
2561      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2562        return err;
2563    }
2564  return REG_NOERROR;
2565}
2566#endif /* RE_ENABLE_I18N */
2567
2568static reg_errcode_t
2569internal_function
2570transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2571{
2572  const re_dfa_t *const dfa = mctx->dfa;
2573  reg_errcode_t err;
2574  int i;
2575  int cur_str_idx = re_string_cur_idx (&mctx->input);
2576
2577  for (i = 0; i < nodes->nelem; ++i)
2578    {
2579      int dest_str_idx, prev_nelem, bkc_idx;
2580      int node_idx = nodes->elems[i];
2581      unsigned int context;
2582      const re_token_t *node = dfa->nodes + node_idx;
2583      re_node_set *new_dest_nodes;
2584
2585      /* Check whether `node' is a backreference or not.  */
2586      if (node->type != OP_BACK_REF)
2587        continue;
2588
2589      if (node->constraint)
2590        {
2591          context = re_string_context_at (&mctx->input, cur_str_idx,
2592                                          mctx->eflags);
2593          if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2594            continue;
2595        }
2596
2597      /* `node' is a backreference.
2598         Check the substring which the substring matched.  */
2599      bkc_idx = mctx->nbkref_ents;
2600      err = get_subexp (mctx, node_idx, cur_str_idx);
2601      if (BE (err != REG_NOERROR, 0))
2602        goto free_return;
2603
2604      /* And add the epsilon closures (which is `new_dest_nodes') of
2605         the backreference to appropriate state_log.  */
2606#ifdef DEBUG
2607      assert (dfa->nexts[node_idx] != -1);
2608#endif
2609      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2610        {
2611          int subexp_len;
2612          re_dfastate_t *dest_state;
2613          struct re_backref_cache_entry *bkref_ent;
2614          bkref_ent = mctx->bkref_ents + bkc_idx;
2615          if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2616            continue;
2617          subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2618          new_dest_nodes = (subexp_len == 0
2619                            ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2620                            : dfa->eclosures + dfa->nexts[node_idx]);
2621          dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2622                          - bkref_ent->subexp_from);
2623          context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2624                                          mctx->eflags);
2625          dest_state = mctx->state_log[dest_str_idx];
2626          prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2627                        : mctx->state_log[cur_str_idx]->nodes.nelem);
2628          /* Add `new_dest_node' to state_log.  */
2629          if (dest_state == NULL)
2630            {
2631              mctx->state_log[dest_str_idx]
2632                = re_acquire_state_context (&err, dfa, new_dest_nodes,
2633                                            context);
2634              if (BE (mctx->state_log[dest_str_idx] == NULL
2635                      && err != REG_NOERROR, 0))
2636                goto free_return;
2637            }
2638          else
2639            {
2640              re_node_set dest_nodes;
2641              err = re_node_set_init_union (&dest_nodes,
2642                                            dest_state->entrance_nodes,
2643                                            new_dest_nodes);
2644              if (BE (err != REG_NOERROR, 0))
2645                {
2646                  re_node_set_free (&dest_nodes);
2647                  goto free_return;
2648                }
2649              mctx->state_log[dest_str_idx]
2650                = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2651              re_node_set_free (&dest_nodes);
2652              if (BE (mctx->state_log[dest_str_idx] == NULL
2653                      && err != REG_NOERROR, 0))
2654                goto free_return;
2655            }
2656          /* We need to check recursively if the backreference can epsilon
2657             transit.  */
2658          if (subexp_len == 0
2659              && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2660            {
2661              err = check_subexp_matching_top (mctx, new_dest_nodes,
2662                                               cur_str_idx);
2663              if (BE (err != REG_NOERROR, 0))
2664                goto free_return;
2665              err = transit_state_bkref (mctx, new_dest_nodes);
2666              if (BE (err != REG_NOERROR, 0))
2667                goto free_return;
2668            }
2669        }
2670    }
2671  err = REG_NOERROR;
2672 free_return:
2673  return err;
2674}
2675
2676/* Enumerate all the candidates which the backreference BKREF_NODE can match
2677   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678   Note that we might collect inappropriate candidates here.
2679   However, the cost of checking them strictly here is too high, then we
2680   delay these checking for prune_impossible_nodes().  */
2681
2682static reg_errcode_t
2683internal_function
2684get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2685{
2686  const re_dfa_t *const dfa = mctx->dfa;
2687  int subexp_num, sub_top_idx;
2688  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2689  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2690  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2691  if (cache_idx != -1)
2692    {
2693      const struct re_backref_cache_entry *entry
2694        = mctx->bkref_ents + cache_idx;
2695      do
2696        if (entry->node == bkref_node)
2697          return REG_NOERROR; /* We already checked it.  */
2698      while (entry++->more);
2699    }
2700
2701  subexp_num = dfa->nodes[bkref_node].opr.idx;
2702
2703  /* For each sub expression  */
2704  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2705    {
2706      reg_errcode_t err;
2707      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2708      re_sub_match_last_t *sub_last;
2709      int sub_last_idx, sl_str, bkref_str_off;
2710
2711      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2712        continue; /* It isn't related.  */
2713
2714      sl_str = sub_top->str_idx;
2715      bkref_str_off = bkref_str_idx;
2716      /* At first, check the last node of sub expressions we already
2717         evaluated.  */
2718      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2719        {
2720          int sl_str_diff;
2721          sub_last = sub_top->lasts[sub_last_idx];
2722          sl_str_diff = sub_last->str_idx - sl_str;
2723          /* The matched string by the sub expression match with the substring
2724             at the back reference?  */
2725          if (sl_str_diff > 0)
2726            {
2727              if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2728                {
2729                  /* Not enough chars for a successful match.  */
2730                  if (bkref_str_off + sl_str_diff > mctx->input.len)
2731                    break;
2732
2733                  err = clean_state_log_if_needed (mctx,
2734                                                   bkref_str_off
2735                                                   + sl_str_diff);
2736                  if (BE (err != REG_NOERROR, 0))
2737                    return err;
2738                  buf = (const char *) re_string_get_buffer (&mctx->input);
2739                }
2740              if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2741                /* We don't need to search this sub expression any more.  */
2742                break;
2743            }
2744          bkref_str_off += sl_str_diff;
2745          sl_str += sl_str_diff;
2746          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2747                                bkref_str_idx);
2748
2749          /* Reload buf, since the preceding call might have reallocated
2750             the buffer.  */
2751          buf = (const char *) re_string_get_buffer (&mctx->input);
2752
2753          if (err == REG_NOMATCH)
2754            continue;
2755          if (BE (err != REG_NOERROR, 0))
2756            return err;
2757        }
2758
2759      if (sub_last_idx < sub_top->nlasts)
2760        continue;
2761      if (sub_last_idx > 0)
2762        ++sl_str;
2763      /* Then, search for the other last nodes of the sub expression.  */
2764      for (; sl_str <= bkref_str_idx; ++sl_str)
2765        {
2766          int cls_node, sl_str_off;
2767          const re_node_set *nodes;
2768          sl_str_off = sl_str - sub_top->str_idx;
2769          /* The matched string by the sub expression match with the substring
2770             at the back reference?  */
2771          if (sl_str_off > 0)
2772            {
2773              if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2774                {
2775                  /* If we are at the end of the input, we cannot match.  */
2776                  if (bkref_str_off >= mctx->input.len)
2777                    break;
2778
2779                  err = extend_buffers (mctx);
2780                  if (BE (err != REG_NOERROR, 0))
2781                    return err;
2782
2783                  buf = (const char *) re_string_get_buffer (&mctx->input);
2784                }
2785              if (buf [bkref_str_off++] != buf[sl_str - 1])
2786                break; /* We don't need to search this sub expression
2787                          any more.  */
2788            }
2789          if (mctx->state_log[sl_str] == NULL)
2790            continue;
2791          /* Does this state have a ')' of the sub expression?  */
2792          nodes = &mctx->state_log[sl_str]->nodes;
2793          cls_node = find_subexp_node (dfa, nodes, subexp_num,
2794                                       OP_CLOSE_SUBEXP);
2795          if (cls_node == -1)
2796            continue; /* No.  */
2797          if (sub_top->path == NULL)
2798            {
2799              sub_top->path = calloc (sizeof (state_array_t),
2800                                      sl_str - sub_top->str_idx + 1);
2801              if (sub_top->path == NULL)
2802                return REG_ESPACE;
2803            }
2804          /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805             in the current context?  */
2806          err = check_arrival (mctx, sub_top->path, sub_top->node,
2807                               sub_top->str_idx, cls_node, sl_str,
2808                               OP_CLOSE_SUBEXP);
2809          if (err == REG_NOMATCH)
2810              continue;
2811          if (BE (err != REG_NOERROR, 0))
2812              return err;
2813          sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2814          if (BE (sub_last == NULL, 0))
2815            return REG_ESPACE;
2816          err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2817                                bkref_str_idx);
2818          if (err == REG_NOMATCH)
2819            continue;
2820        }
2821    }
2822  return REG_NOERROR;
2823}
2824
2825/* Helper functions for get_subexp().  */
2826
2827/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828   If it can arrive, register the sub expression expressed with SUB_TOP
2829   and SUB_LAST.  */
2830
2831static reg_errcode_t
2832internal_function
2833get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2834                re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2835{
2836  reg_errcode_t err;
2837  int to_idx;
2838  /* Can the subexpression arrive the back reference?  */
2839  err = check_arrival (mctx, &sub_last->path, sub_last->node,
2840                       sub_last->str_idx, bkref_node, bkref_str,
2841                       OP_OPEN_SUBEXP);
2842  if (err != REG_NOERROR)
2843    return err;
2844  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2845                             sub_last->str_idx);
2846  if (BE (err != REG_NOERROR, 0))
2847    return err;
2848  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2849  return clean_state_log_if_needed (mctx, to_idx);
2850}
2851
2852/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853   Search '(' if FL_OPEN, or search ')' otherwise.
2854   TODO: This function isn't efficient...
2855         Because there might be more than one nodes whose types are
2856         OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857         nodes.
2858         E.g. RE: (a){2}  */
2859
2860static int
2861internal_function
2862find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2863                  int subexp_idx, int type)
2864{
2865  int cls_idx;
2866  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2867    {
2868      int cls_node = nodes->elems[cls_idx];
2869      const re_token_t *node = dfa->nodes + cls_node;
2870      if (node->type == type
2871          && node->opr.idx == subexp_idx)
2872        return cls_node;
2873    }
2874  return -1;
2875}
2876
2877/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2879   heavily reused.
2880   Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2881
2882static reg_errcode_t
2883internal_function
2884check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2885               int top_str, int last_node, int last_str, int type)
2886{
2887  const re_dfa_t *const dfa = mctx->dfa;
2888  reg_errcode_t err = REG_NOERROR;
2889  int subexp_num, backup_cur_idx, str_idx, null_cnt;
2890  re_dfastate_t *cur_state = NULL;
2891  re_node_set *cur_nodes, next_nodes;
2892  re_dfastate_t **backup_state_log;
2893  unsigned int context;
2894
2895  subexp_num = dfa->nodes[top_node].opr.idx;
2896  /* Extend the buffer if we need.  */
2897  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2898    {
2899      re_dfastate_t **new_array;
2900      int old_alloc = path->alloc;
2901      path->alloc += last_str + mctx->max_mb_elem_len + 1;
2902      new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2903      if (BE (new_array == NULL, 0))
2904        {
2905          path->alloc = old_alloc;
2906          return REG_ESPACE;
2907        }
2908      path->array = new_array;
2909      memset (new_array + old_alloc, '\0',
2910              sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2911    }
2912
2913  str_idx = path->next_idx ? path->next_idx : top_str;
2914
2915  /* Temporary modify MCTX.  */
2916  backup_state_log = mctx->state_log;
2917  backup_cur_idx = mctx->input.cur_idx;
2918  mctx->state_log = path->array;
2919  mctx->input.cur_idx = str_idx;
2920
2921  /* Setup initial node set.  */
2922  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2923  if (str_idx == top_str)
2924    {
2925      err = re_node_set_init_1 (&next_nodes, top_node);
2926      if (BE (err != REG_NOERROR, 0))
2927        return err;
2928      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2929      if (BE (err != REG_NOERROR, 0))
2930        {
2931          re_node_set_free (&next_nodes);
2932          return err;
2933        }
2934    }
2935  else
2936    {
2937      cur_state = mctx->state_log[str_idx];
2938      if (cur_state && cur_state->has_backref)
2939        {
2940          err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2941          if (BE (err != REG_NOERROR, 0))
2942            return err;
2943        }
2944      else
2945        re_node_set_init_empty (&next_nodes);
2946    }
2947  if (str_idx == top_str || (cur_state && cur_state->has_backref))
2948    {
2949      if (next_nodes.nelem)
2950        {
2951          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2952                                    subexp_num, type);
2953          if (BE (err != REG_NOERROR, 0))
2954            {
2955              re_node_set_free (&next_nodes);
2956              return err;
2957            }
2958        }
2959      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2960      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2961        {
2962          re_node_set_free (&next_nodes);
2963          return err;
2964        }
2965      mctx->state_log[str_idx] = cur_state;
2966    }
2967
2968  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2969    {
2970      re_node_set_empty (&next_nodes);
2971      if (mctx->state_log[str_idx + 1])
2972        {
2973          err = re_node_set_merge (&next_nodes,
2974                                   &mctx->state_log[str_idx + 1]->nodes);
2975          if (BE (err != REG_NOERROR, 0))
2976            {
2977              re_node_set_free (&next_nodes);
2978              return err;
2979            }
2980        }
2981      if (cur_state)
2982        {
2983          err = check_arrival_add_next_nodes (mctx, str_idx,
2984                                              &cur_state->non_eps_nodes,
2985                                              &next_nodes);
2986          if (BE (err != REG_NOERROR, 0))
2987            {
2988              re_node_set_free (&next_nodes);
2989              return err;
2990            }
2991        }
2992      ++str_idx;
2993      if (next_nodes.nelem)
2994        {
2995          err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2996          if (BE (err != REG_NOERROR, 0))
2997            {
2998              re_node_set_free (&next_nodes);
2999              return err;
3000            }
3001          err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3002                                    subexp_num, type);
3003          if (BE (err != REG_NOERROR, 0))
3004            {
3005              re_node_set_free (&next_nodes);
3006              return err;
3007            }
3008        }
3009      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3010      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3011      if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3012        {
3013          re_node_set_free (&next_nodes);
3014          return err;
3015        }
3016      mctx->state_log[str_idx] = cur_state;
3017      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3018    }
3019  re_node_set_free (&next_nodes);
3020  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3021               : &mctx->state_log[last_str]->nodes);
3022  path->next_idx = str_idx;
3023
3024  /* Fix MCTX.  */
3025  mctx->state_log = backup_state_log;
3026  mctx->input.cur_idx = backup_cur_idx;
3027
3028  /* Then check the current node set has the node LAST_NODE.  */
3029  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3030    return REG_NOERROR;
3031
3032  return REG_NOMATCH;
3033}
3034
3035/* Helper functions for check_arrival.  */
3036
3037/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3038   to NEXT_NODES.
3039   TODO: This function is similar to the functions transit_state*(),
3040         however this function has many additional works.
3041         Can't we unify them?  */
3042
3043static reg_errcode_t
3044internal_function
3045check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3046                              re_node_set *cur_nodes, re_node_set *next_nodes)
3047{
3048  const re_dfa_t *const dfa = mctx->dfa;
3049  int result;
3050  int cur_idx;
3051#ifdef RE_ENABLE_I18N
3052  reg_errcode_t err = REG_NOERROR;
3053#endif
3054  re_node_set union_set;
3055  re_node_set_init_empty (&union_set);
3056  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3057    {
3058      int naccepted = 0;
3059      int cur_node = cur_nodes->elems[cur_idx];
3060#ifdef DEBUG
3061      re_token_type_t type = dfa->nodes[cur_node].type;
3062      assert (!IS_EPSILON_NODE (type));
3063#endif
3064#ifdef RE_ENABLE_I18N
3065      /* If the node may accept `multi byte'.  */
3066      if (dfa->nodes[cur_node].accept_mb)
3067        {
3068          naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3069                                               str_idx);
3070          if (naccepted > 1)
3071            {
3072              re_dfastate_t *dest_state;
3073              int next_node = dfa->nexts[cur_node];
3074              int next_idx = str_idx + naccepted;
3075              dest_state = mctx->state_log[next_idx];
3076              re_node_set_empty (&union_set);
3077              if (dest_state)
3078                {
3079                  err = re_node_set_merge (&union_set, &dest_state->nodes);
3080                  if (BE (err != REG_NOERROR, 0))
3081                    {
3082                      re_node_set_free (&union_set);
3083                      return err;
3084                    }
3085                }
3086              result = re_node_set_insert (&union_set, next_node);
3087              if (BE (result < 0, 0))
3088                {
3089                  re_node_set_free (&union_set);
3090                  return REG_ESPACE;
3091                }
3092              mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3093                                                            &union_set);
3094              if (BE (mctx->state_log[next_idx] == NULL
3095                      && err != REG_NOERROR, 0))
3096                {
3097                  re_node_set_free (&union_set);
3098                  return err;
3099                }
3100            }
3101        }
3102#endif /* RE_ENABLE_I18N */
3103      if (naccepted
3104          || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3105        {
3106          result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3107          if (BE (result < 0, 0))
3108            {
3109              re_node_set_free (&union_set);
3110              return REG_ESPACE;
3111            }
3112        }
3113    }
3114  re_node_set_free (&union_set);
3115  return REG_NOERROR;
3116}
3117
3118/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3119   CUR_NODES, however exclude the nodes which are:
3120    - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3121    - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3122*/
3123
3124static reg_errcode_t
3125internal_function
3126check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3127                          int ex_subexp, int type)
3128{
3129  reg_errcode_t err;
3130  int idx, outside_node;
3131  re_node_set new_nodes;
3132#ifdef DEBUG
3133  assert (cur_nodes->nelem);
3134#endif
3135  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3136  if (BE (err != REG_NOERROR, 0))
3137    return err;
3138  /* Create a new node set NEW_NODES with the nodes which are epsilon
3139     closures of the node in CUR_NODES.  */
3140
3141  for (idx = 0; idx < cur_nodes->nelem; ++idx)
3142    {
3143      int cur_node = cur_nodes->elems[idx];
3144      const re_node_set *eclosure = dfa->eclosures + cur_node;
3145      outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3146      if (outside_node == -1)
3147        {
3148          /* There are no problematic nodes, just merge them.  */
3149          err = re_node_set_merge (&new_nodes, eclosure);
3150          if (BE (err != REG_NOERROR, 0))
3151            {
3152              re_node_set_free (&new_nodes);
3153              return err;
3154            }
3155        }
3156      else
3157        {
3158          /* There are problematic nodes, re-calculate incrementally.  */
3159          err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3160                                              ex_subexp, type);
3161          if (BE (err != REG_NOERROR, 0))
3162            {
3163              re_node_set_free (&new_nodes);
3164              return err;
3165            }
3166        }
3167    }
3168  re_node_set_free (cur_nodes);
3169  *cur_nodes = new_nodes;
3170  return REG_NOERROR;
3171}
3172
3173/* Helper function for check_arrival_expand_ecl.
3174   Check incrementally the epsilon closure of TARGET, and if it isn't
3175   problematic append it to DST_NODES.  */
3176
3177static reg_errcode_t
3178internal_function
3179check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3180                              int target, int ex_subexp, int type)
3181{
3182  int cur_node;
3183  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3184    {
3185      int err;
3186
3187      if (dfa->nodes[cur_node].type == type
3188          && dfa->nodes[cur_node].opr.idx == ex_subexp)
3189        {
3190          if (type == OP_CLOSE_SUBEXP)
3191            {
3192              err = re_node_set_insert (dst_nodes, cur_node);
3193              if (BE (err == -1, 0))
3194                return REG_ESPACE;
3195            }
3196          break;
3197        }
3198      err = re_node_set_insert (dst_nodes, cur_node);
3199      if (BE (err == -1, 0))
3200        return REG_ESPACE;
3201      if (dfa->edests[cur_node].nelem == 0)
3202        break;
3203      if (dfa->edests[cur_node].nelem == 2)
3204        {
3205          err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3206                                              dfa->edests[cur_node].elems[1],
3207                                              ex_subexp, type);
3208          if (BE (err != REG_NOERROR, 0))
3209            return err;
3210        }
3211      cur_node = dfa->edests[cur_node].elems[0];
3212    }
3213  return REG_NOERROR;
3214}
3215
3216
3217/* For all the back references in the current state, calculate the
3218   destination of the back references by the appropriate entry
3219   in MCTX->BKREF_ENTS.  */
3220
3221static reg_errcode_t
3222internal_function
3223expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3224                    int cur_str, int subexp_num, int type)
3225{
3226  const re_dfa_t *const dfa = mctx->dfa;
3227  reg_errcode_t err;
3228  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3229  struct re_backref_cache_entry *ent;
3230
3231  if (cache_idx_start == -1)
3232    return REG_NOERROR;
3233
3234 restart:
3235  ent = mctx->bkref_ents + cache_idx_start;
3236  do
3237    {
3238      int to_idx, next_node;
3239
3240      /* Is this entry ENT is appropriate?  */
3241      if (!re_node_set_contains (cur_nodes, ent->node))
3242        continue; /* No.  */
3243
3244      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3245      /* Calculate the destination of the back reference, and append it
3246         to MCTX->STATE_LOG.  */
3247      if (to_idx == cur_str)
3248        {
3249          /* The backreference did epsilon transit, we must re-check all the
3250             node in the current state.  */
3251          re_node_set new_dests;
3252          reg_errcode_t err2, err3;
3253          next_node = dfa->edests[ent->node].elems[0];
3254          if (re_node_set_contains (cur_nodes, next_node))
3255            continue;
3256          err = re_node_set_init_1 (&new_dests, next_node);
3257          err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3258          err3 = re_node_set_merge (cur_nodes, &new_dests);
3259          re_node_set_free (&new_dests);
3260          if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3261                  || err3 != REG_NOERROR, 0))
3262            {
3263              err = (err != REG_NOERROR ? err
3264                     : (err2 != REG_NOERROR ? err2 : err3));
3265              return err;
3266            }
3267          /* TODO: It is still inefficient...  */
3268          goto restart;
3269        }
3270      else
3271        {
3272          re_node_set union_set;
3273          next_node = dfa->nexts[ent->node];
3274          if (mctx->state_log[to_idx])
3275            {
3276              int ret;
3277              if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3278                                        next_node))
3279                continue;
3280              err = re_node_set_init_copy (&union_set,
3281                                           &mctx->state_log[to_idx]->nodes);
3282              ret = re_node_set_insert (&union_set, next_node);
3283              if (BE (err != REG_NOERROR || ret < 0, 0))
3284                {
3285                  re_node_set_free (&union_set);
3286                  err = err != REG_NOERROR ? err : REG_ESPACE;
3287                  return err;
3288                }
3289            }
3290          else
3291            {
3292              err = re_node_set_init_1 (&union_set, next_node);
3293              if (BE (err != REG_NOERROR, 0))
3294                return err;
3295            }
3296          mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3297          re_node_set_free (&union_set);
3298          if (BE (mctx->state_log[to_idx] == NULL
3299                  && err != REG_NOERROR, 0))
3300            return err;
3301        }
3302    }
3303  while (ent++->more);
3304  return REG_NOERROR;
3305}
3306
3307/* Build transition table for the state.
3308   Return 1 if succeeded, otherwise return NULL.  */
3309
3310static int
3311internal_function
3312build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3313{
3314  reg_errcode_t err;
3315  int i, j, ch, need_word_trtable = 0;
3316  bitset_word_t elem, mask;
3317  bool dests_node_malloced = false;
3318  bool dest_states_malloced = false;
3319  int ndests; /* Number of the destination states from `state'.  */
3320  re_dfastate_t **trtable;
3321  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3322  re_node_set follows, *dests_node;
3323  bitset_t *dests_ch;
3324  bitset_t acceptable;
3325
3326  struct dests_alloc
3327  {
3328    re_node_set dests_node[SBC_MAX];
3329    bitset_t dests_ch[SBC_MAX];
3330  } *dests_alloc;
3331
3332  /* We build DFA states which corresponds to the destination nodes
3333     from `state'.  `dests_node[i]' represents the nodes which i-th
3334     destination state contains, and `dests_ch[i]' represents the
3335     characters which i-th destination state accepts.  */
3336#ifdef HAVE_ALLOCA
3337  if (__libc_use_alloca (sizeof (struct dests_alloc)))
3338    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3339  else
3340#endif
3341    {
3342      dests_alloc = re_malloc (struct dests_alloc, 1);
3343      if (BE (dests_alloc == NULL, 0))
3344        return 0;
3345      dests_node_malloced = true;
3346    }
3347  dests_node = dests_alloc->dests_node;
3348  dests_ch = dests_alloc->dests_ch;
3349
3350  /* Initialize transiton table.  */
3351  state->word_trtable = state->trtable = NULL;
3352
3353  /* At first, group all nodes belonging to `state' into several
3354     destinations.  */
3355  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3356  if (BE (ndests <= 0, 0))
3357    {
3358      if (dests_node_malloced)
3359        free (dests_alloc);
3360      /* Return 0 in case of an error, 1 otherwise.  */
3361      if (ndests == 0)
3362        {
3363          state->trtable = (re_dfastate_t **)
3364            calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365          return 1;
3366        }
3367      return 0;
3368    }
3369
3370  err = re_node_set_alloc (&follows, ndests + 1);
3371  if (BE (err != REG_NOERROR, 0))
3372    goto out_free;
3373
3374  /* Avoid arithmetic overflow in size calculation.  */
3375  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3376            / (3 * sizeof (re_dfastate_t *)))
3377           < ndests),
3378          0))
3379    goto out_free;
3380
3381#ifdef HAVE_ALLOCA
3382  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3383                         + ndests * 3 * sizeof (re_dfastate_t *)))
3384    dest_states = (re_dfastate_t **)
3385      alloca (ndests * 3 * sizeof (re_dfastate_t *));
3386  else
3387#endif
3388    {
3389      dest_states = (re_dfastate_t **)
3390        malloc (ndests * 3 * sizeof (re_dfastate_t *));
3391      if (BE (dest_states == NULL, 0))
3392        {
3393out_free:
3394          if (dest_states_malloced)
3395            free (dest_states);
3396          re_node_set_free (&follows);
3397          for (i = 0; i < ndests; ++i)
3398            re_node_set_free (dests_node + i);
3399          if (dests_node_malloced)
3400            free (dests_alloc);
3401          return 0;
3402        }
3403      dest_states_malloced = true;
3404    }
3405  dest_states_word = dest_states + ndests;
3406  dest_states_nl = dest_states_word + ndests;
3407  bitset_empty (acceptable);
3408
3409  /* Then build the states for all destinations.  */
3410  for (i = 0; i < ndests; ++i)
3411    {
3412      int next_node;
3413      re_node_set_empty (&follows);
3414      /* Merge the follows of this destination states.  */
3415      for (j = 0; j < dests_node[i].nelem; ++j)
3416        {
3417          next_node = dfa->nexts[dests_node[i].elems[j]];
3418          if (next_node != -1)
3419            {
3420              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3421              if (BE (err != REG_NOERROR, 0))
3422                goto out_free;
3423            }
3424        }
3425      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3426      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3427        goto out_free;
3428      /* If the new state has context constraint,
3429         build appropriate states for these contexts.  */
3430      if (dest_states[i]->has_constraint)
3431        {
3432          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3433                                                          CONTEXT_WORD);
3434          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3435            goto out_free;
3436
3437          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3438            need_word_trtable = 1;
3439
3440          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3441                                                        CONTEXT_NEWLINE);
3442          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3443            goto out_free;
3444        }
3445      else
3446        {
3447          dest_states_word[i] = dest_states[i];
3448          dest_states_nl[i] = dest_states[i];
3449        }
3450      bitset_merge (acceptable, dests_ch[i]);
3451    }
3452
3453  if (!BE (need_word_trtable, 0))
3454    {
3455      /* We don't care about whether the following character is a word
3456         character, or we are in a single-byte character set so we can
3457         discern by looking at the character code: allocate a
3458         256-entry transition table.  */
3459      trtable = state->trtable =
3460        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3461      if (BE (trtable == NULL, 0))
3462        goto out_free;
3463
3464      /* For all characters ch...:  */
3465      for (i = 0; i < BITSET_WORDS; ++i)
3466        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3467             elem;
3468             mask <<= 1, elem >>= 1, ++ch)
3469          if (BE (elem & 1, 0))
3470            {
3471              /* There must be exactly one destination which accepts
3472                 character ch.  See group_nodes_into_DFAstates.  */
3473              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3474                ;
3475
3476              /* j-th destination accepts the word character ch.  */
3477              if (dfa->word_char[i] & mask)
3478                trtable[ch] = dest_states_word[j];
3479              else
3480                trtable[ch] = dest_states[j];
3481            }
3482    }
3483  else
3484    {
3485      /* We care about whether the following character is a word
3486         character, and we are in a multi-byte character set: discern
3487         by looking at the character code: build two 256-entry
3488         transition tables, one starting at trtable[0] and one
3489         starting at trtable[SBC_MAX].  */
3490      trtable = state->word_trtable =
3491        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3492      if (BE (trtable == NULL, 0))
3493        goto out_free;
3494
3495      /* For all characters ch...:  */
3496      for (i = 0; i < BITSET_WORDS; ++i)
3497        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3498             elem;
3499             mask <<= 1, elem >>= 1, ++ch)
3500          if (BE (elem & 1, 0))
3501            {
3502              /* There must be exactly one destination which accepts
3503                 character ch.  See group_nodes_into_DFAstates.  */
3504              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3505                ;
3506
3507              /* j-th destination accepts the word character ch.  */
3508              trtable[ch] = dest_states[j];
3509              trtable[ch + SBC_MAX] = dest_states_word[j];
3510            }
3511    }
3512
3513  /* new line */
3514  if (bitset_contain (acceptable, NEWLINE_CHAR))
3515    {
3516      /* The current state accepts newline character.  */
3517      for (j = 0; j < ndests; ++j)
3518        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3519          {
3520            /* k-th destination accepts newline character.  */
3521            trtable[NEWLINE_CHAR] = dest_states_nl[j];
3522            if (need_word_trtable)
3523              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3524            /* There must be only one destination which accepts
3525               newline.  See group_nodes_into_DFAstates.  */
3526            break;
3527          }
3528    }
3529
3530  if (dest_states_malloced)
3531    free (dest_states);
3532
3533  re_node_set_free (&follows);
3534  for (i = 0; i < ndests; ++i)
3535    re_node_set_free (dests_node + i);
3536
3537  if (dests_node_malloced)
3538    free (dests_alloc);
3539
3540  return 1;
3541}
3542
3543/* Group all nodes belonging to STATE into several destinations.
3544   Then for all destinations, set the nodes belonging to the destination
3545   to DESTS_NODE[i] and set the characters accepted by the destination
3546   to DEST_CH[i].  This function return the number of destinations.  */
3547
3548static int
3549internal_function
3550group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3551                            re_node_set *dests_node, bitset_t *dests_ch)
3552{
3553  reg_errcode_t err;
3554  int result;
3555  int i, j, k;
3556  int ndests; /* Number of the destinations from `state'.  */
3557  bitset_t accepts; /* Characters a node can accept.  */
3558  const re_node_set *cur_nodes = &state->nodes;
3559  bitset_empty (accepts);
3560  ndests = 0;
3561
3562  /* For all the nodes belonging to `state',  */
3563  for (i = 0; i < cur_nodes->nelem; ++i)
3564    {
3565      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3566      re_token_type_t type = node->type;
3567      unsigned int constraint = node->constraint;
3568
3569      /* Enumerate all single byte character this node can accept.  */
3570      if (type == CHARACTER)
3571        bitset_set (accepts, node->opr.c);
3572      else if (type == SIMPLE_BRACKET)
3573        {
3574          bitset_merge (accepts, node->opr.sbcset);
3575        }
3576      else if (type == OP_PERIOD)
3577        {
3578#ifdef RE_ENABLE_I18N
3579          if (dfa->mb_cur_max > 1)
3580            bitset_merge (accepts, dfa->sb_char);
3581          else
3582#endif
3583            bitset_set_all (accepts);
3584          if (!(dfa->syntax & RE_DOT_NEWLINE))
3585            bitset_clear (accepts, '\n');
3586          if (dfa->syntax & RE_DOT_NOT_NULL)
3587            bitset_clear (accepts, '\0');
3588        }
3589#ifdef RE_ENABLE_I18N
3590      else if (type == OP_UTF8_PERIOD)
3591        {
3592          memset (accepts, '\xff', sizeof (bitset_t) / 2);
3593          if (!(dfa->syntax & RE_DOT_NEWLINE))
3594            bitset_clear (accepts, '\n');
3595          if (dfa->syntax & RE_DOT_NOT_NULL)
3596            bitset_clear (accepts, '\0');
3597        }
3598#endif
3599      else
3600        continue;
3601
3602      /* Check the `accepts' and sift the characters which are not
3603         match it the context.  */
3604      if (constraint)
3605        {
3606          if (constraint & NEXT_NEWLINE_CONSTRAINT)
3607            {
3608              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3609              bitset_empty (accepts);
3610              if (accepts_newline)
3611                bitset_set (accepts, NEWLINE_CHAR);
3612              else
3613                continue;
3614            }
3615          if (constraint & NEXT_ENDBUF_CONSTRAINT)
3616            {
3617              bitset_empty (accepts);
3618              continue;
3619            }
3620
3621          if (constraint & NEXT_WORD_CONSTRAINT)
3622            {
3623              bitset_word_t any_set = 0;
3624              if (type == CHARACTER && !node->word_char)
3625                {
3626                  bitset_empty (accepts);
3627                  continue;
3628                }
3629#ifdef RE_ENABLE_I18N
3630              if (dfa->mb_cur_max > 1)
3631                for (j = 0; j < BITSET_WORDS; ++j)
3632                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3633              else
3634#endif
3635                for (j = 0; j < BITSET_WORDS; ++j)
3636                  any_set |= (accepts[j] &= dfa->word_char[j]);
3637              if (!any_set)
3638                continue;
3639            }
3640          if (constraint & NEXT_NOTWORD_CONSTRAINT)
3641            {
3642              bitset_word_t any_set = 0;
3643              if (type == CHARACTER && node->word_char)
3644                {
3645                  bitset_empty (accepts);
3646                  continue;
3647                }
3648#ifdef RE_ENABLE_I18N
3649              if (dfa->mb_cur_max > 1)
3650                for (j = 0; j < BITSET_WORDS; ++j)
3651                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3652              else
3653#endif
3654                for (j = 0; j < BITSET_WORDS; ++j)
3655                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3656              if (!any_set)
3657                continue;
3658            }
3659        }
3660
3661      /* Then divide `accepts' into DFA states, or create a new
3662         state.  Above, we make sure that accepts is not empty.  */
3663      for (j = 0; j < ndests; ++j)
3664        {
3665          bitset_t intersec; /* Intersection sets, see below.  */
3666          bitset_t remains;
3667          /* Flags, see below.  */
3668          bitset_word_t has_intersec, not_subset, not_consumed;
3669
3670          /* Optimization, skip if this state doesn't accept the character.  */
3671          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3672            continue;
3673
3674          /* Enumerate the intersection set of this state and `accepts'.  */
3675          has_intersec = 0;
3676          for (k = 0; k < BITSET_WORDS; ++k)
3677            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3678          /* And skip if the intersection set is empty.  */
3679          if (!has_intersec)
3680            continue;
3681
3682          /* Then check if this state is a subset of `accepts'.  */
3683          not_subset = not_consumed = 0;
3684          for (k = 0; k < BITSET_WORDS; ++k)
3685            {
3686              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3687              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3688            }
3689
3690          /* If this state isn't a subset of `accepts', create a
3691             new group state, which has the `remains'. */
3692          if (not_subset)
3693            {
3694              bitset_copy (dests_ch[ndests], remains);
3695              bitset_copy (dests_ch[j], intersec);
3696              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3697              if (BE (err != REG_NOERROR, 0))
3698                goto error_return;
3699              ++ndests;
3700            }
3701
3702          /* Put the position in the current group. */
3703          result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3704          if (BE (result < 0, 0))
3705            goto error_return;
3706
3707          /* If all characters are consumed, go to next node. */
3708          if (!not_consumed)
3709            break;
3710        }
3711      /* Some characters remain, create a new group. */
3712      if (j == ndests)
3713        {
3714          bitset_copy (dests_ch[ndests], accepts);
3715          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3716          if (BE (err != REG_NOERROR, 0))
3717            goto error_return;
3718          ++ndests;
3719          bitset_empty (accepts);
3720        }
3721    }
3722  return ndests;
3723 error_return:
3724  for (j = 0; j < ndests; ++j)
3725    re_node_set_free (dests_node + j);
3726  return -1;
3727}
3728
3729#ifdef RE_ENABLE_I18N
3730/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3731   Return the number of the bytes the node accepts.
3732   STR_IDX is the current index of the input string.
3733
3734   This function handles the nodes which can accept one character, or
3735   one collating element like '.', '[a-z]', opposite to the other nodes
3736   can only accept one byte.  */
3737
3738static int
3739internal_function
3740check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3741                         const re_string_t *input, int str_idx)
3742{
3743  const re_token_t *node = dfa->nodes + node_idx;
3744  int char_len, elem_len;
3745  int i;
3746  wint_t wc;
3747
3748  if (BE (node->type == OP_UTF8_PERIOD, 0))
3749    {
3750      unsigned char c = re_string_byte_at (input, str_idx), d;
3751      if (BE (c < 0xc2, 1))
3752        return 0;
3753
3754      if (str_idx + 2 > input->len)
3755        return 0;
3756
3757      d = re_string_byte_at (input, str_idx + 1);
3758      if (c < 0xe0)
3759        return (d < 0x80 || d > 0xbf) ? 0 : 2;
3760      else if (c < 0xf0)
3761        {
3762          char_len = 3;
3763          if (c == 0xe0 && d < 0xa0)
3764            return 0;
3765        }
3766      else if (c < 0xf8)
3767        {
3768          char_len = 4;
3769          if (c == 0xf0 && d < 0x90)
3770            return 0;
3771        }
3772      else if (c < 0xfc)
3773        {
3774          char_len = 5;
3775          if (c == 0xf8 && d < 0x88)
3776            return 0;
3777        }
3778      else if (c < 0xfe)
3779        {
3780          char_len = 6;
3781          if (c == 0xfc && d < 0x84)
3782            return 0;
3783        }
3784      else
3785        return 0;
3786
3787      if (str_idx + char_len > input->len)
3788        return 0;
3789
3790      for (i = 1; i < char_len; ++i)
3791        {
3792          d = re_string_byte_at (input, str_idx + i);
3793          if (d < 0x80 || d > 0xbf)
3794            return 0;
3795        }
3796      return char_len;
3797    }
3798
3799  char_len = re_string_char_size_at (input, str_idx);
3800  if (node->type == OP_PERIOD)
3801    {
3802      if (char_len <= 1)
3803        return 0;
3804      /* FIXME: I don't think this if is needed, as both '\n'
3805         and '\0' are char_len == 1.  */
3806      /* '.' accepts any one character except the following two cases.  */
3807      if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3808           re_string_byte_at (input, str_idx) == '\n') ||
3809          ((dfa->syntax & RE_DOT_NOT_NULL) &&
3810           re_string_byte_at (input, str_idx) == '\0'))
3811        return 0;
3812      return char_len;
3813    }
3814
3815  elem_len = re_string_elem_size_at (input, str_idx);
3816  wc = __btowc(*(input->mbs+str_idx));
3817  if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3818    return 0;
3819
3820  if (node->type == COMPLEX_BRACKET)
3821    {
3822      const re_charset_t *cset = node->opr.mbcset;
3823# ifdef _LIBC
3824      const unsigned char *pin
3825        = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3826      int j;
3827      uint32_t nrules;
3828# endif /* _LIBC */
3829      int match_len = 0;
3830      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3831                    ? re_string_wchar_at (input, str_idx) : 0);
3832
3833      /* match with multibyte character?  */
3834      for (i = 0; i < cset->nmbchars; ++i)
3835        if (wc == cset->mbchars[i])
3836          {
3837            match_len = char_len;
3838            goto check_node_accept_bytes_match;
3839          }
3840      /* match with character_class?  */
3841      for (i = 0; i < cset->nchar_classes; ++i)
3842        {
3843          wctype_t wt = cset->char_classes[i];
3844          if (__iswctype (wc, wt))
3845            {
3846              match_len = char_len;
3847              goto check_node_accept_bytes_match;
3848            }
3849        }
3850
3851# ifdef _LIBC
3852      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3853      if (nrules != 0)
3854        {
3855          unsigned int in_collseq = 0;
3856          const int32_t *table, *indirect;
3857          const unsigned char *weights, *extra;
3858          const char *collseqwc;
3859          /* This #include defines a local function!  */
3860#  include <locale/weight.h>
3861
3862          /* match with collating_symbol?  */
3863          if (cset->ncoll_syms)
3864            extra = (const unsigned char *)
3865              _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3866          for (i = 0; i < cset->ncoll_syms; ++i)
3867            {
3868              const unsigned char *coll_sym = extra + cset->coll_syms[i];
3869              /* Compare the length of input collating element and
3870                 the length of current collating element.  */
3871              if (*coll_sym != elem_len)
3872                continue;
3873              /* Compare each bytes.  */
3874              for (j = 0; j < *coll_sym; j++)
3875                if (pin[j] != coll_sym[1 + j])
3876                  break;
3877              if (j == *coll_sym)
3878                {
3879                  /* Match if every bytes is equal.  */
3880                  match_len = j;
3881                  goto check_node_accept_bytes_match;
3882                }
3883            }
3884
3885          if (cset->nranges)
3886            {
3887              if (elem_len <= char_len)
3888                {
3889                  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3890                  in_collseq = __collseq_table_lookup (collseqwc, wc);
3891                }
3892              else
3893                in_collseq = find_collation_sequence_value (pin, elem_len);
3894            }
3895          /* match with range expression?  */
3896          for (i = 0; i < cset->nranges; ++i)
3897            if (cset->range_starts[i] <= in_collseq
3898                && in_collseq <= cset->range_ends[i])
3899              {
3900                match_len = elem_len;
3901                goto check_node_accept_bytes_match;
3902              }
3903
3904          /* match with equivalence_class?  */
3905          if (cset->nequiv_classes)
3906            {
3907              const unsigned char *cp = pin;
3908              table = (const int32_t *)
3909                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3910              weights = (const unsigned char *)
3911                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3912              extra = (const unsigned char *)
3913                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3914              indirect = (const int32_t *)
3915                _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3916              int32_t idx = findidx (&cp);
3917              if (idx > 0)
3918                for (i = 0; i < cset->nequiv_classes; ++i)
3919                  {
3920                    int32_t equiv_class_idx = cset->equiv_classes[i];
3921                    size_t weight_len = weights[idx & 0xffffff];
3922                    if (weight_len == weights[equiv_class_idx & 0xffffff]
3923                        && (idx >> 24) == (equiv_class_idx >> 24))
3924                      {
3925                        int cnt = 0;
3926
3927                        idx &= 0xffffff;
3928                        equiv_class_idx &= 0xffffff;
3929
3930                        while (cnt <= weight_len
3931                               && (weights[equiv_class_idx + 1 + cnt]
3932                                   == weights[idx + 1 + cnt]))
3933                          ++cnt;
3934                        if (cnt > weight_len)
3935                          {
3936                            match_len = elem_len;
3937                            goto check_node_accept_bytes_match;
3938                          }
3939                      }
3940                  }
3941            }
3942        }
3943      else
3944# endif /* _LIBC */
3945        {
3946          /* match with range expression?  */
3947#if __GNUC__ >= 2
3948          wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3949#else
3950          wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3951          cmp_buf[2] = wc;
3952#endif
3953          for (i = 0; i < cset->nranges; ++i)
3954            {
3955              cmp_buf[0] = cset->range_starts[i];
3956              cmp_buf[4] = cset->range_ends[i];
3957              if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3958                  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3959                {
3960                  match_len = char_len;
3961                  goto check_node_accept_bytes_match;
3962                }
3963            }
3964        }
3965    check_node_accept_bytes_match:
3966      if (!cset->non_match)
3967        return match_len;
3968      else
3969        {
3970          if (match_len > 0)
3971            return 0;
3972          else
3973            return (elem_len > char_len) ? elem_len : char_len;
3974        }
3975    }
3976  return 0;
3977}
3978
3979# ifdef _LIBC
3980static unsigned int
3981internal_function
3982find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3983{
3984  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3985  if (nrules == 0)
3986    {
3987      if (mbs_len == 1)
3988        {
3989          /* No valid character.  Match it as a single byte character.  */
3990          const unsigned char *collseq = (const unsigned char *)
3991            _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3992          return collseq[mbs[0]];
3993        }
3994      return UINT_MAX;
3995    }
3996  else
3997    {
3998      int32_t idx;
3999      const unsigned char *extra = (const unsigned char *)
4000        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4001      int32_t extrasize = (const unsigned char *)
4002        _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4003
4004      for (idx = 0; idx < extrasize;)
4005        {
4006          int mbs_cnt, found = 0;
4007          int32_t elem_mbs_len;
4008          /* Skip the name of collating element name.  */
4009          idx = idx + extra[idx] + 1;
4010          elem_mbs_len = extra[idx++];
4011          if (mbs_len == elem_mbs_len)
4012            {
4013              for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4014                if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4015                  break;
4016              if (mbs_cnt == elem_mbs_len)
4017                /* Found the entry.  */
4018                found = 1;
4019            }
4020          /* Skip the byte sequence of the collating element.  */
4021          idx += elem_mbs_len;
4022          /* Adjust for the alignment.  */
4023          idx = (idx + 3) & ~3;
4024          /* Skip the collation sequence value.  */
4025          idx += sizeof (uint32_t);
4026          /* Skip the wide char sequence of the collating element.  */
4027          idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4028          /* If we found the entry, return the sequence value.  */
4029          if (found)
4030            return *(uint32_t *) (extra + idx);
4031          /* Skip the collation sequence value.  */
4032          idx += sizeof (uint32_t);
4033        }
4034      return UINT_MAX;
4035    }
4036}
4037# endif /* _LIBC */
4038#endif /* RE_ENABLE_I18N */
4039
4040/* Check whether the node accepts the byte which is IDX-th
4041   byte of the INPUT.  */
4042
4043static int
4044internal_function
4045check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4046                   int idx)
4047{
4048  unsigned char ch;
4049  ch = re_string_byte_at (&mctx->input, idx);
4050  switch (node->type)
4051    {
4052    case CHARACTER:
4053      if (node->opr.c != ch)
4054        return 0;
4055      break;
4056
4057    case SIMPLE_BRACKET:
4058      if (!bitset_contain (node->opr.sbcset, ch))
4059        return 0;
4060      break;
4061
4062#ifdef RE_ENABLE_I18N
4063    case OP_UTF8_PERIOD:
4064      if (ch >= 0x80)
4065        return 0;
4066      /* FALLTHROUGH */
4067#endif
4068    case OP_PERIOD:
4069      if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4070          || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4071        return 0;
4072      break;
4073
4074    default:
4075      return 0;
4076    }
4077
4078  if (node->constraint)
4079    {
4080      /* The node has constraints.  Check whether the current context
4081         satisfies the constraints.  */
4082      unsigned int context = re_string_context_at (&mctx->input, idx,
4083                                                   mctx->eflags);
4084      if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4085        return 0;
4086    }
4087
4088  return 1;
4089}
4090
4091/* Extend the buffers, if the buffers have run out.  */
4092
4093static reg_errcode_t
4094internal_function
4095extend_buffers (re_match_context_t *mctx)
4096{
4097  reg_errcode_t ret;
4098  re_string_t *pstr = &mctx->input;
4099
4100  /* Avoid overflow.  */
4101  if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4102    return REG_ESPACE;
4103
4104  /* Double the lengths of the buffers.  */
4105  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4106  if (BE (ret != REG_NOERROR, 0))
4107    return ret;
4108
4109  if (mctx->state_log != NULL)
4110    {
4111      /* And double the length of state_log.  */
4112      /* XXX We have no indication of the size of this buffer.  If this
4113         allocation fail we have no indication that the state_log array
4114         does not have the right size.  */
4115      re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4116                                              pstr->bufs_len + 1);
4117      if (BE (new_array == NULL, 0))
4118        return REG_ESPACE;
4119      mctx->state_log = new_array;
4120    }
4121
4122  /* Then reconstruct the buffers.  */
4123  if (pstr->icase)
4124    {
4125#ifdef RE_ENABLE_I18N
4126      if (pstr->mb_cur_max > 1)
4127        {
4128          ret = build_wcs_upper_buffer (pstr);
4129          if (BE (ret != REG_NOERROR, 0))
4130            return ret;
4131        }
4132      else
4133#endif /* RE_ENABLE_I18N  */
4134        build_upper_buffer (pstr);
4135    }
4136  else
4137    {
4138#ifdef RE_ENABLE_I18N
4139      if (pstr->mb_cur_max > 1)
4140        build_wcs_buffer (pstr);
4141      else
4142#endif /* RE_ENABLE_I18N  */
4143        {
4144          if (pstr->trans != NULL)
4145            re_string_translate_buffer (pstr);
4146        }
4147    }
4148  return REG_NOERROR;
4149}
4150
4151\f
4152/* Functions for matching context.  */
4153
4154/* Initialize MCTX.  */
4155
4156static reg_errcode_t
4157internal_function
4158match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4159{
4160  mctx->eflags = eflags;
4161  mctx->match_last = -1;
4162  if (n > 0)
4163    {
4164      mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4165      mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4166      if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4167        return REG_ESPACE;
4168    }
4169  /* Already zero-ed by the caller.
4170     else
4171       mctx->bkref_ents = NULL;
4172     mctx->nbkref_ents = 0;
4173     mctx->nsub_tops = 0;  */
4174  mctx->abkref_ents = n;
4175  mctx->max_mb_elem_len = 1;
4176  mctx->asub_tops = n;
4177  return REG_NOERROR;
4178}
4179
4180/* Clean the entries which depend on the current input in MCTX.
4181   This function must be invoked when the matcher changes the start index
4182   of the input, or changes the input string.  */
4183
4184static void
4185internal_function
4186match_ctx_clean (re_match_context_t *mctx)
4187{
4188  int st_idx;
4189  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4190    {
4191      int sl_idx;
4192      re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4193      for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4194        {
4195          re_sub_match_last_t *last = top->lasts[sl_idx];
4196          re_free (last->path.array);
4197          re_free (last);
4198        }
4199      re_free (top->lasts);
4200      if (top->path)
4201        {
4202          re_free (top->path->array);
4203          re_free (top->path);
4204        }
4205      free (top);
4206    }
4207
4208  mctx->nsub_tops = 0;
4209  mctx->nbkref_ents = 0;
4210}
4211
4212/* Free all the memory associated with MCTX.  */
4213
4214static void
4215internal_function
4216match_ctx_free (re_match_context_t *mctx)
4217{
4218  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4219  match_ctx_clean (mctx);
4220  re_free (mctx->sub_tops);
4221  re_free (mctx->bkref_ents);
4222}
4223
4224/* Add a new backreference entry to MCTX.
4225   Note that we assume that caller never call this function with duplicate
4226   entry, and call with STR_IDX which isn't smaller than any existing entry.
4227*/
4228
4229static reg_errcode_t
4230internal_function
4231match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4232                     int to)
4233{
4234  if (mctx->nbkref_ents >= mctx->abkref_ents)
4235    {
4236      struct re_backref_cache_entry* new_entry;
4237      new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4238                              mctx->abkref_ents * 2);
4239      if (BE (new_entry == NULL, 0))
4240        {
4241          re_free (mctx->bkref_ents);
4242          return REG_ESPACE;
4243        }
4244      mctx->bkref_ents = new_entry;
4245      memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4246              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4247      mctx->abkref_ents *= 2;
4248    }
4249  if (mctx->nbkref_ents > 0
4250      && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4251    mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4252
4253  mctx->bkref_ents[mctx->nbkref_ents].node = node;
4254  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4255  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4256  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4257
4258  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4259     If bit N is clear, means that this entry won't epsilon-transition to
4260     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4261     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4262     such node.
4263
4264     A backreference does not epsilon-transition unless it is empty, so set
4265     to all zeros if FROM != TO.  */
4266  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4267    = (from == to ? ~0 : 0);
4268
4269  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4270  if (mctx->max_mb_elem_len < to - from)
4271    mctx->max_mb_elem_len = to - from;
4272  return REG_NOERROR;
4273}
4274
4275/* Search for the first entry which has the same str_idx, or -1 if none is
4276   found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4277
4278static int
4279internal_function
4280search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4281{
4282  int left, right, mid, last;
4283  last = right = mctx->nbkref_ents;
4284  for (left = 0; left < right;)
4285    {
4286      mid = left + (right - left) / 2;
4287      if (mctx->bkref_ents[mid].str_idx < str_idx)
4288        left = mid + 1;
4289      else
4290        right = mid;
4291    }
4292  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4293    return left;
4294  else
4295    return -1;
4296}
4297
4298/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4299   at STR_IDX.  */
4300
4301static reg_errcode_t
4302internal_function
4303match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4304{
4305#ifdef DEBUG
4306  assert (mctx->sub_tops != NULL);
4307  assert (mctx->asub_tops > 0);
4308#endif
4309  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4310    {
4311      int new_asub_tops = mctx->asub_tops * 2;
4312      re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4313                                                   re_sub_match_top_t *,
4314                                                   new_asub_tops);
4315      if (BE (new_array == NULL, 0))
4316        return REG_ESPACE;
4317      mctx->sub_tops = new_array;
4318      mctx->asub_tops = new_asub_tops;
4319    }
4320  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4321  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4322    return REG_ESPACE;
4323  mctx->sub_tops[mctx->nsub_tops]->node = node;
4324  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4325  return REG_NOERROR;
4326}
4327
4328/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4329   at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4330
4331static re_sub_match_last_t *
4332internal_function
4333match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4334{
4335  re_sub_match_last_t *new_entry;
4336  if (BE (subtop->nlasts == subtop->alasts, 0))
4337    {
4338      int new_alasts = 2 * subtop->alasts + 1;
4339      re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4340                                                    re_sub_match_last_t *,
4341                                                    new_alasts);
4342      if (BE (new_array == NULL, 0))
4343        return NULL;
4344      subtop->lasts = new_array;
4345      subtop->alasts = new_alasts;
4346    }
4347  new_entry = calloc (1, sizeof (re_sub_match_last_t));
4348  if (BE (new_entry != NULL, 1))
4349    {
4350      subtop->lasts[subtop->nlasts] = new_entry;
4351      new_entry->node = node;
4352      new_entry->str_idx = str_idx;
4353      ++subtop->nlasts;
4354    }
4355  return new_entry;
4356}
4357
4358static void
4359internal_function
4360sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4361               re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4362{
4363  sctx->sifted_states = sifted_sts;
4364  sctx->limited_states = limited_sts;
4365  sctx->last_node = last_node;
4366  sctx->last_str_idx = last_str_idx;
4367  re_node_set_init_empty (&sctx->limits);
4368}