Andrew's git - gitweb.git/blob - compat/regex/regex

   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, write to the Free
  18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19   02110-1301 USA.  */
  20
  21static void re_string_construct_common (const char *str, int len,
  22                                        re_string_t *pstr,
  23                                        RE_TRANSLATE_TYPE trans, int icase,
  24                                        const re_dfa_t *dfa) internal_function;
  25static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                          const re_node_set *nodes,
  27                                          unsigned int hash) internal_function;
  28static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                          const re_node_set *nodes,
  30                                          unsigned int context,
  31                                          unsigned int hash) internal_function;
  32
  33#ifdef GAWK
  34#undef MAX      /* safety */
  35static int
  36MAX(size_t a, size_t b)
  37{
  38        return (a > b ? a : b);
  39}
  40#endif
  41\f
  42/* Functions for string operation.  */
  43
  44/* This function allocate the buffers.  It is necessary to call
  45   re_string_reconstruct before using the object.  */
  46
  47static reg_errcode_t
  48internal_function
  49re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  50                    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  51{
  52  reg_errcode_t ret;
  53  int init_buf_len;
  54
  55  /* Ensure at least one character fits into the buffers.  */
  56  if (init_len < dfa->mb_cur_max)
  57    init_len = dfa->mb_cur_max;
  58  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  59  re_string_construct_common (str, len, pstr, trans, icase, dfa);
  60
  61  ret = re_string_realloc_buffers (pstr, init_buf_len);
  62  if (BE (ret != REG_NOERROR, 0))
  63    return ret;
  64
  65  pstr->word_char = dfa->word_char;
  66  pstr->word_ops_used = dfa->word_ops_used;
  67  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  68  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  69  pstr->valid_raw_len = pstr->valid_len;
  70  return REG_NOERROR;
  71}
  72
  73/* This function allocate the buffers, and initialize them.  */
  74
  75static reg_errcode_t
  76internal_function
  77re_string_construct (re_string_t *pstr, const char *str, int len,
  78                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  79{
  80  reg_errcode_t ret;
  81  memset (pstr, '\0', sizeof (re_string_t));
  82  re_string_construct_common (str, len, pstr, trans, icase, dfa);
  83
  84  if (len > 0)
  85    {
  86      ret = re_string_realloc_buffers (pstr, len + 1);
  87      if (BE (ret != REG_NOERROR, 0))
  88        return ret;
  89    }
  90  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  91
  92  if (icase)
  93    {
  94#ifdef RE_ENABLE_I18N
  95      if (dfa->mb_cur_max > 1)
  96        {
  97          while (1)
  98            {
  99              ret = build_wcs_upper_buffer (pstr);
 100              if (BE (ret != REG_NOERROR, 0))
 101                return ret;
 102              if (pstr->valid_raw_len >= len)
 103                break;
 104              if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 105                break;
 106              ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 107              if (BE (ret != REG_NOERROR, 0))
 108                return ret;
 109            }
 110        }
 111      else
 112#endif /* RE_ENABLE_I18N  */
 113        build_upper_buffer (pstr);
 114    }
 115  else
 116    {
 117#ifdef RE_ENABLE_I18N
 118      if (dfa->mb_cur_max > 1)
 119        build_wcs_buffer (pstr);
 120      else
 121#endif /* RE_ENABLE_I18N  */
 122        {
 123          if (trans != NULL)
 124            re_string_translate_buffer (pstr);
 125          else
 126            {
 127              pstr->valid_len = pstr->bufs_len;
 128              pstr->valid_raw_len = pstr->bufs_len;
 129            }
 130        }
 131    }
 132
 133  return REG_NOERROR;
 134}
 135
 136/* Helper functions for re_string_allocate, and re_string_construct.  */
 137
 138static reg_errcode_t
 139internal_function
 140re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 141{
 142#ifdef RE_ENABLE_I18N
 143  if (pstr->mb_cur_max > 1)
 144    {
 145      wint_t *new_wcs;
 146
 147      /* Avoid overflow in realloc.  */
 148      const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
 149      if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 150        return REG_ESPACE;
 151
 152      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 153      if (BE (new_wcs == NULL, 0))
 154        return REG_ESPACE;
 155      pstr->wcs = new_wcs;
 156      if (pstr->offsets != NULL)
 157        {
 158          int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 159          if (BE (new_offsets == NULL, 0))
 160            return REG_ESPACE;
 161          pstr->offsets = new_offsets;
 162        }
 163    }
 164#endif /* RE_ENABLE_I18N  */
 165  if (pstr->mbs_allocated)
 166    {
 167      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 168                                           new_buf_len);
 169      if (BE (new_mbs == NULL, 0))
 170        return REG_ESPACE;
 171      pstr->mbs = new_mbs;
 172    }
 173  pstr->bufs_len = new_buf_len;
 174  return REG_NOERROR;
 175}
 176
 177
 178static void
 179internal_function
 180re_string_construct_common (const char *str, int len, re_string_t *pstr,
 181                            RE_TRANSLATE_TYPE trans, int icase,
 182                            const re_dfa_t *dfa)
 183{
 184  pstr->raw_mbs = (const unsigned char *) str;
 185  pstr->len = len;
 186  pstr->raw_len = len;
 187  pstr->trans = trans;
 188  pstr->icase = icase ? 1 : 0;
 189  pstr->mbs_allocated = (trans != NULL || icase);
 190  pstr->mb_cur_max = dfa->mb_cur_max;
 191  pstr->is_utf8 = dfa->is_utf8;
 192  pstr->map_notascii = dfa->map_notascii;
 193  pstr->stop = pstr->len;
 194  pstr->raw_stop = pstr->stop;
 195}
 196
 197#ifdef RE_ENABLE_I18N
 198
 199/* Build wide character buffer PSTR->WCS.
 200   If the byte sequence of the string are:
 201     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 202   Then wide character buffer will be:
 203     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 204   We use WEOF for padding, they indicate that the position isn't
 205   a first byte of a multibyte character.
 206
 207   Note that this function assumes PSTR->VALID_LEN elements are already
 208   built and starts from PSTR->VALID_LEN.  */
 209
 210static void
 211internal_function
 212build_wcs_buffer (re_string_t *pstr)
 213{
 214#ifdef _LIBC
 215  unsigned char buf[MB_LEN_MAX];
 216  assert (MB_LEN_MAX >= pstr->mb_cur_max);
 217#else
 218  unsigned char buf[64];
 219#endif
 220  mbstate_t prev_st;
 221  int byte_idx, end_idx, remain_len;
 222  size_t mbclen;
 223
 224  /* Build the buffers from pstr->valid_len to either pstr->len or
 225     pstr->bufs_len.  */
 226  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 227  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 228    {
 229      wchar_t wc;
 230      const char *p;
 231
 232      remain_len = end_idx - byte_idx;
 233      prev_st = pstr->cur_state;
 234      /* Apply the translation if we need.  */
 235      if (BE (pstr->trans != NULL, 0))
 236        {
 237          int i, ch;
 238
 239          for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 240            {
 241              ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 242              buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 243            }
 244          p = (const char *) buf;
 245        }
 246      else
 247        p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 248      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 249      if (BE (mbclen == (size_t) -2, 0))
 250        {
 251          /* The buffer doesn't have enough space, finish to build.  */
 252          pstr->cur_state = prev_st;
 253          break;
 254        }
 255      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 256        {
 257          /* We treat these cases as a singlebyte character.  */
 258          mbclen = 1;
 259          wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 260          if (BE (pstr->trans != NULL, 0))
 261            wc = pstr->trans[wc];
 262          pstr->cur_state = prev_st;
 263        }
 264
 265      /* Write wide character and padding.  */
 266      pstr->wcs[byte_idx++] = wc;
 267      /* Write paddings.  */
 268      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 269        pstr->wcs[byte_idx++] = WEOF;
 270    }
 271  pstr->valid_len = byte_idx;
 272  pstr->valid_raw_len = byte_idx;
 273}
 274
 275/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 276   but for REG_ICASE.  */
 277
 278static reg_errcode_t
 279internal_function
 280build_wcs_upper_buffer (re_string_t *pstr)
 281{
 282  mbstate_t prev_st;
 283  int src_idx, byte_idx, end_idx, remain_len;
 284  size_t mbclen;
 285#ifdef _LIBC
 286  char buf[MB_LEN_MAX];
 287  assert (MB_LEN_MAX >= pstr->mb_cur_max);
 288#else
 289  char buf[64];
 290#endif
 291
 292  byte_idx = pstr->valid_len;
 293  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 294
 295  /* The following optimization assumes that ASCII characters can be
 296     mapped to wide characters with a simple cast.  */
 297  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 298    {
 299      while (byte_idx < end_idx)
 300        {
 301          wchar_t wc;
 302
 303          if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 304              && mbsinit (&pstr->cur_state))
 305            {
 306              /* In case of a singlebyte character.  */
 307              pstr->mbs[byte_idx]
 308                = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 309              /* The next step uses the assumption that wchar_t is encoded
 310                 ASCII-safe: all ASCII values can be converted like this.  */
 311              pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 312              ++byte_idx;
 313              continue;
 314            }
 315
 316          remain_len = end_idx - byte_idx;
 317          prev_st = pstr->cur_state;
 318          mbclen = __mbrtowc (&wc,
 319                              ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 320                               + byte_idx), remain_len, &pstr->cur_state);
 321          if (BE (mbclen + 2 > 2, 1))
 322            {
 323              wchar_t wcu = wc;
 324              if (iswlower (wc))
 325                {
 326                  size_t mbcdlen;
 327
 328                  wcu = towupper (wc);
 329                  mbcdlen = wcrtomb (buf, wcu, &prev_st);
 330                  if (BE (mbclen == mbcdlen, 1))
 331                    memcpy (pstr->mbs + byte_idx, buf, mbclen);
 332                  else
 333                    {
 334                      src_idx = byte_idx;
 335                      goto offsets_needed;
 336                    }
 337                }
 338              else
 339                memcpy (pstr->mbs + byte_idx,
 340                        pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 341              pstr->wcs[byte_idx++] = wcu;
 342              /* Write paddings.  */
 343              for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 344                pstr->wcs[byte_idx++] = WEOF;
 345            }
 346          else if (mbclen == (size_t) -1 || mbclen == 0)
 347            {
 348              /* It is an invalid character or '\0'.  Just use the byte.  */
 349              int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 350              pstr->mbs[byte_idx] = ch;
 351              /* And also cast it to wide char.  */
 352              pstr->wcs[byte_idx++] = (wchar_t) ch;
 353              if (BE (mbclen == (size_t) -1, 0))
 354                pstr->cur_state = prev_st;
 355            }
 356          else
 357            {
 358              /* The buffer doesn't have enough space, finish to build.  */
 359              pstr->cur_state = prev_st;
 360              break;
 361            }
 362        }
 363      pstr->valid_len = byte_idx;
 364      pstr->valid_raw_len = byte_idx;
 365      return REG_NOERROR;
 366    }
 367  else
 368    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 369      {
 370        wchar_t wc;
 371        const char *p;
 372      offsets_needed:
 373        remain_len = end_idx - byte_idx;
 374        prev_st = pstr->cur_state;
 375        if (BE (pstr->trans != NULL, 0))
 376          {
 377            int i, ch;
 378
 379            for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 380              {
 381                ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 382                buf[i] = pstr->trans[ch];
 383              }
 384            p = (const char *) buf;
 385          }
 386        else
 387          p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 388        mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 389        if (BE (mbclen + 2 > 2, 1))
 390          {
 391            wchar_t wcu = wc;
 392            if (iswlower (wc))
 393              {
 394                size_t mbcdlen;
 395
 396                wcu = towupper (wc);
 397                mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 398                if (BE (mbclen == mbcdlen, 1))
 399                  memcpy (pstr->mbs + byte_idx, buf, mbclen);
 400                else if (mbcdlen != (size_t) -1)
 401                  {
 402                    size_t i;
 403
 404                    if (byte_idx + mbcdlen > pstr->bufs_len)
 405                      {
 406                        pstr->cur_state = prev_st;
 407                        break;
 408                      }
 409
 410                    if (pstr->offsets == NULL)
 411                      {
 412                        pstr->offsets = re_malloc (int, pstr->bufs_len);
 413
 414                        if (pstr->offsets == NULL)
 415                          return REG_ESPACE;
 416                      }
 417                    if (!pstr->offsets_needed)
 418                      {
 419                        for (i = 0; i < (size_t) byte_idx; ++i)
 420                          pstr->offsets[i] = i;
 421                        pstr->offsets_needed = 1;
 422                      }
 423
 424                    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 425                    pstr->wcs[byte_idx] = wcu;
 426                    pstr->offsets[byte_idx] = src_idx;
 427                    for (i = 1; i < mbcdlen; ++i)
 428                      {
 429                        pstr->offsets[byte_idx + i]
 430                          = src_idx + (i < mbclen ? i : mbclen - 1);
 431                        pstr->wcs[byte_idx + i] = WEOF;
 432                      }
 433                    pstr->len += mbcdlen - mbclen;
 434                    if (pstr->raw_stop > src_idx)
 435                      pstr->stop += mbcdlen - mbclen;
 436                    end_idx = (pstr->bufs_len > pstr->len)
 437                              ? pstr->len : pstr->bufs_len;
 438                    byte_idx += mbcdlen;
 439                    src_idx += mbclen;
 440                    continue;
 441                  }
 442                else
 443                  memcpy (pstr->mbs + byte_idx, p, mbclen);
 444              }
 445            else
 446              memcpy (pstr->mbs + byte_idx, p, mbclen);
 447
 448            if (BE (pstr->offsets_needed != 0, 0))
 449              {
 450                size_t i;
 451                for (i = 0; i < mbclen; ++i)
 452                  pstr->offsets[byte_idx + i] = src_idx + i;
 453              }
 454            src_idx += mbclen;
 455
 456            pstr->wcs[byte_idx++] = wcu;
 457            /* Write paddings.  */
 458            for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 459              pstr->wcs[byte_idx++] = WEOF;
 460          }
 461        else if (mbclen == (size_t) -1 || mbclen == 0)
 462          {
 463            /* It is an invalid character or '\0'.  Just use the byte.  */
 464            int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 465
 466            if (BE (pstr->trans != NULL, 0))
 467              ch = pstr->trans [ch];
 468            pstr->mbs[byte_idx] = ch;
 469
 470            if (BE (pstr->offsets_needed != 0, 0))
 471              pstr->offsets[byte_idx] = src_idx;
 472            ++src_idx;
 473
 474            /* And also cast it to wide char.  */
 475            pstr->wcs[byte_idx++] = (wchar_t) ch;
 476            if (BE (mbclen == (size_t) -1, 0))
 477              pstr->cur_state = prev_st;
 478          }
 479        else
 480          {
 481            /* The buffer doesn't have enough space, finish to build.  */
 482            pstr->cur_state = prev_st;
 483            break;
 484          }
 485      }
 486  pstr->valid_len = byte_idx;
 487  pstr->valid_raw_len = src_idx;
 488  return REG_NOERROR;
 489}
 490
 491/* Skip characters until the index becomes greater than NEW_RAW_IDX.
 492   Return the index.  */
 493
 494static int
 495internal_function
 496re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 497{
 498  mbstate_t prev_st;
 499  int rawbuf_idx;
 500  size_t mbclen;
 501  wint_t wc = WEOF;
 502
 503  /* Skip the characters which are not necessary to check.  */
 504  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 505       rawbuf_idx < new_raw_idx;)
 506    {
 507      wchar_t wc2;
 508      int remain_len = pstr->len - rawbuf_idx;
 509      prev_st = pstr->cur_state;
 510      mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 511                          remain_len, &pstr->cur_state);
 512      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 513        {
 514          /* We treat these cases as a single byte character.  */
 515          if (mbclen == 0 || remain_len == 0)
 516            wc = L'\0';
 517          else
 518            wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 519          mbclen = 1;
 520          pstr->cur_state = prev_st;
 521        }
 522      else
 523        wc = (wint_t) wc2;
 524      /* Then proceed the next character.  */
 525      rawbuf_idx += mbclen;
 526    }
 527  *last_wc = (wint_t) wc;
 528  return rawbuf_idx;
 529}
 530#endif /* RE_ENABLE_I18N  */
 531
 532/* Build the buffer PSTR->MBS, and apply the translation if we need.
 533   This function is used in case of REG_ICASE.  */
 534
 535static void
 536internal_function
 537build_upper_buffer (re_string_t *pstr)
 538{
 539  int char_idx, end_idx;
 540  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 541
 542  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 543    {
 544      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 545      if (BE (pstr->trans != NULL, 0))
 546        ch = pstr->trans[ch];
 547      if (islower (ch))
 548        pstr->mbs[char_idx] = toupper (ch);
 549      else
 550        pstr->mbs[char_idx] = ch;
 551    }
 552  pstr->valid_len = char_idx;
 553  pstr->valid_raw_len = char_idx;
 554}
 555
 556/* Apply TRANS to the buffer in PSTR.  */
 557
 558static void
 559internal_function
 560re_string_translate_buffer (re_string_t *pstr)
 561{
 562  int buf_idx, end_idx;
 563  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 564
 565  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 566    {
 567      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 568      pstr->mbs[buf_idx] = pstr->trans[ch];
 569    }
 570
 571  pstr->valid_len = buf_idx;
 572  pstr->valid_raw_len = buf_idx;
 573}
 574
 575/* This function re-construct the buffers.
 576   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 577   convert to upper case in case of REG_ICASE, apply translation.  */
 578
 579static reg_errcode_t
 580internal_function
 581re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 582{
 583  int offset = idx - pstr->raw_mbs_idx;
 584  if (BE (offset < 0, 0))
 585    {
 586      /* Reset buffer.  */
 587#ifdef RE_ENABLE_I18N
 588      if (pstr->mb_cur_max > 1)
 589        memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 590#endif /* RE_ENABLE_I18N */
 591      pstr->len = pstr->raw_len;
 592      pstr->stop = pstr->raw_stop;
 593      pstr->valid_len = 0;
 594      pstr->raw_mbs_idx = 0;
 595      pstr->valid_raw_len = 0;
 596      pstr->offsets_needed = 0;
 597      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 598                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 599      if (!pstr->mbs_allocated)
 600        pstr->mbs = (unsigned char *) pstr->raw_mbs;
 601      offset = idx;
 602    }
 603
 604  if (BE (offset != 0, 1))
 605    {
 606      /* Should the already checked characters be kept?  */
 607      if (BE (offset < pstr->valid_raw_len, 1))
 608        {
 609          /* Yes, move them to the front of the buffer.  */
 610#ifdef RE_ENABLE_I18N
 611          if (BE (pstr->offsets_needed, 0))
 612            {
 613              int low = 0, high = pstr->valid_len, mid;
 614              do
 615                {
 616                  mid = (high + low) / 2;
 617                  if (pstr->offsets[mid] > offset)
 618                    high = mid;
 619                  else if (pstr->offsets[mid] < offset)
 620                    low = mid + 1;
 621                  else
 622                    break;
 623                }
 624              while (low < high);
 625              if (pstr->offsets[mid] < offset)
 626                ++mid;
 627              pstr->tip_context = re_string_context_at (pstr, mid - 1,
 628                                                        eflags);
 629              /* This can be quite complicated, so handle specially
 630                 only the common and easy case where the character with
 631                 different length representation of lower and upper
 632                 case is present at or after offset.  */
 633              if (pstr->valid_len > offset
 634                  && mid == offset && pstr->offsets[mid] == offset)
 635                {
 636                  memmove (pstr->wcs, pstr->wcs + offset,
 637                           (pstr->valid_len - offset) * sizeof (wint_t));
 638                  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 639                  pstr->valid_len -= offset;
 640                  pstr->valid_raw_len -= offset;
 641                  for (low = 0; low < pstr->valid_len; low++)
 642                    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 643                }
 644              else
 645                {
 646                  /* Otherwise, just find out how long the partial multibyte
 647                     character at offset is and fill it with WEOF/255.  */
 648                  pstr->len = pstr->raw_len - idx + offset;
 649                  pstr->stop = pstr->raw_stop - idx + offset;
 650                  pstr->offsets_needed = 0;
 651                  while (mid > 0 && pstr->offsets[mid - 1] == offset)
 652                    --mid;
 653                  while (mid < pstr->valid_len)
 654                    if (pstr->wcs[mid] != WEOF)
 655                      break;
 656                    else
 657                      ++mid;
 658                  if (mid == pstr->valid_len)
 659                    pstr->valid_len = 0;
 660                  else
 661                    {
 662                      pstr->valid_len = pstr->offsets[mid] - offset;
 663                      if (pstr->valid_len)
 664                        {
 665                          for (low = 0; low < pstr->valid_len; ++low)
 666                            pstr->wcs[low] = WEOF;
 667                          memset (pstr->mbs, 255, pstr->valid_len);
 668                        }
 669                    }
 670                  pstr->valid_raw_len = pstr->valid_len;
 671                }
 672            }
 673          else
 674#endif
 675            {
 676              pstr->tip_context = re_string_context_at (pstr, offset - 1,
 677                                                        eflags);
 678#ifdef RE_ENABLE_I18N
 679              if (pstr->mb_cur_max > 1)
 680                memmove (pstr->wcs, pstr->wcs + offset,
 681                         (pstr->valid_len - offset) * sizeof (wint_t));
 682#endif /* RE_ENABLE_I18N */
 683              if (BE (pstr->mbs_allocated, 0))
 684                memmove (pstr->mbs, pstr->mbs + offset,
 685                         pstr->valid_len - offset);
 686              pstr->valid_len -= offset;
 687              pstr->valid_raw_len -= offset;
 688#if DEBUG
 689              assert (pstr->valid_len > 0);
 690#endif
 691            }
 692        }
 693      else
 694        {
 695#ifdef RE_ENABLE_I18N
 696          /* No, skip all characters until IDX.  */
 697          int prev_valid_len = pstr->valid_len;
 698
 699          if (BE (pstr->offsets_needed, 0))
 700            {
 701              pstr->len = pstr->raw_len - idx + offset;
 702              pstr->stop = pstr->raw_stop - idx + offset;
 703              pstr->offsets_needed = 0;
 704            }
 705#endif
 706          pstr->valid_len = 0;
 707#ifdef RE_ENABLE_I18N
 708          if (pstr->mb_cur_max > 1)
 709            {
 710              int wcs_idx;
 711              wint_t wc = WEOF;
 712
 713              if (pstr->is_utf8)
 714                {
 715                  const unsigned char *raw, *p, *end;
 716
 717                  /* Special case UTF-8.  Multi-byte chars start with any
 718                     byte other than 0x80 - 0xbf.  */
 719                  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 720                  end = raw + (offset - pstr->mb_cur_max);
 721                  if (end < pstr->raw_mbs)
 722                    end = pstr->raw_mbs;
 723                  p = raw + offset - 1;
 724#ifdef _LIBC
 725                  /* We know the wchar_t encoding is UCS4, so for the simple
 726                     case, ASCII characters, skip the conversion step.  */
 727                  if (isascii (*p) && BE (pstr->trans == NULL, 1))
 728                    {
 729                      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 730                      /* pstr->valid_len = 0; */
 731                      wc = (wchar_t) *p;
 732                    }
 733                  else
 734#endif
 735                    for (; p >= end; --p)
 736                      if ((*p & 0xc0) != 0x80)
 737                        {
 738                          mbstate_t cur_state;
 739                          wchar_t wc2;
 740                          int mlen = raw + pstr->len - p;
 741                          unsigned char buf[6];
 742                          size_t mbclen;
 743
 744                          if (BE (pstr->trans != NULL, 0))
 745                            {
 746                              int i = mlen < 6 ? mlen : 6;
 747                              while (--i >= 0)
 748                                buf[i] = pstr->trans[p[i]];
 749                            }
 750                          /* XXX Don't use mbrtowc, we know which conversion
 751                             to use (UTF-8 -> UCS4).  */
 752                          memset (&cur_state, 0, sizeof (cur_state));
 753                          mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 754                                              &cur_state);
 755                          if (raw + offset - p <= mbclen
 756                              && mbclen < (size_t) -2)
 757                            {
 758                              memset (&pstr->cur_state, '\0',
 759                                      sizeof (mbstate_t));
 760                              pstr->valid_len = mbclen - (raw + offset - p);
 761                              wc = wc2;
 762                            }
 763                          break;
 764                        }
 765                }
 766
 767              if (wc == WEOF)
 768                pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 769              if (wc == WEOF)
 770                pstr->tip_context
 771                  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 772              else
 773                pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 774                                      && IS_WIDE_WORD_CHAR (wc))
 775                                     ? CONTEXT_WORD
 776                                     : ((IS_WIDE_NEWLINE (wc)
 777                                         && pstr->newline_anchor)
 778                                        ? CONTEXT_NEWLINE : 0));
 779              if (BE (pstr->valid_len, 0))
 780                {
 781                  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 782                    pstr->wcs[wcs_idx] = WEOF;
 783                  if (pstr->mbs_allocated)
 784                    memset (pstr->mbs, 255, pstr->valid_len);
 785                }
 786              pstr->valid_raw_len = pstr->valid_len;
 787            }
 788          else
 789#endif /* RE_ENABLE_I18N */
 790            {
 791              int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 792              pstr->valid_raw_len = 0;
 793              if (pstr->trans)
 794                c = pstr->trans[c];
 795              pstr->tip_context = (bitset_contain (pstr->word_char, c)
 796                                   ? CONTEXT_WORD
 797                                   : ((IS_NEWLINE (c) && pstr->newline_anchor)
 798                                      ? CONTEXT_NEWLINE : 0));
 799            }
 800        }
 801      if (!BE (pstr->mbs_allocated, 0))
 802        pstr->mbs += offset;
 803    }
 804  pstr->raw_mbs_idx = idx;
 805  pstr->len -= offset;
 806  pstr->stop -= offset;
 807
 808  /* Then build the buffers.  */
 809#ifdef RE_ENABLE_I18N
 810  if (pstr->mb_cur_max > 1)
 811    {
 812      if (pstr->icase)
 813        {
 814          reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 815          if (BE (ret != REG_NOERROR, 0))
 816            return ret;
 817        }
 818      else
 819        build_wcs_buffer (pstr);
 820    }
 821  else
 822#endif /* RE_ENABLE_I18N */
 823    if (BE (pstr->mbs_allocated, 0))
 824      {
 825        if (pstr->icase)
 826          build_upper_buffer (pstr);
 827        else if (pstr->trans != NULL)
 828          re_string_translate_buffer (pstr);
 829      }
 830    else
 831      pstr->valid_len = pstr->len;
 832
 833  pstr->cur_idx = 0;
 834  return REG_NOERROR;
 835}
 836
 837static unsigned char
 838internal_function __attribute ((pure))
 839re_string_peek_byte_case (const re_string_t *pstr, int idx)
 840{
 841  int ch, off;
 842
 843  /* Handle the common (easiest) cases first.  */
 844  if (BE (!pstr->mbs_allocated, 1))
 845    return re_string_peek_byte (pstr, idx);
 846
 847#ifdef RE_ENABLE_I18N
 848  if (pstr->mb_cur_max > 1
 849      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 850    return re_string_peek_byte (pstr, idx);
 851#endif
 852
 853  off = pstr->cur_idx + idx;
 854#ifdef RE_ENABLE_I18N
 855  if (pstr->offsets_needed)
 856    off = pstr->offsets[off];
 857#endif
 858
 859  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 860
 861#ifdef RE_ENABLE_I18N
 862  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 863     this function returns CAPITAL LETTER I instead of first byte of
 864     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 865     since peek_byte_case doesn't advance cur_idx in any way.  */
 866  if (pstr->offsets_needed && !isascii (ch))
 867    return re_string_peek_byte (pstr, idx);
 868#endif
 869
 870  return ch;
 871}
 872
 873static unsigned char
 874internal_function __attribute ((pure))
 875re_string_fetch_byte_case (re_string_t *pstr)
 876{
 877  if (BE (!pstr->mbs_allocated, 1))
 878    return re_string_fetch_byte (pstr);
 879
 880#ifdef RE_ENABLE_I18N
 881  if (pstr->offsets_needed)
 882    {
 883      int off, ch;
 884
 885      /* For tr_TR.UTF-8 [[:islower:]] there is
 886         [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 887         in that case the whole multi-byte character and return
 888         the original letter.  On the other side, with
 889         [[: DOTLESS SMALL LETTER I return [[:I, as doing
 890         anything else would complicate things too much.  */
 891
 892      if (!re_string_first_byte (pstr, pstr->cur_idx))
 893        return re_string_fetch_byte (pstr);
 894
 895      off = pstr->offsets[pstr->cur_idx];
 896      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 897
 898      if (! isascii (ch))
 899        return re_string_fetch_byte (pstr);
 900
 901      re_string_skip_bytes (pstr,
 902                            re_string_char_size_at (pstr, pstr->cur_idx));
 903      return ch;
 904    }
 905#endif
 906
 907  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 908}
 909
 910static void
 911internal_function
 912re_string_destruct (re_string_t *pstr)
 913{
 914#ifdef RE_ENABLE_I18N
 915  re_free (pstr->wcs);
 916  re_free (pstr->offsets);
 917#endif /* RE_ENABLE_I18N  */
 918  if (pstr->mbs_allocated)
 919    re_free (pstr->mbs);
 920}
 921
 922/* Return the context at IDX in INPUT.  */
 923
 924static unsigned int
 925internal_function
 926re_string_context_at (const re_string_t *input, int idx, int eflags)
 927{
 928  int c;
 929  if (BE (idx < 0, 0))
 930    /* In this case, we use the value stored in input->tip_context,
 931       since we can't know the character in input->mbs[-1] here.  */
 932    return input->tip_context;
 933  if (BE (idx == input->len, 0))
 934    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 935            : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 936#ifdef RE_ENABLE_I18N
 937  if (input->mb_cur_max > 1)
 938    {
 939      wint_t wc;
 940      int wc_idx = idx;
 941      while(input->wcs[wc_idx] == WEOF)
 942        {
 943#ifdef DEBUG
 944          /* It must not happen.  */
 945          assert (wc_idx >= 0);
 946#endif
 947          --wc_idx;
 948          if (wc_idx < 0)
 949            return input->tip_context;
 950        }
 951      wc = input->wcs[wc_idx];
 952      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 953        return CONTEXT_WORD;
 954      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 955              ? CONTEXT_NEWLINE : 0);
 956    }
 957  else
 958#endif
 959    {
 960      c = re_string_byte_at (input, idx);
 961      if (bitset_contain (input->word_char, c))
 962        return CONTEXT_WORD;
 963      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 964    }
 965}
 966\f
 967/* Functions for set operation.  */
 968
 969static reg_errcode_t
 970internal_function
 971re_node_set_alloc (re_node_set *set, int size)
 972{
 973  /*
 974   * ADR: valgrind says size can be 0, which then doesn't
 975   * free the block of size 0.  Harumph. This seems
 976   * to work ok, though.
 977   */
 978  if (size == 0)
 979    {
 980       memset(set, 0, sizeof(*set));
 981       return REG_NOERROR;
 982    }
 983  set->alloc = size;
 984  set->nelem = 0;
 985  set->elems = re_malloc (int, size);
 986  if (BE (set->elems == NULL, 0))
 987    return REG_ESPACE;
 988  return REG_NOERROR;
 989}
 990
 991static reg_errcode_t
 992internal_function
 993re_node_set_init_1 (re_node_set *set, int elem)
 994{
 995  set->alloc = 1;
 996  set->nelem = 1;
 997  set->elems = re_malloc (int, 1);
 998  if (BE (set->elems == NULL, 0))
 999    {
1000      set->alloc = set->nelem = 0;
1001      return REG_ESPACE;
1002    }
1003  set->elems[0] = elem;
1004  return REG_NOERROR;
1005}
1006
1007static reg_errcode_t
1008internal_function
1009re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1010{
1011  set->alloc = 2;
1012  set->elems = re_malloc (int, 2);
1013  if (BE (set->elems == NULL, 0))
1014    return REG_ESPACE;
1015  if (elem1 == elem2)
1016    {
1017      set->nelem = 1;
1018      set->elems[0] = elem1;
1019    }
1020  else
1021    {
1022      set->nelem = 2;
1023      if (elem1 < elem2)
1024        {
1025          set->elems[0] = elem1;
1026          set->elems[1] = elem2;
1027        }
1028      else
1029        {
1030          set->elems[0] = elem2;
1031          set->elems[1] = elem1;
1032        }
1033    }
1034  return REG_NOERROR;
1035}
1036
1037static reg_errcode_t
1038internal_function
1039re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1040{
1041  dest->nelem = src->nelem;
1042  if (src->nelem > 0)
1043    {
1044      dest->alloc = dest->nelem;
1045      dest->elems = re_malloc (int, dest->alloc);
1046      if (BE (dest->elems == NULL, 0))
1047        {
1048          dest->alloc = dest->nelem = 0;
1049          return REG_ESPACE;
1050        }
1051      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1052    }
1053  else
1054    re_node_set_init_empty (dest);
1055  return REG_NOERROR;
1056}
1057
1058/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1059   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1060   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1061
1062static reg_errcode_t
1063internal_function
1064re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1065                           const re_node_set *src2)
1066{
1067  int i1, i2, is, id, delta, sbase;
1068  if (src1->nelem == 0 || src2->nelem == 0)
1069    return REG_NOERROR;
1070
1071  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1072     conservative estimate.  */
1073  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1074    {
1075      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1076      int *new_elems = re_realloc (dest->elems, int, new_alloc);
1077      if (BE (new_elems == NULL, 0))
1078        return REG_ESPACE;
1079      dest->elems = new_elems;
1080      dest->alloc = new_alloc;
1081    }
1082
1083  /* Find the items in the intersection of SRC1 and SRC2, and copy
1084     into the top of DEST those that are not already in DEST itself.  */
1085  sbase = dest->nelem + src1->nelem + src2->nelem;
1086  i1 = src1->nelem - 1;
1087  i2 = src2->nelem - 1;
1088  id = dest->nelem - 1;
1089  for (;;)
1090    {
1091      if (src1->elems[i1] == src2->elems[i2])
1092        {
1093          /* Try to find the item in DEST.  Maybe we could binary search?  */
1094          while (id >= 0 && dest->elems[id] > src1->elems[i1])
1095            --id;
1096
1097          if (id < 0 || dest->elems[id] != src1->elems[i1])
1098            dest->elems[--sbase] = src1->elems[i1];
1099
1100          if (--i1 < 0 || --i2 < 0)
1101            break;
1102        }
1103
1104      /* Lower the highest of the two items.  */
1105      else if (src1->elems[i1] < src2->elems[i2])
1106        {
1107          if (--i2 < 0)
1108            break;
1109        }
1110      else
1111        {
1112          if (--i1 < 0)
1113            break;
1114        }
1115    }
1116
1117  id = dest->nelem - 1;
1118  is = dest->nelem + src1->nelem + src2->nelem - 1;
1119  delta = is - sbase + 1;
1120
1121  /* Now copy.  When DELTA becomes zero, the remaining
1122     DEST elements are already in place; this is more or
1123     less the same loop that is in re_node_set_merge.  */
1124  dest->nelem += delta;
1125  if (delta > 0 && id >= 0)
1126    for (;;)
1127      {
1128        if (dest->elems[is] > dest->elems[id])
1129          {
1130            /* Copy from the top.  */
1131            dest->elems[id + delta--] = dest->elems[is--];
1132            if (delta == 0)
1133              break;
1134          }
1135        else
1136          {
1137            /* Slide from the bottom.  */
1138            dest->elems[id + delta] = dest->elems[id];
1139            if (--id < 0)
1140              break;
1141          }
1142      }
1143
1144  /* Copy remaining SRC elements.  */
1145  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1146
1147  return REG_NOERROR;
1148}
1149
1150/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1151   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1152
1153static reg_errcode_t
1154internal_function
1155re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1156                        const re_node_set *src2)
1157{
1158  int i1, i2, id;
1159  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1160    {
1161      dest->alloc = src1->nelem + src2->nelem;
1162      dest->elems = re_malloc (int, dest->alloc);
1163      if (BE (dest->elems == NULL, 0))
1164        return REG_ESPACE;
1165    }
1166  else
1167    {
1168      if (src1 != NULL && src1->nelem > 0)
1169        return re_node_set_init_copy (dest, src1);
1170      else if (src2 != NULL && src2->nelem > 0)
1171        return re_node_set_init_copy (dest, src2);
1172      else
1173        re_node_set_init_empty (dest);
1174      return REG_NOERROR;
1175    }
1176  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1177    {
1178      if (src1->elems[i1] > src2->elems[i2])
1179        {
1180          dest->elems[id++] = src2->elems[i2++];
1181          continue;
1182        }
1183      if (src1->elems[i1] == src2->elems[i2])
1184        ++i2;
1185      dest->elems[id++] = src1->elems[i1++];
1186    }
1187  if (i1 < src1->nelem)
1188    {
1189      memcpy (dest->elems + id, src1->elems + i1,
1190             (src1->nelem - i1) * sizeof (int));
1191      id += src1->nelem - i1;
1192    }
1193  else if (i2 < src2->nelem)
1194    {
1195      memcpy (dest->elems + id, src2->elems + i2,
1196             (src2->nelem - i2) * sizeof (int));
1197      id += src2->nelem - i2;
1198    }
1199  dest->nelem = id;
1200  return REG_NOERROR;
1201}
1202
1203/* Calculate the union set of the sets DEST and SRC. And store it to
1204   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1205
1206static reg_errcode_t
1207internal_function
1208re_node_set_merge (re_node_set *dest, const re_node_set *src)
1209{
1210  int is, id, sbase, delta;
1211  if (src == NULL || src->nelem == 0)
1212    return REG_NOERROR;
1213  if (dest->alloc < 2 * src->nelem + dest->nelem)
1214    {
1215      int new_alloc = 2 * (src->nelem + dest->alloc);
1216      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1217      if (BE (new_buffer == NULL, 0))
1218        return REG_ESPACE;
1219      dest->elems = new_buffer;
1220      dest->alloc = new_alloc;
1221    }
1222
1223  if (BE (dest->nelem == 0, 0))
1224    {
1225      dest->nelem = src->nelem;
1226      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1227      return REG_NOERROR;
1228    }
1229
1230  /* Copy into the top of DEST the items of SRC that are not
1231     found in DEST.  Maybe we could binary search in DEST?  */
1232  for (sbase = dest->nelem + 2 * src->nelem,
1233       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1234    {
1235      if (dest->elems[id] == src->elems[is])
1236        is--, id--;
1237      else if (dest->elems[id] < src->elems[is])
1238        dest->elems[--sbase] = src->elems[is--];
1239      else /* if (dest->elems[id] > src->elems[is]) */
1240        --id;
1241    }
1242
1243  if (is >= 0)
1244    {
1245      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1246      sbase -= is + 1;
1247      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1248    }
1249
1250  id = dest->nelem - 1;
1251  is = dest->nelem + 2 * src->nelem - 1;
1252  delta = is - sbase + 1;
1253  if (delta == 0)
1254    return REG_NOERROR;
1255
1256  /* Now copy.  When DELTA becomes zero, the remaining
1257     DEST elements are already in place.  */
1258  dest->nelem += delta;
1259  for (;;)
1260    {
1261      if (dest->elems[is] > dest->elems[id])
1262        {
1263          /* Copy from the top.  */
1264          dest->elems[id + delta--] = dest->elems[is--];
1265          if (delta == 0)
1266            break;
1267        }
1268      else
1269        {
1270          /* Slide from the bottom.  */
1271          dest->elems[id + delta] = dest->elems[id];
1272          if (--id < 0)
1273            {
1274              /* Copy remaining SRC elements.  */
1275              memcpy (dest->elems, dest->elems + sbase,
1276                      delta * sizeof (int));
1277              break;
1278            }
1279        }
1280    }
1281
1282  return REG_NOERROR;
1283}
1284
1285/* Insert the new element ELEM to the re_node_set* SET.
1286   SET should not already have ELEM.
1287   return -1 if an error has occurred, return 1 otherwise.  */
1288
1289static int
1290internal_function
1291re_node_set_insert (re_node_set *set, int elem)
1292{
1293  int idx;
1294  /* In case the set is empty.  */
1295  if (set->alloc == 0)
1296    {
1297      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1298        return 1;
1299      else
1300        return -1;
1301    }
1302
1303  if (BE (set->nelem, 0) == 0)
1304    {
1305      /* We already guaranteed above that set->alloc != 0.  */
1306      set->elems[0] = elem;
1307      ++set->nelem;
1308      return 1;
1309    }
1310
1311  /* Realloc if we need.  */
1312  if (set->alloc == set->nelem)
1313    {
1314      int *new_elems;
1315      set->alloc = set->alloc * 2;
1316      new_elems = re_realloc (set->elems, int, set->alloc);
1317      if (BE (new_elems == NULL, 0))
1318        return -1;
1319      set->elems = new_elems;
1320    }
1321
1322  /* Move the elements which follows the new element.  Test the
1323     first element separately to skip a check in the inner loop.  */
1324  if (elem < set->elems[0])
1325    {
1326      idx = 0;
1327      for (idx = set->nelem; idx > 0; idx--)
1328        set->elems[idx] = set->elems[idx - 1];
1329    }
1330  else
1331    {
1332      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1333        set->elems[idx] = set->elems[idx - 1];
1334    }
1335
1336  /* Insert the new element.  */
1337  set->elems[idx] = elem;
1338  ++set->nelem;
1339  return 1;
1340}
1341
1342/* Insert the new element ELEM to the re_node_set* SET.
1343   SET should not already have any element greater than or equal to ELEM.
1344   Return -1 if an error has occurred, return 1 otherwise.  */
1345
1346static int
1347internal_function
1348re_node_set_insert_last (re_node_set *set, int elem)
1349{
1350  /* Realloc if we need.  */
1351  if (set->alloc == set->nelem)
1352    {
1353      int *new_elems;
1354      set->alloc = (set->alloc + 1) * 2;
1355      new_elems = re_realloc (set->elems, int, set->alloc);
1356      if (BE (new_elems == NULL, 0))
1357        return -1;
1358      set->elems = new_elems;
1359    }
1360
1361  /* Insert the new element.  */
1362  set->elems[set->nelem++] = elem;
1363  return 1;
1364}
1365
1366/* Compare two node sets SET1 and SET2.
1367   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1368
1369static int
1370internal_function __attribute ((pure))
1371re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1372{
1373  int i;
1374  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1375    return 0;
1376  for (i = set1->nelem ; --i >= 0 ; )
1377    if (set1->elems[i] != set2->elems[i])
1378      return 0;
1379  return 1;
1380}
1381
1382/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1383
1384static int
1385internal_function __attribute ((pure))
1386re_node_set_contains (const re_node_set *set, int elem)
1387{
1388  unsigned int idx, right, mid;
1389  if (set->nelem <= 0)
1390    return 0;
1391
1392  /* Binary search the element.  */
1393  idx = 0;
1394  right = set->nelem - 1;
1395  while (idx < right)
1396    {
1397      mid = (idx + right) / 2;
1398      if (set->elems[mid] < elem)
1399        idx = mid + 1;
1400      else
1401        right = mid;
1402    }
1403  return set->elems[idx] == elem ? idx + 1 : 0;
1404}
1405
1406static void
1407internal_function
1408re_node_set_remove_at (re_node_set *set, int idx)
1409{
1410  if (idx < 0 || idx >= set->nelem)
1411    return;
1412  --set->nelem;
1413  for (; idx < set->nelem; idx++)
1414    set->elems[idx] = set->elems[idx + 1];
1415}
1416\f
1417
1418/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1419   Or return -1, if an error has occurred.  */
1420
1421static int
1422internal_function
1423re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1424{
1425  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1426    {
1427      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1428      int *new_nexts, *new_indices;
1429      re_node_set *new_edests, *new_eclosures;
1430      re_token_t *new_nodes;
1431
1432      /* Avoid overflows in realloc.  */
1433      const size_t max_object_size = MAX (sizeof (re_token_t),
1434                                          MAX (sizeof (re_node_set),
1435                                               sizeof (int)));
1436      if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1437        return -1;
1438
1439      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1440      if (BE (new_nodes == NULL, 0))
1441        return -1;
1442      dfa->nodes = new_nodes;
1443      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1444      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1445      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1446      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1447      if (BE (new_nexts == NULL || new_indices == NULL
1448              || new_edests == NULL || new_eclosures == NULL, 0))
1449        return -1;
1450      dfa->nexts = new_nexts;
1451      dfa->org_indices = new_indices;
1452      dfa->edests = new_edests;
1453      dfa->eclosures = new_eclosures;
1454      dfa->nodes_alloc = new_nodes_alloc;
1455    }
1456  dfa->nodes[dfa->nodes_len] = token;
1457  dfa->nodes[dfa->nodes_len].constraint = 0;
1458#ifdef RE_ENABLE_I18N
1459  dfa->nodes[dfa->nodes_len].accept_mb =
1460    (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1461#endif
1462  dfa->nexts[dfa->nodes_len] = -1;
1463  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1464  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1465  return dfa->nodes_len++;
1466}
1467
1468static inline unsigned int
1469internal_function
1470calc_state_hash (const re_node_set *nodes, unsigned int context)
1471{
1472  unsigned int hash = nodes->nelem + context;
1473  int i;
1474  for (i = 0 ; i < nodes->nelem ; i++)
1475    hash += nodes->elems[i];
1476  return hash;
1477}
1478
1479/* Search for the state whose node_set is equivalent to NODES.
1480   Return the pointer to the state, if we found it in the DFA.
1481   Otherwise create the new one and return it.  In case of an error
1482   return NULL and set the error code in ERR.
1483   Note: - We assume NULL as the invalid state, then it is possible that
1484           return value is NULL and ERR is REG_NOERROR.
1485         - We never return non-NULL value in case of any errors, it is for
1486           optimization.  */
1487
1488static re_dfastate_t *
1489internal_function
1490re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1491                  const re_node_set *nodes)
1492{
1493  unsigned int hash;
1494  re_dfastate_t *new_state;
1495  struct re_state_table_entry *spot;
1496  int i;
1497  if (BE (nodes->nelem == 0, 0))
1498    {
1499      *err = REG_NOERROR;
1500      return NULL;
1501    }
1502  hash = calc_state_hash (nodes, 0);
1503  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504
1505  for (i = 0 ; i < spot->num ; i++)
1506    {
1507      re_dfastate_t *state = spot->array[i];
1508      if (hash != state->hash)
1509        continue;
1510      if (re_node_set_compare (&state->nodes, nodes))
1511        return state;
1512    }
1513
1514  /* There are no appropriate state in the dfa, create the new one.  */
1515  new_state = create_ci_newstate (dfa, nodes, hash);
1516  if (BE (new_state == NULL, 0))
1517    *err = REG_ESPACE;
1518
1519  return new_state;
1520}
1521
1522/* Search for the state whose node_set is equivalent to NODES and
1523   whose context is equivalent to CONTEXT.
1524   Return the pointer to the state, if we found it in the DFA.
1525   Otherwise create the new one and return it.  In case of an error
1526   return NULL and set the error code in ERR.
1527   Note: - We assume NULL as the invalid state, then it is possible that
1528           return value is NULL and ERR is REG_NOERROR.
1529         - We never return non-NULL value in case of any errors, it is for
1530           optimization.  */
1531
1532static re_dfastate_t *
1533internal_function
1534re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535                          const re_node_set *nodes, unsigned int context)
1536{
1537  unsigned int hash;
1538  re_dfastate_t *new_state;
1539  struct re_state_table_entry *spot;
1540  int i;
1541  if (nodes->nelem == 0)
1542    {
1543      *err = REG_NOERROR;
1544      return NULL;
1545    }
1546  hash = calc_state_hash (nodes, context);
1547  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1548
1549  for (i = 0 ; i < spot->num ; i++)
1550    {
1551      re_dfastate_t *state = spot->array[i];
1552      if (state->hash == hash
1553          && state->context == context
1554          && re_node_set_compare (state->entrance_nodes, nodes))
1555        return state;
1556    }
1557  /* There are no appropriate state in `dfa', create the new one.  */
1558  new_state = create_cd_newstate (dfa, nodes, context, hash);
1559  if (BE (new_state == NULL, 0))
1560    *err = REG_ESPACE;
1561
1562  return new_state;
1563}
1564
1565/* Finish initialization of the new state NEWSTATE, and using its hash value
1566   HASH put in the appropriate bucket of DFA's state table.  Return value
1567   indicates the error code if failed.  */
1568
1569static reg_errcode_t
1570register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1571                unsigned int hash)
1572{
1573  struct re_state_table_entry *spot;
1574  reg_errcode_t err;
1575  int i;
1576
1577  newstate->hash = hash;
1578  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1579  if (BE (err != REG_NOERROR, 0))
1580    return REG_ESPACE;
1581  for (i = 0; i < newstate->nodes.nelem; i++)
1582    {
1583      int elem = newstate->nodes.elems[i];
1584      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1585        if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1586          return REG_ESPACE;
1587    }
1588
1589  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1590  if (BE (spot->alloc <= spot->num, 0))
1591    {
1592      int new_alloc = 2 * spot->num + 2;
1593      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1594                                              new_alloc);
1595      if (BE (new_array == NULL, 0))
1596        return REG_ESPACE;
1597      spot->array = new_array;
1598      spot->alloc = new_alloc;
1599    }
1600  spot->array[spot->num++] = newstate;
1601  return REG_NOERROR;
1602}
1603
1604static void
1605free_state (re_dfastate_t *state)
1606{
1607  re_node_set_free (&state->non_eps_nodes);
1608  re_node_set_free (&state->inveclosure);
1609  if (state->entrance_nodes != &state->nodes)
1610    {
1611      re_node_set_free (state->entrance_nodes);
1612      re_free (state->entrance_nodes);
1613    }
1614  re_node_set_free (&state->nodes);
1615  re_free (state->word_trtable);
1616  re_free (state->trtable);
1617  re_free (state);
1618}
1619
1620/* Create the new state which is independ of contexts.
1621   Return the new state if succeeded, otherwise return NULL.  */
1622
1623static re_dfastate_t *
1624internal_function
1625create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1626                    unsigned int hash)
1627{
1628  int i;
1629  reg_errcode_t err;
1630  re_dfastate_t *newstate;
1631
1632  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1633  if (BE (newstate == NULL, 0))
1634    return NULL;
1635  err = re_node_set_init_copy (&newstate->nodes, nodes);
1636  if (BE (err != REG_NOERROR, 0))
1637    {
1638      re_free (newstate);
1639      return NULL;
1640    }
1641
1642  newstate->entrance_nodes = &newstate->nodes;
1643  for (i = 0 ; i < nodes->nelem ; i++)
1644    {
1645      re_token_t *node = dfa->nodes + nodes->elems[i];
1646      re_token_type_t type = node->type;
1647      if (type == CHARACTER && !node->constraint)
1648        continue;
1649#ifdef RE_ENABLE_I18N
1650      newstate->accept_mb |= node->accept_mb;
1651#endif /* RE_ENABLE_I18N */
1652
1653      /* If the state has the halt node, the state is a halt state.  */
1654      if (type == END_OF_RE)
1655        newstate->halt = 1;
1656      else if (type == OP_BACK_REF)
1657        newstate->has_backref = 1;
1658      else if (type == ANCHOR || node->constraint)
1659        newstate->has_constraint = 1;
1660    }
1661  err = register_state (dfa, newstate, hash);
1662  if (BE (err != REG_NOERROR, 0))
1663    {
1664      free_state (newstate);
1665      newstate = NULL;
1666    }
1667  return newstate;
1668}
1669
1670/* Create the new state which is depend on the context CONTEXT.
1671   Return the new state if succeeded, otherwise return NULL.  */
1672
1673static re_dfastate_t *
1674internal_function
1675create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1676                    unsigned int context, unsigned int hash)
1677{
1678  int i, nctx_nodes = 0;
1679  reg_errcode_t err;
1680  re_dfastate_t *newstate;
1681
1682  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1683  if (BE (newstate == NULL, 0))
1684    return NULL;
1685  err = re_node_set_init_copy (&newstate->nodes, nodes);
1686  if (BE (err != REG_NOERROR, 0))
1687    {
1688      re_free (newstate);
1689      return NULL;
1690    }
1691
1692  newstate->context = context;
1693  newstate->entrance_nodes = &newstate->nodes;
1694
1695  for (i = 0 ; i < nodes->nelem ; i++)
1696    {
1697      re_token_t *node = dfa->nodes + nodes->elems[i];
1698      re_token_type_t type = node->type;
1699      unsigned int constraint = node->constraint;
1700
1701      if (type == CHARACTER && !constraint)
1702        continue;
1703#ifdef RE_ENABLE_I18N
1704      newstate->accept_mb |= node->accept_mb;
1705#endif /* RE_ENABLE_I18N */
1706
1707      /* If the state has the halt node, the state is a halt state.  */
1708      if (type == END_OF_RE)
1709        newstate->halt = 1;
1710      else if (type == OP_BACK_REF)
1711        newstate->has_backref = 1;
1712
1713      if (constraint)
1714        {
1715          if (newstate->entrance_nodes == &newstate->nodes)
1716            {
1717              newstate->entrance_nodes = re_malloc (re_node_set, 1);
1718              if (BE (newstate->entrance_nodes == NULL, 0))
1719                {
1720                  free_state (newstate);
1721                  return NULL;
1722                }
1723              if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1724                  != REG_NOERROR)
1725                return NULL;
1726              nctx_nodes = 0;
1727              newstate->has_constraint = 1;
1728            }
1729
1730          if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731            {
1732              re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733              ++nctx_nodes;
1734            }
1735        }
1736    }
1737  err = register_state (dfa, newstate, hash);
1738  if (BE (err != REG_NOERROR, 0))
1739    {
1740      free_state (newstate);
1741      newstate = NULL;
1742    }
1743  return  newstate;
1744}