compat / regex / regex_internal.con commit Merge branch 'jh/memihash-opt' (bc27a2e)
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, see
  18   <http://www.gnu.org/licenses/>.  */
  19
  20static void re_string_construct_common (const char *str, int len,
  21                                        re_string_t *pstr,
  22                                        RE_TRANSLATE_TYPE trans, int icase,
  23                                        const re_dfa_t *dfa) internal_function;
  24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                          const re_node_set *nodes,
  26                                          unsigned int hash) internal_function;
  27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                          const re_node_set *nodes,
  29                                          unsigned int context,
  30                                          unsigned int hash) internal_function;
  31
  32#ifdef GAWK
  33#undef MAX      /* safety */
  34static int
  35MAX(size_t a, size_t b)
  36{
  37        return (a > b ? a : b);
  38}
  39#endif
  40\f
  41/* Functions for string operation.  */
  42
  43/* This function allocate the buffers.  It is necessary to call
  44   re_string_reconstruct before using the object.  */
  45
  46static reg_errcode_t
  47internal_function
  48re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  49                    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  50{
  51  reg_errcode_t ret;
  52  int init_buf_len;
  53
  54  /* Ensure at least one character fits into the buffers.  */
  55  if (init_len < dfa->mb_cur_max)
  56    init_len = dfa->mb_cur_max;
  57  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  58  re_string_construct_common (str, len, pstr, trans, icase, dfa);
  59
  60  ret = re_string_realloc_buffers (pstr, init_buf_len);
  61  if (BE (ret != REG_NOERROR, 0))
  62    return ret;
  63
  64  pstr->word_char = dfa->word_char;
  65  pstr->word_ops_used = dfa->word_ops_used;
  66  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  67  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  68  pstr->valid_raw_len = pstr->valid_len;
  69  return REG_NOERROR;
  70}
  71
  72/* This function allocate the buffers, and initialize them.  */
  73
  74static reg_errcode_t
  75internal_function
  76re_string_construct (re_string_t *pstr, const char *str, int len,
  77                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  78{
  79  reg_errcode_t ret;
  80  memset (pstr, '\0', sizeof (re_string_t));
  81  re_string_construct_common (str, len, pstr, trans, icase, dfa);
  82
  83  if (len > 0)
  84    {
  85      ret = re_string_realloc_buffers (pstr, len + 1);
  86      if (BE (ret != REG_NOERROR, 0))
  87        return ret;
  88    }
  89  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  90
  91  if (icase)
  92    {
  93#ifdef RE_ENABLE_I18N
  94      if (dfa->mb_cur_max > 1)
  95        {
  96          while (1)
  97            {
  98              ret = build_wcs_upper_buffer (pstr);
  99              if (BE (ret != REG_NOERROR, 0))
 100                return ret;
 101              if (pstr->valid_raw_len >= len)
 102                break;
 103              if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 104                break;
 105              ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 106              if (BE (ret != REG_NOERROR, 0))
 107                return ret;
 108            }
 109        }
 110      else
 111#endif /* RE_ENABLE_I18N  */
 112        build_upper_buffer (pstr);
 113    }
 114  else
 115    {
 116#ifdef RE_ENABLE_I18N
 117      if (dfa->mb_cur_max > 1)
 118        build_wcs_buffer (pstr);
 119      else
 120#endif /* RE_ENABLE_I18N  */
 121        {
 122          if (trans != NULL)
 123            re_string_translate_buffer (pstr);
 124          else
 125            {
 126              pstr->valid_len = pstr->bufs_len;
 127              pstr->valid_raw_len = pstr->bufs_len;
 128            }
 129        }
 130    }
 131
 132  return REG_NOERROR;
 133}
 134
 135/* Helper functions for re_string_allocate, and re_string_construct.  */
 136
 137static reg_errcode_t
 138internal_function
 139re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 140{
 141#ifdef RE_ENABLE_I18N
 142  if (pstr->mb_cur_max > 1)
 143    {
 144      wint_t *new_wcs;
 145
 146      /* Avoid overflow in realloc.  */
 147      const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
 148      if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 149        return REG_ESPACE;
 150
 151      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 152      if (BE (new_wcs == NULL, 0))
 153        return REG_ESPACE;
 154      pstr->wcs = new_wcs;
 155      if (pstr->offsets != NULL)
 156        {
 157          int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 158          if (BE (new_offsets == NULL, 0))
 159            return REG_ESPACE;
 160          pstr->offsets = new_offsets;
 161        }
 162    }
 163#endif /* RE_ENABLE_I18N  */
 164  if (pstr->mbs_allocated)
 165    {
 166      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 167                                           new_buf_len);
 168      if (BE (new_mbs == NULL, 0))
 169        return REG_ESPACE;
 170      pstr->mbs = new_mbs;
 171    }
 172  pstr->bufs_len = new_buf_len;
 173  return REG_NOERROR;
 174}
 175
 176
 177static void
 178internal_function
 179re_string_construct_common (const char *str, int len, re_string_t *pstr,
 180                            RE_TRANSLATE_TYPE trans, int icase,
 181                            const re_dfa_t *dfa)
 182{
 183  pstr->raw_mbs = (const unsigned char *) str;
 184  pstr->len = len;
 185  pstr->raw_len = len;
 186  pstr->trans = trans;
 187  pstr->icase = icase ? 1 : 0;
 188  pstr->mbs_allocated = (trans != NULL || icase);
 189  pstr->mb_cur_max = dfa->mb_cur_max;
 190  pstr->is_utf8 = dfa->is_utf8;
 191  pstr->map_notascii = dfa->map_notascii;
 192  pstr->stop = pstr->len;
 193  pstr->raw_stop = pstr->stop;
 194}
 195
 196#ifdef RE_ENABLE_I18N
 197
 198/* Build wide character buffer PSTR->WCS.
 199   If the byte sequence of the string are:
 200     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 201   Then wide character buffer will be:
 202     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 203   We use WEOF for padding, they indicate that the position isn't
 204   a first byte of a multibyte character.
 205
 206   Note that this function assumes PSTR->VALID_LEN elements are already
 207   built and starts from PSTR->VALID_LEN.  */
 208
 209static void
 210internal_function
 211build_wcs_buffer (re_string_t *pstr)
 212{
 213#ifdef _LIBC
 214  unsigned char buf[MB_LEN_MAX];
 215  assert (MB_LEN_MAX >= pstr->mb_cur_max);
 216#else
 217  unsigned char buf[64];
 218#endif
 219  mbstate_t prev_st;
 220  int byte_idx, end_idx, remain_len;
 221  size_t mbclen;
 222
 223  /* Build the buffers from pstr->valid_len to either pstr->len or
 224     pstr->bufs_len.  */
 225  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 226  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 227    {
 228      wchar_t wc;
 229      const char *p;
 230
 231      remain_len = end_idx - byte_idx;
 232      prev_st = pstr->cur_state;
 233      /* Apply the translation if we need.  */
 234      if (BE (pstr->trans != NULL, 0))
 235        {
 236          int i, ch;
 237
 238          for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 239            {
 240              ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 241              buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 242            }
 243          p = (const char *) buf;
 244        }
 245      else
 246        p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 247      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 248      if (BE (mbclen == (size_t) -2, 0))
 249        {
 250          /* The buffer doesn't have enough space, finish to build.  */
 251          pstr->cur_state = prev_st;
 252          break;
 253        }
 254      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 255        {
 256          /* We treat these cases as a singlebyte character.  */
 257          mbclen = 1;
 258          wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 259          if (BE (pstr->trans != NULL, 0))
 260            wc = pstr->trans[wc];
 261          pstr->cur_state = prev_st;
 262        }
 263
 264      /* Write wide character and padding.  */
 265      pstr->wcs[byte_idx++] = wc;
 266      /* Write paddings.  */
 267      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 268        pstr->wcs[byte_idx++] = WEOF;
 269    }
 270  pstr->valid_len = byte_idx;
 271  pstr->valid_raw_len = byte_idx;
 272}
 273
 274/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 275   but for REG_ICASE.  */
 276
 277static reg_errcode_t
 278internal_function
 279build_wcs_upper_buffer (re_string_t *pstr)
 280{
 281  mbstate_t prev_st;
 282  int src_idx, byte_idx, end_idx, remain_len;
 283  size_t mbclen;
 284#ifdef _LIBC
 285  char buf[MB_LEN_MAX];
 286  assert (MB_LEN_MAX >= pstr->mb_cur_max);
 287#else
 288  char buf[64];
 289#endif
 290
 291  byte_idx = pstr->valid_len;
 292  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 293
 294  /* The following optimization assumes that ASCII characters can be
 295     mapped to wide characters with a simple cast.  */
 296  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 297    {
 298      while (byte_idx < end_idx)
 299        {
 300          wchar_t wc;
 301
 302          if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 303              && mbsinit (&pstr->cur_state))
 304            {
 305              /* In case of a singlebyte character.  */
 306              pstr->mbs[byte_idx]
 307                = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 308              /* The next step uses the assumption that wchar_t is encoded
 309                 ASCII-safe: all ASCII values can be converted like this.  */
 310              pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 311              ++byte_idx;
 312              continue;
 313            }
 314
 315          remain_len = end_idx - byte_idx;
 316          prev_st = pstr->cur_state;
 317          mbclen = __mbrtowc (&wc,
 318                              ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 319                               + byte_idx), remain_len, &pstr->cur_state);
 320          if (BE (mbclen + 2 > 2, 1))
 321            {
 322              wchar_t wcu = wc;
 323              if (iswlower (wc))
 324                {
 325                  size_t mbcdlen;
 326
 327                  wcu = towupper (wc);
 328                  mbcdlen = wcrtomb (buf, wcu, &prev_st);
 329                  if (BE (mbclen == mbcdlen, 1))
 330                    memcpy (pstr->mbs + byte_idx, buf, mbclen);
 331                  else
 332                    {
 333                      src_idx = byte_idx;
 334                      goto offsets_needed;
 335                    }
 336                }
 337              else
 338                memcpy (pstr->mbs + byte_idx,
 339                        pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 340              pstr->wcs[byte_idx++] = wcu;
 341              /* Write paddings.  */
 342              for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 343                pstr->wcs[byte_idx++] = WEOF;
 344            }
 345          else if (mbclen == (size_t) -1 || mbclen == 0)
 346            {
 347              /* It is an invalid character or '\0'.  Just use the byte.  */
 348              int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 349              pstr->mbs[byte_idx] = ch;
 350              /* And also cast it to wide char.  */
 351              pstr->wcs[byte_idx++] = (wchar_t) ch;
 352              if (BE (mbclen == (size_t) -1, 0))
 353                pstr->cur_state = prev_st;
 354            }
 355          else
 356            {
 357              /* The buffer doesn't have enough space, finish to build.  */
 358              pstr->cur_state = prev_st;
 359              break;
 360            }
 361        }
 362      pstr->valid_len = byte_idx;
 363      pstr->valid_raw_len = byte_idx;
 364      return REG_NOERROR;
 365    }
 366  else
 367    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 368      {
 369        wchar_t wc;
 370        const char *p;
 371      offsets_needed:
 372        remain_len = end_idx - byte_idx;
 373        prev_st = pstr->cur_state;
 374        if (BE (pstr->trans != NULL, 0))
 375          {
 376            int i, ch;
 377
 378            for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 379              {
 380                ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 381                buf[i] = pstr->trans[ch];
 382              }
 383            p = (const char *) buf;
 384          }
 385        else
 386          p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 387        mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 388        if (BE (mbclen + 2 > 2, 1))
 389          {
 390            wchar_t wcu = wc;
 391            if (iswlower (wc))
 392              {
 393                size_t mbcdlen;
 394
 395                wcu = towupper (wc);
 396                mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 397                if (BE (mbclen == mbcdlen, 1))
 398                  memcpy (pstr->mbs + byte_idx, buf, mbclen);
 399                else if (mbcdlen != (size_t) -1)
 400                  {
 401                    size_t i;
 402
 403                    if (byte_idx + mbcdlen > pstr->bufs_len)
 404                      {
 405                        pstr->cur_state = prev_st;
 406                        break;
 407                      }
 408
 409                    if (pstr->offsets == NULL)
 410                      {
 411                        pstr->offsets = re_malloc (int, pstr->bufs_len);
 412
 413                        if (pstr->offsets == NULL)
 414                          return REG_ESPACE;
 415                      }
 416                    if (!pstr->offsets_needed)
 417                      {
 418                        for (i = 0; i < (size_t) byte_idx; ++i)
 419                          pstr->offsets[i] = i;
 420                        pstr->offsets_needed = 1;
 421                      }
 422
 423                    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 424                    pstr->wcs[byte_idx] = wcu;
 425                    pstr->offsets[byte_idx] = src_idx;
 426                    for (i = 1; i < mbcdlen; ++i)
 427                      {
 428                        pstr->offsets[byte_idx + i]
 429                          = src_idx + (i < mbclen ? i : mbclen - 1);
 430                        pstr->wcs[byte_idx + i] = WEOF;
 431                      }
 432                    pstr->len += mbcdlen - mbclen;
 433                    if (pstr->raw_stop > src_idx)
 434                      pstr->stop += mbcdlen - mbclen;
 435                    end_idx = (pstr->bufs_len > pstr->len)
 436                              ? pstr->len : pstr->bufs_len;
 437                    byte_idx += mbcdlen;
 438                    src_idx += mbclen;
 439                    continue;
 440                  }
 441                else
 442                  memcpy (pstr->mbs + byte_idx, p, mbclen);
 443              }
 444            else
 445              memcpy (pstr->mbs + byte_idx, p, mbclen);
 446
 447            if (BE (pstr->offsets_needed != 0, 0))
 448              {
 449                size_t i;
 450                for (i = 0; i < mbclen; ++i)
 451                  pstr->offsets[byte_idx + i] = src_idx + i;
 452              }
 453            src_idx += mbclen;
 454
 455            pstr->wcs[byte_idx++] = wcu;
 456            /* Write paddings.  */
 457            for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 458              pstr->wcs[byte_idx++] = WEOF;
 459          }
 460        else if (mbclen == (size_t) -1 || mbclen == 0)
 461          {
 462            /* It is an invalid character or '\0'.  Just use the byte.  */
 463            int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 464
 465            if (BE (pstr->trans != NULL, 0))
 466              ch = pstr->trans [ch];
 467            pstr->mbs[byte_idx] = ch;
 468
 469            if (BE (pstr->offsets_needed != 0, 0))
 470              pstr->offsets[byte_idx] = src_idx;
 471            ++src_idx;
 472
 473            /* And also cast it to wide char.  */
 474            pstr->wcs[byte_idx++] = (wchar_t) ch;
 475            if (BE (mbclen == (size_t) -1, 0))
 476              pstr->cur_state = prev_st;
 477          }
 478        else
 479          {
 480            /* The buffer doesn't have enough space, finish to build.  */
 481            pstr->cur_state = prev_st;
 482            break;
 483          }
 484      }
 485  pstr->valid_len = byte_idx;
 486  pstr->valid_raw_len = src_idx;
 487  return REG_NOERROR;
 488}
 489
 490/* Skip characters until the index becomes greater than NEW_RAW_IDX.
 491   Return the index.  */
 492
 493static int
 494internal_function
 495re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 496{
 497  mbstate_t prev_st;
 498  int rawbuf_idx;
 499  size_t mbclen;
 500  wint_t wc = WEOF;
 501
 502  /* Skip the characters which are not necessary to check.  */
 503  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 504       rawbuf_idx < new_raw_idx;)
 505    {
 506      wchar_t wc2;
 507      int remain_len = pstr->len - rawbuf_idx;
 508      prev_st = pstr->cur_state;
 509      mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 510                          remain_len, &pstr->cur_state);
 511      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 512        {
 513          /* We treat these cases as a single byte character.  */
 514          if (mbclen == 0 || remain_len == 0)
 515            wc = L'\0';
 516          else
 517            wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 518          mbclen = 1;
 519          pstr->cur_state = prev_st;
 520        }
 521      else
 522        wc = (wint_t) wc2;
 523      /* Then proceed the next character.  */
 524      rawbuf_idx += mbclen;
 525    }
 526  *last_wc = (wint_t) wc;
 527  return rawbuf_idx;
 528}
 529#endif /* RE_ENABLE_I18N  */
 530
 531/* Build the buffer PSTR->MBS, and apply the translation if we need.
 532   This function is used in case of REG_ICASE.  */
 533
 534static void
 535internal_function
 536build_upper_buffer (re_string_t *pstr)
 537{
 538  int char_idx, end_idx;
 539  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 540
 541  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 542    {
 543      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 544      if (BE (pstr->trans != NULL, 0))
 545        ch = pstr->trans[ch];
 546      if (islower (ch))
 547        pstr->mbs[char_idx] = toupper (ch);
 548      else
 549        pstr->mbs[char_idx] = ch;
 550    }
 551  pstr->valid_len = char_idx;
 552  pstr->valid_raw_len = char_idx;
 553}
 554
 555/* Apply TRANS to the buffer in PSTR.  */
 556
 557static void
 558internal_function
 559re_string_translate_buffer (re_string_t *pstr)
 560{
 561  int buf_idx, end_idx;
 562  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 563
 564  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 565    {
 566      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 567      pstr->mbs[buf_idx] = pstr->trans[ch];
 568    }
 569
 570  pstr->valid_len = buf_idx;
 571  pstr->valid_raw_len = buf_idx;
 572}
 573
 574/* This function re-construct the buffers.
 575   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 576   convert to upper case in case of REG_ICASE, apply translation.  */
 577
 578static reg_errcode_t
 579internal_function
 580re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 581{
 582  int offset = idx - pstr->raw_mbs_idx;
 583  if (BE (offset < 0, 0))
 584    {
 585      /* Reset buffer.  */
 586#ifdef RE_ENABLE_I18N
 587      if (pstr->mb_cur_max > 1)
 588        memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 589#endif /* RE_ENABLE_I18N */
 590      pstr->len = pstr->raw_len;
 591      pstr->stop = pstr->raw_stop;
 592      pstr->valid_len = 0;
 593      pstr->raw_mbs_idx = 0;
 594      pstr->valid_raw_len = 0;
 595      pstr->offsets_needed = 0;
 596      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 597                           : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 598      if (!pstr->mbs_allocated)
 599        pstr->mbs = (unsigned char *) pstr->raw_mbs;
 600      offset = idx;
 601    }
 602
 603  if (BE (offset != 0, 1))
 604    {
 605      /* Should the already checked characters be kept?  */
 606      if (BE (offset < pstr->valid_raw_len, 1))
 607        {
 608          /* Yes, move them to the front of the buffer.  */
 609#ifdef RE_ENABLE_I18N
 610          if (BE (pstr->offsets_needed, 0))
 611            {
 612              int low = 0, high = pstr->valid_len, mid;
 613              do
 614                {
 615                  mid = low + (high - low) / 2;
 616                  if (pstr->offsets[mid] > offset)
 617                    high = mid;
 618                  else if (pstr->offsets[mid] < offset)
 619                    low = mid + 1;
 620                  else
 621                    break;
 622                }
 623              while (low < high);
 624              if (pstr->offsets[mid] < offset)
 625                ++mid;
 626              pstr->tip_context = re_string_context_at (pstr, mid - 1,
 627                                                        eflags);
 628              /* This can be quite complicated, so handle specially
 629                 only the common and easy case where the character with
 630                 different length representation of lower and upper
 631                 case is present at or after offset.  */
 632              if (pstr->valid_len > offset
 633                  && mid == offset && pstr->offsets[mid] == offset)
 634                {
 635                  memmove (pstr->wcs, pstr->wcs + offset,
 636                           (pstr->valid_len - offset) * sizeof (wint_t));
 637                  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 638                  pstr->valid_len -= offset;
 639                  pstr->valid_raw_len -= offset;
 640                  for (low = 0; low < pstr->valid_len; low++)
 641                    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 642                }
 643              else
 644                {
 645                  /* Otherwise, just find out how long the partial multibyte
 646                     character at offset is and fill it with WEOF/255.  */
 647                  pstr->len = pstr->raw_len - idx + offset;
 648                  pstr->stop = pstr->raw_stop - idx + offset;
 649                  pstr->offsets_needed = 0;
 650                  while (mid > 0 && pstr->offsets[mid - 1] == offset)
 651                    --mid;
 652                  while (mid < pstr->valid_len)
 653                    if (pstr->wcs[mid] != WEOF)
 654                      break;
 655                    else
 656                      ++mid;
 657                  if (mid == pstr->valid_len)
 658                    pstr->valid_len = 0;
 659                  else
 660                    {
 661                      pstr->valid_len = pstr->offsets[mid] - offset;
 662                      if (pstr->valid_len)
 663                        {
 664                          for (low = 0; low < pstr->valid_len; ++low)
 665                            pstr->wcs[low] = WEOF;
 666                          memset (pstr->mbs, 255, pstr->valid_len);
 667                        }
 668                    }
 669                  pstr->valid_raw_len = pstr->valid_len;
 670                }
 671            }
 672          else
 673#endif
 674            {
 675              pstr->tip_context = re_string_context_at (pstr, offset - 1,
 676                                                        eflags);
 677#ifdef RE_ENABLE_I18N
 678              if (pstr->mb_cur_max > 1)
 679                memmove (pstr->wcs, pstr->wcs + offset,
 680                         (pstr->valid_len - offset) * sizeof (wint_t));
 681#endif /* RE_ENABLE_I18N */
 682              if (BE (pstr->mbs_allocated, 0))
 683                memmove (pstr->mbs, pstr->mbs + offset,
 684                         pstr->valid_len - offset);
 685              pstr->valid_len -= offset;
 686              pstr->valid_raw_len -= offset;
 687#if DEBUG
 688              assert (pstr->valid_len > 0);
 689#endif
 690            }
 691        }
 692      else
 693        {
 694#ifdef RE_ENABLE_I18N
 695          /* No, skip all characters until IDX.  */
 696          int prev_valid_len = pstr->valid_len;
 697
 698          if (BE (pstr->offsets_needed, 0))
 699            {
 700              pstr->len = pstr->raw_len - idx + offset;
 701              pstr->stop = pstr->raw_stop - idx + offset;
 702              pstr->offsets_needed = 0;
 703            }
 704#endif
 705          pstr->valid_len = 0;
 706#ifdef RE_ENABLE_I18N
 707          if (pstr->mb_cur_max > 1)
 708            {
 709              int wcs_idx;
 710              wint_t wc = WEOF;
 711
 712              if (pstr->is_utf8)
 713                {
 714                  const unsigned char *raw, *p, *end;
 715
 716                  /* Special case UTF-8.  Multi-byte chars start with any
 717                     byte other than 0x80 - 0xbf.  */
 718                  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 719                  end = raw + (offset - pstr->mb_cur_max);
 720                  if (end < pstr->raw_mbs)
 721                    end = pstr->raw_mbs;
 722                  p = raw + offset - 1;
 723#ifdef _LIBC
 724                  /* We know the wchar_t encoding is UCS4, so for the simple
 725                     case, ASCII characters, skip the conversion step.  */
 726                  if (isascii (*p) && BE (pstr->trans == NULL, 1))
 727                    {
 728                      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 729                      /* pstr->valid_len = 0; */
 730                      wc = (wchar_t) *p;
 731                    }
 732                  else
 733#endif
 734                    for (; p >= end; --p)
 735                      if ((*p & 0xc0) != 0x80)
 736                        {
 737                          mbstate_t cur_state;
 738                          wchar_t wc2;
 739                          int mlen = raw + pstr->len - p;
 740                          unsigned char buf[6];
 741                          size_t mbclen;
 742
 743                          if (BE (pstr->trans != NULL, 0))
 744                            {
 745                              int i = mlen < 6 ? mlen : 6;
 746                              while (--i >= 0)
 747                                buf[i] = pstr->trans[p[i]];
 748                            }
 749                          /* XXX Don't use mbrtowc, we know which conversion
 750                             to use (UTF-8 -> UCS4).  */
 751                          memset (&cur_state, 0, sizeof (cur_state));
 752                          mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 753                                              &cur_state);
 754                          if (raw + offset - p <= mbclen
 755                              && mbclen < (size_t) -2)
 756                            {
 757                              memset (&pstr->cur_state, '\0',
 758                                      sizeof (mbstate_t));
 759                              pstr->valid_len = mbclen - (raw + offset - p);
 760                              wc = wc2;
 761                            }
 762                          break;
 763                        }
 764                }
 765
 766              if (wc == WEOF)
 767                pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 768              if (wc == WEOF)
 769                pstr->tip_context
 770                  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 771              else
 772                pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 773                                      && IS_WIDE_WORD_CHAR (wc))
 774                                     ? CONTEXT_WORD
 775                                     : ((IS_WIDE_NEWLINE (wc)
 776                                         && pstr->newline_anchor)
 777                                        ? CONTEXT_NEWLINE : 0));
 778              if (BE (pstr->valid_len, 0))
 779                {
 780                  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 781                    pstr->wcs[wcs_idx] = WEOF;
 782                  if (pstr->mbs_allocated)
 783                    memset (pstr->mbs, 255, pstr->valid_len);
 784                }
 785              pstr->valid_raw_len = pstr->valid_len;
 786            }
 787          else
 788#endif /* RE_ENABLE_I18N */
 789            {
 790              int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 791              pstr->valid_raw_len = 0;
 792              if (pstr->trans)
 793                c = pstr->trans[c];
 794              pstr->tip_context = (bitset_contain (pstr->word_char, c)
 795                                   ? CONTEXT_WORD
 796                                   : ((IS_NEWLINE (c) && pstr->newline_anchor)
 797                                      ? CONTEXT_NEWLINE : 0));
 798            }
 799        }
 800      if (!BE (pstr->mbs_allocated, 0))
 801        pstr->mbs += offset;
 802    }
 803  pstr->raw_mbs_idx = idx;
 804  pstr->len -= offset;
 805  pstr->stop -= offset;
 806
 807  /* Then build the buffers.  */
 808#ifdef RE_ENABLE_I18N
 809  if (pstr->mb_cur_max > 1)
 810    {
 811      if (pstr->icase)
 812        {
 813          reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 814          if (BE (ret != REG_NOERROR, 0))
 815            return ret;
 816        }
 817      else
 818        build_wcs_buffer (pstr);
 819    }
 820  else
 821#endif /* RE_ENABLE_I18N */
 822    if (BE (pstr->mbs_allocated, 0))
 823      {
 824        if (pstr->icase)
 825          build_upper_buffer (pstr);
 826        else if (pstr->trans != NULL)
 827          re_string_translate_buffer (pstr);
 828      }
 829    else
 830      pstr->valid_len = pstr->len;
 831
 832  pstr->cur_idx = 0;
 833  return REG_NOERROR;
 834}
 835
 836static unsigned char
 837internal_function __attribute ((pure))
 838re_string_peek_byte_case (const re_string_t *pstr, int idx)
 839{
 840  int ch, off;
 841
 842  /* Handle the common (easiest) cases first.  */
 843  if (BE (!pstr->mbs_allocated, 1))
 844    return re_string_peek_byte (pstr, idx);
 845
 846#ifdef RE_ENABLE_I18N
 847  if (pstr->mb_cur_max > 1
 848      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 849    return re_string_peek_byte (pstr, idx);
 850#endif
 851
 852  off = pstr->cur_idx + idx;
 853#ifdef RE_ENABLE_I18N
 854  if (pstr->offsets_needed)
 855    off = pstr->offsets[off];
 856#endif
 857
 858  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 859
 860#ifdef RE_ENABLE_I18N
 861  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 862     this function returns CAPITAL LETTER I instead of first byte of
 863     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 864     since peek_byte_case doesn't advance cur_idx in any way.  */
 865  if (pstr->offsets_needed && !isascii (ch))
 866    return re_string_peek_byte (pstr, idx);
 867#endif
 868
 869  return ch;
 870}
 871
 872static unsigned char
 873internal_function __attribute ((pure))
 874re_string_fetch_byte_case (re_string_t *pstr)
 875{
 876  if (BE (!pstr->mbs_allocated, 1))
 877    return re_string_fetch_byte (pstr);
 878
 879#ifdef RE_ENABLE_I18N
 880  if (pstr->offsets_needed)
 881    {
 882      int off, ch;
 883
 884      /* For tr_TR.UTF-8 [[:islower:]] there is
 885         [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 886         in that case the whole multi-byte character and return
 887         the original letter.  On the other side, with
 888         [[: DOTLESS SMALL LETTER I return [[:I, as doing
 889         anything else would complicate things too much.  */
 890
 891      if (!re_string_first_byte (pstr, pstr->cur_idx))
 892        return re_string_fetch_byte (pstr);
 893
 894      off = pstr->offsets[pstr->cur_idx];
 895      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 896
 897      if (! isascii (ch))
 898        return re_string_fetch_byte (pstr);
 899
 900      re_string_skip_bytes (pstr,
 901                            re_string_char_size_at (pstr, pstr->cur_idx));
 902      return ch;
 903    }
 904#endif
 905
 906  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 907}
 908
 909static void
 910internal_function
 911re_string_destruct (re_string_t *pstr)
 912{
 913#ifdef RE_ENABLE_I18N
 914  re_free (pstr->wcs);
 915  re_free (pstr->offsets);
 916#endif /* RE_ENABLE_I18N  */
 917  if (pstr->mbs_allocated)
 918    re_free (pstr->mbs);
 919}
 920
 921/* Return the context at IDX in INPUT.  */
 922
 923static unsigned int
 924internal_function
 925re_string_context_at (const re_string_t *input, int idx, int eflags)
 926{
 927  int c;
 928  if (BE (idx < 0, 0))
 929    /* In this case, we use the value stored in input->tip_context,
 930       since we can't know the character in input->mbs[-1] here.  */
 931    return input->tip_context;
 932  if (BE (idx == input->len, 0))
 933    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 934            : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 935#ifdef RE_ENABLE_I18N
 936  if (input->mb_cur_max > 1)
 937    {
 938      wint_t wc;
 939      int wc_idx = idx;
 940      while(input->wcs[wc_idx] == WEOF)
 941        {
 942#ifdef DEBUG
 943          /* It must not happen.  */
 944          assert (wc_idx >= 0);
 945#endif
 946          --wc_idx;
 947          if (wc_idx < 0)
 948            return input->tip_context;
 949        }
 950      wc = input->wcs[wc_idx];
 951      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 952        return CONTEXT_WORD;
 953      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 954              ? CONTEXT_NEWLINE : 0);
 955    }
 956  else
 957#endif
 958    {
 959      c = re_string_byte_at (input, idx);
 960      if (bitset_contain (input->word_char, c))
 961        return CONTEXT_WORD;
 962      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 963    }
 964}
 965\f
 966/* Functions for set operation.  */
 967
 968static reg_errcode_t
 969internal_function
 970re_node_set_alloc (re_node_set *set, int size)
 971{
 972  /*
 973   * ADR: valgrind says size can be 0, which then doesn't
 974   * free the block of size 0.  Harumph. This seems
 975   * to work ok, though.
 976   */
 977  if (size == 0)
 978    {
 979       memset(set, 0, sizeof(*set));
 980       return REG_NOERROR;
 981    }
 982  set->alloc = size;
 983  set->nelem = 0;
 984  set->elems = re_malloc (int, size);
 985  if (BE (set->elems == NULL, 0))
 986    return REG_ESPACE;
 987  return REG_NOERROR;
 988}
 989
 990static reg_errcode_t
 991internal_function
 992re_node_set_init_1 (re_node_set *set, int elem)
 993{
 994  set->alloc = 1;
 995  set->nelem = 1;
 996  set->elems = re_malloc (int, 1);
 997  if (BE (set->elems == NULL, 0))
 998    {
 999      set->alloc = set->nelem = 0;
1000      return REG_ESPACE;
1001    }
1002  set->elems[0] = elem;
1003  return REG_NOERROR;
1004}
1005
1006static reg_errcode_t
1007internal_function
1008re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1009{
1010  set->alloc = 2;
1011  set->elems = re_malloc (int, 2);
1012  if (BE (set->elems == NULL, 0))
1013    return REG_ESPACE;
1014  if (elem1 == elem2)
1015    {
1016      set->nelem = 1;
1017      set->elems[0] = elem1;
1018    }
1019  else
1020    {
1021      set->nelem = 2;
1022      if (elem1 < elem2)
1023        {
1024          set->elems[0] = elem1;
1025          set->elems[1] = elem2;
1026        }
1027      else
1028        {
1029          set->elems[0] = elem2;
1030          set->elems[1] = elem1;
1031        }
1032    }
1033  return REG_NOERROR;
1034}
1035
1036static reg_errcode_t
1037internal_function
1038re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1039{
1040  dest->nelem = src->nelem;
1041  if (src->nelem > 0)
1042    {
1043      dest->alloc = dest->nelem;
1044      dest->elems = re_malloc (int, dest->alloc);
1045      if (BE (dest->elems == NULL, 0))
1046        {
1047          dest->alloc = dest->nelem = 0;
1048          return REG_ESPACE;
1049        }
1050      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1051    }
1052  else
1053    re_node_set_init_empty (dest);
1054  return REG_NOERROR;
1055}
1056
1057/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1058   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1059   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1060
1061static reg_errcode_t
1062internal_function
1063re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1064                           const re_node_set *src2)
1065{
1066  int i1, i2, is, id, delta, sbase;
1067  if (src1->nelem == 0 || src2->nelem == 0)
1068    return REG_NOERROR;
1069
1070  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1071     conservative estimate.  */
1072  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1073    {
1074      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1075      int *new_elems = re_realloc (dest->elems, int, new_alloc);
1076      if (BE (new_elems == NULL, 0))
1077        return REG_ESPACE;
1078      dest->elems = new_elems;
1079      dest->alloc = new_alloc;
1080    }
1081
1082  /* Find the items in the intersection of SRC1 and SRC2, and copy
1083     into the top of DEST those that are not already in DEST itself.  */
1084  sbase = dest->nelem + src1->nelem + src2->nelem;
1085  i1 = src1->nelem - 1;
1086  i2 = src2->nelem - 1;
1087  id = dest->nelem - 1;
1088  for (;;)
1089    {
1090      if (src1->elems[i1] == src2->elems[i2])
1091        {
1092          /* Try to find the item in DEST.  Maybe we could binary search?  */
1093          while (id >= 0 && dest->elems[id] > src1->elems[i1])
1094            --id;
1095
1096          if (id < 0 || dest->elems[id] != src1->elems[i1])
1097            dest->elems[--sbase] = src1->elems[i1];
1098
1099          if (--i1 < 0 || --i2 < 0)
1100            break;
1101        }
1102
1103      /* Lower the highest of the two items.  */
1104      else if (src1->elems[i1] < src2->elems[i2])
1105        {
1106          if (--i2 < 0)
1107            break;
1108        }
1109      else
1110        {
1111          if (--i1 < 0)
1112            break;
1113        }
1114    }
1115
1116  id = dest->nelem - 1;
1117  is = dest->nelem + src1->nelem + src2->nelem - 1;
1118  delta = is - sbase + 1;
1119
1120  /* Now copy.  When DELTA becomes zero, the remaining
1121     DEST elements are already in place; this is more or
1122     less the same loop that is in re_node_set_merge.  */
1123  dest->nelem += delta;
1124  if (delta > 0 && id >= 0)
1125    for (;;)
1126      {
1127        if (dest->elems[is] > dest->elems[id])
1128          {
1129            /* Copy from the top.  */
1130            dest->elems[id + delta--] = dest->elems[is--];
1131            if (delta == 0)
1132              break;
1133          }
1134        else
1135          {
1136            /* Slide from the bottom.  */
1137            dest->elems[id + delta] = dest->elems[id];
1138            if (--id < 0)
1139              break;
1140          }
1141      }
1142
1143  /* Copy remaining SRC elements.  */
1144  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1145
1146  return REG_NOERROR;
1147}
1148
1149/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1150   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1151
1152static reg_errcode_t
1153internal_function
1154re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1155                        const re_node_set *src2)
1156{
1157  int i1, i2, id;
1158  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1159    {
1160      dest->alloc = src1->nelem + src2->nelem;
1161      dest->elems = re_malloc (int, dest->alloc);
1162      if (BE (dest->elems == NULL, 0))
1163        return REG_ESPACE;
1164    }
1165  else
1166    {
1167      if (src1 != NULL && src1->nelem > 0)
1168        return re_node_set_init_copy (dest, src1);
1169      else if (src2 != NULL && src2->nelem > 0)
1170        return re_node_set_init_copy (dest, src2);
1171      else
1172        re_node_set_init_empty (dest);
1173      return REG_NOERROR;
1174    }
1175  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1176    {
1177      if (src1->elems[i1] > src2->elems[i2])
1178        {
1179          dest->elems[id++] = src2->elems[i2++];
1180          continue;
1181        }
1182      if (src1->elems[i1] == src2->elems[i2])
1183        ++i2;
1184      dest->elems[id++] = src1->elems[i1++];
1185    }
1186  if (i1 < src1->nelem)
1187    {
1188      memcpy (dest->elems + id, src1->elems + i1,
1189             (src1->nelem - i1) * sizeof (int));
1190      id += src1->nelem - i1;
1191    }
1192  else if (i2 < src2->nelem)
1193    {
1194      memcpy (dest->elems + id, src2->elems + i2,
1195             (src2->nelem - i2) * sizeof (int));
1196      id += src2->nelem - i2;
1197    }
1198  dest->nelem = id;
1199  return REG_NOERROR;
1200}
1201
1202/* Calculate the union set of the sets DEST and SRC. And store it to
1203   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1204
1205static reg_errcode_t
1206internal_function
1207re_node_set_merge (re_node_set *dest, const re_node_set *src)
1208{
1209  int is, id, sbase, delta;
1210  if (src == NULL || src->nelem == 0)
1211    return REG_NOERROR;
1212  if (dest->alloc < 2 * src->nelem + dest->nelem)
1213    {
1214      int new_alloc = 2 * (src->nelem + dest->alloc);
1215      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1216      if (BE (new_buffer == NULL, 0))
1217        return REG_ESPACE;
1218      dest->elems = new_buffer;
1219      dest->alloc = new_alloc;
1220    }
1221
1222  if (BE (dest->nelem == 0, 0))
1223    {
1224      dest->nelem = src->nelem;
1225      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1226      return REG_NOERROR;
1227    }
1228
1229  /* Copy into the top of DEST the items of SRC that are not
1230     found in DEST.  Maybe we could binary search in DEST?  */
1231  for (sbase = dest->nelem + 2 * src->nelem,
1232       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1233    {
1234      if (dest->elems[id] == src->elems[is])
1235        is--, id--;
1236      else if (dest->elems[id] < src->elems[is])
1237        dest->elems[--sbase] = src->elems[is--];
1238      else /* if (dest->elems[id] > src->elems[is]) */
1239        --id;
1240    }
1241
1242  if (is >= 0)
1243    {
1244      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1245      sbase -= is + 1;
1246      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1247    }
1248
1249  id = dest->nelem - 1;
1250  is = dest->nelem + 2 * src->nelem - 1;
1251  delta = is - sbase + 1;
1252  if (delta == 0)
1253    return REG_NOERROR;
1254
1255  /* Now copy.  When DELTA becomes zero, the remaining
1256     DEST elements are already in place.  */
1257  dest->nelem += delta;
1258  for (;;)
1259    {
1260      if (dest->elems[is] > dest->elems[id])
1261        {
1262          /* Copy from the top.  */
1263          dest->elems[id + delta--] = dest->elems[is--];
1264          if (delta == 0)
1265            break;
1266        }
1267      else
1268        {
1269          /* Slide from the bottom.  */
1270          dest->elems[id + delta] = dest->elems[id];
1271          if (--id < 0)
1272            {
1273              /* Copy remaining SRC elements.  */
1274              memcpy (dest->elems, dest->elems + sbase,
1275                      delta * sizeof (int));
1276              break;
1277            }
1278        }
1279    }
1280
1281  return REG_NOERROR;
1282}
1283
1284/* Insert the new element ELEM to the re_node_set* SET.
1285   SET should not already have ELEM.
1286   return -1 if an error has occurred, return 1 otherwise.  */
1287
1288static int
1289internal_function
1290re_node_set_insert (re_node_set *set, int elem)
1291{
1292  int idx;
1293  /* In case the set is empty.  */
1294  if (set->alloc == 0)
1295    {
1296      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1297        return 1;
1298      else
1299        return -1;
1300    }
1301
1302  if (BE (set->nelem, 0) == 0)
1303    {
1304      /* We already guaranteed above that set->alloc != 0.  */
1305      set->elems[0] = elem;
1306      ++set->nelem;
1307      return 1;
1308    }
1309
1310  /* Realloc if we need.  */
1311  if (set->alloc == set->nelem)
1312    {
1313      int *new_elems;
1314      set->alloc = set->alloc * 2;
1315      new_elems = re_realloc (set->elems, int, set->alloc);
1316      if (BE (new_elems == NULL, 0))
1317        return -1;
1318      set->elems = new_elems;
1319    }
1320
1321  /* Move the elements which follows the new element.  Test the
1322     first element separately to skip a check in the inner loop.  */
1323  if (elem < set->elems[0])
1324    {
1325      idx = 0;
1326      for (idx = set->nelem; idx > 0; idx--)
1327        set->elems[idx] = set->elems[idx - 1];
1328    }
1329  else
1330    {
1331      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1332        set->elems[idx] = set->elems[idx - 1];
1333    }
1334
1335  /* Insert the new element.  */
1336  set->elems[idx] = elem;
1337  ++set->nelem;
1338  return 1;
1339}
1340
1341/* Insert the new element ELEM to the re_node_set* SET.
1342   SET should not already have any element greater than or equal to ELEM.
1343   Return -1 if an error has occurred, return 1 otherwise.  */
1344
1345static int
1346internal_function
1347re_node_set_insert_last (re_node_set *set, int elem)
1348{
1349  /* Realloc if we need.  */
1350  if (set->alloc == set->nelem)
1351    {
1352      int *new_elems;
1353      set->alloc = (set->alloc + 1) * 2;
1354      new_elems = re_realloc (set->elems, int, set->alloc);
1355      if (BE (new_elems == NULL, 0))
1356        return -1;
1357      set->elems = new_elems;
1358    }
1359
1360  /* Insert the new element.  */
1361  set->elems[set->nelem++] = elem;
1362  return 1;
1363}
1364
1365/* Compare two node sets SET1 and SET2.
1366   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1367
1368static int
1369internal_function __attribute ((pure))
1370re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1371{
1372  int i;
1373  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1374    return 0;
1375  for (i = set1->nelem ; --i >= 0 ; )
1376    if (set1->elems[i] != set2->elems[i])
1377      return 0;
1378  return 1;
1379}
1380
1381/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1382
1383static int
1384internal_function __attribute ((pure))
1385re_node_set_contains (const re_node_set *set, int elem)
1386{
1387  unsigned int idx, right, mid;
1388  if (set->nelem <= 0)
1389    return 0;
1390
1391  /* Binary search the element.  */
1392  idx = 0;
1393  right = set->nelem - 1;
1394  while (idx < right)
1395    {
1396      mid = idx + (right - idx) / 2;
1397      if (set->elems[mid] < elem)
1398        idx = mid + 1;
1399      else
1400        right = mid;
1401    }
1402  return set->elems[idx] == elem ? idx + 1 : 0;
1403}
1404
1405static void
1406internal_function
1407re_node_set_remove_at (re_node_set *set, int idx)
1408{
1409  if (idx < 0 || idx >= set->nelem)
1410    return;
1411  --set->nelem;
1412  for (; idx < set->nelem; idx++)
1413    set->elems[idx] = set->elems[idx + 1];
1414}
1415\f
1416
1417/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1418   Or return -1, if an error has occurred.  */
1419
1420static int
1421internal_function
1422re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1423{
1424  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1425    {
1426      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1427      int *new_nexts, *new_indices;
1428      re_node_set *new_edests, *new_eclosures;
1429      re_token_t *new_nodes;
1430
1431      /* Avoid overflows in realloc.  */
1432      const size_t max_object_size = MAX (sizeof (re_token_t),
1433                                          MAX (sizeof (re_node_set),
1434                                               sizeof (int)));
1435      if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1436        return -1;
1437
1438      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1439      if (BE (new_nodes == NULL, 0))
1440        return -1;
1441      dfa->nodes = new_nodes;
1442      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1443      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1444      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1445      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1446      if (BE (new_nexts == NULL || new_indices == NULL
1447              || new_edests == NULL || new_eclosures == NULL, 0))
1448        return -1;
1449      dfa->nexts = new_nexts;
1450      dfa->org_indices = new_indices;
1451      dfa->edests = new_edests;
1452      dfa->eclosures = new_eclosures;
1453      dfa->nodes_alloc = new_nodes_alloc;
1454    }
1455  dfa->nodes[dfa->nodes_len] = token;
1456  dfa->nodes[dfa->nodes_len].constraint = 0;
1457#ifdef RE_ENABLE_I18N
1458  dfa->nodes[dfa->nodes_len].accept_mb =
1459    (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1460#endif
1461  dfa->nexts[dfa->nodes_len] = -1;
1462  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1463  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1464  return dfa->nodes_len++;
1465}
1466
1467static inline unsigned int
1468internal_function
1469calc_state_hash (const re_node_set *nodes, unsigned int context)
1470{
1471  unsigned int hash = nodes->nelem + context;
1472  int i;
1473  for (i = 0 ; i < nodes->nelem ; i++)
1474    hash += nodes->elems[i];
1475  return hash;
1476}
1477
1478/* Search for the state whose node_set is equivalent to NODES.
1479   Return the pointer to the state, if we found it in the DFA.
1480   Otherwise create the new one and return it.  In case of an error
1481   return NULL and set the error code in ERR.
1482   Note: - We assume NULL as the invalid state, then it is possible that
1483           return value is NULL and ERR is REG_NOERROR.
1484         - We never return non-NULL value in case of any errors, it is for
1485           optimization.  */
1486
1487static re_dfastate_t *
1488internal_function
1489re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1490                  const re_node_set *nodes)
1491{
1492  unsigned int hash;
1493  re_dfastate_t *new_state;
1494  struct re_state_table_entry *spot;
1495  int i;
1496  if (BE (nodes->nelem == 0, 0))
1497    {
1498      *err = REG_NOERROR;
1499      return NULL;
1500    }
1501  hash = calc_state_hash (nodes, 0);
1502  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1503
1504  for (i = 0 ; i < spot->num ; i++)
1505    {
1506      re_dfastate_t *state = spot->array[i];
1507      if (hash != state->hash)
1508        continue;
1509      if (re_node_set_compare (&state->nodes, nodes))
1510        return state;
1511    }
1512
1513  /* There are no appropriate state in the dfa, create the new one.  */
1514  new_state = create_ci_newstate (dfa, nodes, hash);
1515  if (BE (new_state == NULL, 0))
1516    *err = REG_ESPACE;
1517
1518  return new_state;
1519}
1520
1521/* Search for the state whose node_set is equivalent to NODES and
1522   whose context is equivalent to CONTEXT.
1523   Return the pointer to the state, if we found it in the DFA.
1524   Otherwise create the new one and return it.  In case of an error
1525   return NULL and set the error code in ERR.
1526   Note: - We assume NULL as the invalid state, then it is possible that
1527           return value is NULL and ERR is REG_NOERROR.
1528         - We never return non-NULL value in case of any errors, it is for
1529           optimization.  */
1530
1531static re_dfastate_t *
1532internal_function
1533re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534                          const re_node_set *nodes, unsigned int context)
1535{
1536  unsigned int hash;
1537  re_dfastate_t *new_state;
1538  struct re_state_table_entry *spot;
1539  int i;
1540  if (nodes->nelem == 0)
1541    {
1542      *err = REG_NOERROR;
1543      return NULL;
1544    }
1545  hash = calc_state_hash (nodes, context);
1546  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548  for (i = 0 ; i < spot->num ; i++)
1549    {
1550      re_dfastate_t *state = spot->array[i];
1551      if (state->hash == hash
1552          && state->context == context
1553          && re_node_set_compare (state->entrance_nodes, nodes))
1554        return state;
1555    }
1556  /* There are no appropriate state in `dfa', create the new one.  */
1557  new_state = create_cd_newstate (dfa, nodes, context, hash);
1558  if (BE (new_state == NULL, 0))
1559    *err = REG_ESPACE;
1560
1561  return new_state;
1562}
1563
1564/* Finish initialization of the new state NEWSTATE, and using its hash value
1565   HASH put in the appropriate bucket of DFA's state table.  Return value
1566   indicates the error code if failed.  */
1567
1568static reg_errcode_t
1569register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570                unsigned int hash)
1571{
1572  struct re_state_table_entry *spot;
1573  reg_errcode_t err;
1574  int i;
1575
1576  newstate->hash = hash;
1577  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578  if (BE (err != REG_NOERROR, 0))
1579    return REG_ESPACE;
1580  for (i = 0; i < newstate->nodes.nelem; i++)
1581    {
1582      int elem = newstate->nodes.elems[i];
1583      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584        if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1585          return REG_ESPACE;
1586    }
1587
1588  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589  if (BE (spot->alloc <= spot->num, 0))
1590    {
1591      int new_alloc = 2 * spot->num + 2;
1592      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593                                              new_alloc);
1594      if (BE (new_array == NULL, 0))
1595        return REG_ESPACE;
1596      spot->array = new_array;
1597      spot->alloc = new_alloc;
1598    }
1599  spot->array[spot->num++] = newstate;
1600  return REG_NOERROR;
1601}
1602
1603static void
1604free_state (re_dfastate_t *state)
1605{
1606  re_node_set_free (&state->non_eps_nodes);
1607  re_node_set_free (&state->inveclosure);
1608  if (state->entrance_nodes != &state->nodes)
1609    {
1610      re_node_set_free (state->entrance_nodes);
1611      re_free (state->entrance_nodes);
1612    }
1613  re_node_set_free (&state->nodes);
1614  re_free (state->word_trtable);
1615  re_free (state->trtable);
1616  re_free (state);
1617}
1618
1619/* Create the new state which is independ of contexts.
1620   Return the new state if succeeded, otherwise return NULL.  */
1621
1622static re_dfastate_t *
1623internal_function
1624create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625                    unsigned int hash)
1626{
1627  int i;
1628  reg_errcode_t err;
1629  re_dfastate_t *newstate;
1630
1631  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632  if (BE (newstate == NULL, 0))
1633    return NULL;
1634  err = re_node_set_init_copy (&newstate->nodes, nodes);
1635  if (BE (err != REG_NOERROR, 0))
1636    {
1637      re_free (newstate);
1638      return NULL;
1639    }
1640
1641  newstate->entrance_nodes = &newstate->nodes;
1642  for (i = 0 ; i < nodes->nelem ; i++)
1643    {
1644      re_token_t *node = dfa->nodes + nodes->elems[i];
1645      re_token_type_t type = node->type;
1646      if (type == CHARACTER && !node->constraint)
1647        continue;
1648#ifdef RE_ENABLE_I18N
1649      newstate->accept_mb |= node->accept_mb;
1650#endif /* RE_ENABLE_I18N */
1651
1652      /* If the state has the halt node, the state is a halt state.  */
1653      if (type == END_OF_RE)
1654        newstate->halt = 1;
1655      else if (type == OP_BACK_REF)
1656        newstate->has_backref = 1;
1657      else if (type == ANCHOR || node->constraint)
1658        newstate->has_constraint = 1;
1659    }
1660  err = register_state (dfa, newstate, hash);
1661  if (BE (err != REG_NOERROR, 0))
1662    {
1663      free_state (newstate);
1664      newstate = NULL;
1665    }
1666  return newstate;
1667}
1668
1669/* Create the new state which is depend on the context CONTEXT.
1670   Return the new state if succeeded, otherwise return NULL.  */
1671
1672static re_dfastate_t *
1673internal_function
1674create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675                    unsigned int context, unsigned int hash)
1676{
1677  int i, nctx_nodes = 0;
1678  reg_errcode_t err;
1679  re_dfastate_t *newstate;
1680
1681  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682  if (BE (newstate == NULL, 0))
1683    return NULL;
1684  err = re_node_set_init_copy (&newstate->nodes, nodes);
1685  if (BE (err != REG_NOERROR, 0))
1686    {
1687      re_free (newstate);
1688      return NULL;
1689    }
1690
1691  newstate->context = context;
1692  newstate->entrance_nodes = &newstate->nodes;
1693
1694  for (i = 0 ; i < nodes->nelem ; i++)
1695    {
1696      re_token_t *node = dfa->nodes + nodes->elems[i];
1697      re_token_type_t type = node->type;
1698      unsigned int constraint = node->constraint;
1699
1700      if (type == CHARACTER && !constraint)
1701        continue;
1702#ifdef RE_ENABLE_I18N
1703      newstate->accept_mb |= node->accept_mb;
1704#endif /* RE_ENABLE_I18N */
1705
1706      /* If the state has the halt node, the state is a halt state.  */
1707      if (type == END_OF_RE)
1708        newstate->halt = 1;
1709      else if (type == OP_BACK_REF)
1710        newstate->has_backref = 1;
1711
1712      if (constraint)
1713        {
1714          if (newstate->entrance_nodes == &newstate->nodes)
1715            {
1716              newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717              if (BE (newstate->entrance_nodes == NULL, 0))
1718                {
1719                  free_state (newstate);
1720                  return NULL;
1721                }
1722              if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723                  != REG_NOERROR)
1724                return NULL;
1725              nctx_nodes = 0;
1726              newstate->has_constraint = 1;
1727            }
1728
1729          if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1730            {
1731              re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1732              ++nctx_nodes;
1733            }
1734        }
1735    }
1736  err = register_state (dfa, newstate, hash);
1737  if (BE (err != REG_NOERROR, 0))
1738    {
1739      free_state (newstate);
1740      newstate = NULL;
1741    }
1742  return  newstate;
1743}