1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20static void re_string_construct_common (const char *str, int len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 unsigned int hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 unsigned int hash) internal_function;
31
32#ifdef GAWK
33#undef MAX /* safety */
34static int
35MAX(size_t a, size_t b)
36{
37 return (a > b ? a : b);
38}
39#endif
40\f
41/* Functions for string operation. */
42
43/* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
45
46static reg_errcode_t
47internal_function
48re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
49 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
50{
51 reg_errcode_t ret;
52 int init_buf_len;
53
54 /* Ensure at least one character fits into the buffers. */
55 if (init_len < dfa->mb_cur_max)
56 init_len = dfa->mb_cur_max;
57 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
58 re_string_construct_common (str, len, pstr, trans, icase, dfa);
59
60 ret = re_string_realloc_buffers (pstr, init_buf_len);
61 if (BE (ret != REG_NOERROR, 0))
62 return ret;
63
64 pstr->word_char = dfa->word_char;
65 pstr->word_ops_used = dfa->word_ops_used;
66 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
67 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
68 pstr->valid_raw_len = pstr->valid_len;
69 return REG_NOERROR;
70}
71
72/* This function allocate the buffers, and initialize them. */
73
74static reg_errcode_t
75internal_function
76re_string_construct (re_string_t *pstr, const char *str, int len,
77 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
78{
79 reg_errcode_t ret;
80 memset (pstr, '\0', sizeof (re_string_t));
81 re_string_construct_common (str, len, pstr, trans, icase, dfa);
82
83 if (len > 0)
84 {
85 ret = re_string_realloc_buffers (pstr, len + 1);
86 if (BE (ret != REG_NOERROR, 0))
87 return ret;
88 }
89 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
90
91 if (icase)
92 {
93#ifdef RE_ENABLE_I18N
94 if (dfa->mb_cur_max > 1)
95 {
96 while (1)
97 {
98 ret = build_wcs_upper_buffer (pstr);
99 if (BE (ret != REG_NOERROR, 0))
100 return ret;
101 if (pstr->valid_raw_len >= len)
102 break;
103 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
104 break;
105 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
106 if (BE (ret != REG_NOERROR, 0))
107 return ret;
108 }
109 }
110 else
111#endif /* RE_ENABLE_I18N */
112 build_upper_buffer (pstr);
113 }
114 else
115 {
116#ifdef RE_ENABLE_I18N
117 if (dfa->mb_cur_max > 1)
118 build_wcs_buffer (pstr);
119 else
120#endif /* RE_ENABLE_I18N */
121 {
122 if (trans != NULL)
123 re_string_translate_buffer (pstr);
124 else
125 {
126 pstr->valid_len = pstr->bufs_len;
127 pstr->valid_raw_len = pstr->bufs_len;
128 }
129 }
130 }
131
132 return REG_NOERROR;
133}
134
135/* Helper functions for re_string_allocate, and re_string_construct. */
136
137static reg_errcode_t
138internal_function
139re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
140{
141#ifdef RE_ENABLE_I18N
142 if (pstr->mb_cur_max > 1)
143 {
144 wint_t *new_wcs;
145
146 /* Avoid overflow in realloc. */
147 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
148 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
149 return REG_ESPACE;
150
151 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
152 if (BE (new_wcs == NULL, 0))
153 return REG_ESPACE;
154 pstr->wcs = new_wcs;
155 if (pstr->offsets != NULL)
156 {
157 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
158 if (BE (new_offsets == NULL, 0))
159 return REG_ESPACE;
160 pstr->offsets = new_offsets;
161 }
162 }
163#endif /* RE_ENABLE_I18N */
164 if (pstr->mbs_allocated)
165 {
166 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
167 new_buf_len);
168 if (BE (new_mbs == NULL, 0))
169 return REG_ESPACE;
170 pstr->mbs = new_mbs;
171 }
172 pstr->bufs_len = new_buf_len;
173 return REG_NOERROR;
174}
175
176
177static void
178internal_function
179re_string_construct_common (const char *str, int len, re_string_t *pstr,
180 RE_TRANSLATE_TYPE trans, int icase,
181 const re_dfa_t *dfa)
182{
183 pstr->raw_mbs = (const unsigned char *) str;
184 pstr->len = len;
185 pstr->raw_len = len;
186 pstr->trans = trans;
187 pstr->icase = icase ? 1 : 0;
188 pstr->mbs_allocated = (trans != NULL || icase);
189 pstr->mb_cur_max = dfa->mb_cur_max;
190 pstr->is_utf8 = dfa->is_utf8;
191 pstr->map_notascii = dfa->map_notascii;
192 pstr->stop = pstr->len;
193 pstr->raw_stop = pstr->stop;
194}
195
196#ifdef RE_ENABLE_I18N
197
198/* Build wide character buffer PSTR->WCS.
199 If the byte sequence of the string are:
200 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201 Then wide character buffer will be:
202 <wc1> , WEOF , <wc2> , WEOF , <wc3>
203 We use WEOF for padding, they indicate that the position isn't
204 a first byte of a multibyte character.
205
206 Note that this function assumes PSTR->VALID_LEN elements are already
207 built and starts from PSTR->VALID_LEN. */
208
209static void
210internal_function
211build_wcs_buffer (re_string_t *pstr)
212{
213#ifdef _LIBC
214 unsigned char buf[MB_LEN_MAX];
215 assert (MB_LEN_MAX >= pstr->mb_cur_max);
216#else
217 unsigned char buf[64];
218#endif
219 mbstate_t prev_st;
220 int byte_idx, end_idx, remain_len;
221 size_t mbclen;
222
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227 {
228 wchar_t wc;
229 const char *p;
230
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
233 /* Apply the translation if we need. */
234 if (BE (pstr->trans != NULL, 0))
235 {
236 int i, ch;
237
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 {
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 }
243 p = (const char *) buf;
244 }
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248 if (BE (mbclen == (size_t) -2, 0))
249 {
250 /* The buffer doesn't have enough space, finish to build. */
251 pstr->cur_state = prev_st;
252 break;
253 }
254 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
255 {
256 /* We treat these cases as a singlebyte character. */
257 mbclen = 1;
258 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
259 if (BE (pstr->trans != NULL, 0))
260 wc = pstr->trans[wc];
261 pstr->cur_state = prev_st;
262 }
263
264 /* Write wide character and padding. */
265 pstr->wcs[byte_idx++] = wc;
266 /* Write paddings. */
267 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
268 pstr->wcs[byte_idx++] = WEOF;
269 }
270 pstr->valid_len = byte_idx;
271 pstr->valid_raw_len = byte_idx;
272}
273
274/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275 but for REG_ICASE. */
276
277static reg_errcode_t
278internal_function
279build_wcs_upper_buffer (re_string_t *pstr)
280{
281 mbstate_t prev_st;
282 int src_idx, byte_idx, end_idx, remain_len;
283 size_t mbclen;
284#ifdef _LIBC
285 char buf[MB_LEN_MAX];
286 assert (MB_LEN_MAX >= pstr->mb_cur_max);
287#else
288 char buf[64];
289#endif
290
291 byte_idx = pstr->valid_len;
292 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
296 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297 {
298 while (byte_idx < end_idx)
299 {
300 wchar_t wc;
301
302 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 && mbsinit (&pstr->cur_state))
304 {
305 /* In case of a singlebyte character. */
306 pstr->mbs[byte_idx]
307 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 ++byte_idx;
312 continue;
313 }
314
315 remain_len = end_idx - byte_idx;
316 prev_st = pstr->cur_state;
317 mbclen = __mbrtowc (&wc,
318 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 + byte_idx), remain_len, &pstr->cur_state);
320 if (BE (mbclen + 2 > 2, 1))
321 {
322 wchar_t wcu = wc;
323 if (iswlower (wc))
324 {
325 size_t mbcdlen;
326
327 wcu = towupper (wc);
328 mbcdlen = wcrtomb (buf, wcu, &prev_st);
329 if (BE (mbclen == mbcdlen, 1))
330 memcpy (pstr->mbs + byte_idx, buf, mbclen);
331 else
332 {
333 src_idx = byte_idx;
334 goto offsets_needed;
335 }
336 }
337 else
338 memcpy (pstr->mbs + byte_idx,
339 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
340 pstr->wcs[byte_idx++] = wcu;
341 /* Write paddings. */
342 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
343 pstr->wcs[byte_idx++] = WEOF;
344 }
345 else if (mbclen == (size_t) -1 || mbclen == 0)
346 {
347 /* It is an invalid character or '\0'. Just use the byte. */
348 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
349 pstr->mbs[byte_idx] = ch;
350 /* And also cast it to wide char. */
351 pstr->wcs[byte_idx++] = (wchar_t) ch;
352 if (BE (mbclen == (size_t) -1, 0))
353 pstr->cur_state = prev_st;
354 }
355 else
356 {
357 /* The buffer doesn't have enough space, finish to build. */
358 pstr->cur_state = prev_st;
359 break;
360 }
361 }
362 pstr->valid_len = byte_idx;
363 pstr->valid_raw_len = byte_idx;
364 return REG_NOERROR;
365 }
366 else
367 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
368 {
369 wchar_t wc;
370 const char *p;
371 offsets_needed:
372 remain_len = end_idx - byte_idx;
373 prev_st = pstr->cur_state;
374 if (BE (pstr->trans != NULL, 0))
375 {
376 int i, ch;
377
378 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
379 {
380 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
381 buf[i] = pstr->trans[ch];
382 }
383 p = (const char *) buf;
384 }
385 else
386 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
387 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
388 if (BE (mbclen + 2 > 2, 1))
389 {
390 wchar_t wcu = wc;
391 if (iswlower (wc))
392 {
393 size_t mbcdlen;
394
395 wcu = towupper (wc);
396 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
397 if (BE (mbclen == mbcdlen, 1))
398 memcpy (pstr->mbs + byte_idx, buf, mbclen);
399 else if (mbcdlen != (size_t) -1)
400 {
401 size_t i;
402
403 if (byte_idx + mbcdlen > pstr->bufs_len)
404 {
405 pstr->cur_state = prev_st;
406 break;
407 }
408
409 if (pstr->offsets == NULL)
410 {
411 pstr->offsets = re_malloc (int, pstr->bufs_len);
412
413 if (pstr->offsets == NULL)
414 return REG_ESPACE;
415 }
416 if (!pstr->offsets_needed)
417 {
418 for (i = 0; i < (size_t) byte_idx; ++i)
419 pstr->offsets[i] = i;
420 pstr->offsets_needed = 1;
421 }
422
423 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 pstr->wcs[byte_idx] = wcu;
425 pstr->offsets[byte_idx] = src_idx;
426 for (i = 1; i < mbcdlen; ++i)
427 {
428 pstr->offsets[byte_idx + i]
429 = src_idx + (i < mbclen ? i : mbclen - 1);
430 pstr->wcs[byte_idx + i] = WEOF;
431 }
432 pstr->len += mbcdlen - mbclen;
433 if (pstr->raw_stop > src_idx)
434 pstr->stop += mbcdlen - mbclen;
435 end_idx = (pstr->bufs_len > pstr->len)
436 ? pstr->len : pstr->bufs_len;
437 byte_idx += mbcdlen;
438 src_idx += mbclen;
439 continue;
440 }
441 else
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
443 }
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
446
447 if (BE (pstr->offsets_needed != 0, 0))
448 {
449 size_t i;
450 for (i = 0; i < mbclen; ++i)
451 pstr->offsets[byte_idx + i] = src_idx + i;
452 }
453 src_idx += mbclen;
454
455 pstr->wcs[byte_idx++] = wcu;
456 /* Write paddings. */
457 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 pstr->wcs[byte_idx++] = WEOF;
459 }
460 else if (mbclen == (size_t) -1 || mbclen == 0)
461 {
462 /* It is an invalid character or '\0'. Just use the byte. */
463 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
464
465 if (BE (pstr->trans != NULL, 0))
466 ch = pstr->trans [ch];
467 pstr->mbs[byte_idx] = ch;
468
469 if (BE (pstr->offsets_needed != 0, 0))
470 pstr->offsets[byte_idx] = src_idx;
471 ++src_idx;
472
473 /* And also cast it to wide char. */
474 pstr->wcs[byte_idx++] = (wchar_t) ch;
475 if (BE (mbclen == (size_t) -1, 0))
476 pstr->cur_state = prev_st;
477 }
478 else
479 {
480 /* The buffer doesn't have enough space, finish to build. */
481 pstr->cur_state = prev_st;
482 break;
483 }
484 }
485 pstr->valid_len = byte_idx;
486 pstr->valid_raw_len = src_idx;
487 return REG_NOERROR;
488}
489
490/* Skip characters until the index becomes greater than NEW_RAW_IDX.
491 Return the index. */
492
493static int
494internal_function
495re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
496{
497 mbstate_t prev_st;
498 int rawbuf_idx;
499 size_t mbclen;
500 wint_t wc = WEOF;
501
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504 rawbuf_idx < new_raw_idx;)
505 {
506 wchar_t wc2;
507 int remain_len = pstr->len - rawbuf_idx;
508 prev_st = pstr->cur_state;
509 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510 remain_len, &pstr->cur_state);
511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
512 {
513 /* We treat these cases as a single byte character. */
514 if (mbclen == 0 || remain_len == 0)
515 wc = L'\0';
516 else
517 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
518 mbclen = 1;
519 pstr->cur_state = prev_st;
520 }
521 else
522 wc = (wint_t) wc2;
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
525 }
526 *last_wc = (wint_t) wc;
527 return rawbuf_idx;
528}
529#endif /* RE_ENABLE_I18N */
530
531/* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
533
534static void
535internal_function
536build_upper_buffer (re_string_t *pstr)
537{
538 int char_idx, end_idx;
539 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
540
541 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
542 {
543 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544 if (BE (pstr->trans != NULL, 0))
545 ch = pstr->trans[ch];
546 if (islower (ch))
547 pstr->mbs[char_idx] = toupper (ch);
548 else
549 pstr->mbs[char_idx] = ch;
550 }
551 pstr->valid_len = char_idx;
552 pstr->valid_raw_len = char_idx;
553}
554
555/* Apply TRANS to the buffer in PSTR. */
556
557static void
558internal_function
559re_string_translate_buffer (re_string_t *pstr)
560{
561 int buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565 {
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
568 }
569
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
572}
573
574/* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
577
578static reg_errcode_t
579internal_function
580re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
581{
582 int offset = idx - pstr->raw_mbs_idx;
583 if (BE (offset < 0, 0))
584 {
585 /* Reset buffer. */
586#ifdef RE_ENABLE_I18N
587 if (pstr->mb_cur_max > 1)
588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589#endif /* RE_ENABLE_I18N */
590 pstr->len = pstr->raw_len;
591 pstr->stop = pstr->raw_stop;
592 pstr->valid_len = 0;
593 pstr->raw_mbs_idx = 0;
594 pstr->valid_raw_len = 0;
595 pstr->offsets_needed = 0;
596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598 if (!pstr->mbs_allocated)
599 pstr->mbs = (unsigned char *) pstr->raw_mbs;
600 offset = idx;
601 }
602
603 if (BE (offset != 0, 1))
604 {
605 /* Should the already checked characters be kept? */
606 if (BE (offset < pstr->valid_raw_len, 1))
607 {
608 /* Yes, move them to the front of the buffer. */
609#ifdef RE_ENABLE_I18N
610 if (BE (pstr->offsets_needed, 0))
611 {
612 int low = 0, high = pstr->valid_len, mid;
613 do
614 {
615 mid = low + (high - low) / 2;
616 if (pstr->offsets[mid] > offset)
617 high = mid;
618 else if (pstr->offsets[mid] < offset)
619 low = mid + 1;
620 else
621 break;
622 }
623 while (low < high);
624 if (pstr->offsets[mid] < offset)
625 ++mid;
626 pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 eflags);
628 /* This can be quite complicated, so handle specially
629 only the common and easy case where the character with
630 different length representation of lower and upper
631 case is present at or after offset. */
632 if (pstr->valid_len > offset
633 && mid == offset && pstr->offsets[mid] == offset)
634 {
635 memmove (pstr->wcs, pstr->wcs + offset,
636 (pstr->valid_len - offset) * sizeof (wint_t));
637 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 pstr->valid_len -= offset;
639 pstr->valid_raw_len -= offset;
640 for (low = 0; low < pstr->valid_len; low++)
641 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642 }
643 else
644 {
645 /* Otherwise, just find out how long the partial multibyte
646 character at offset is and fill it with WEOF/255. */
647 pstr->len = pstr->raw_len - idx + offset;
648 pstr->stop = pstr->raw_stop - idx + offset;
649 pstr->offsets_needed = 0;
650 while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 --mid;
652 while (mid < pstr->valid_len)
653 if (pstr->wcs[mid] != WEOF)
654 break;
655 else
656 ++mid;
657 if (mid == pstr->valid_len)
658 pstr->valid_len = 0;
659 else
660 {
661 pstr->valid_len = pstr->offsets[mid] - offset;
662 if (pstr->valid_len)
663 {
664 for (low = 0; low < pstr->valid_len; ++low)
665 pstr->wcs[low] = WEOF;
666 memset (pstr->mbs, 255, pstr->valid_len);
667 }
668 }
669 pstr->valid_raw_len = pstr->valid_len;
670 }
671 }
672 else
673#endif
674 {
675 pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 eflags);
677#ifdef RE_ENABLE_I18N
678 if (pstr->mb_cur_max > 1)
679 memmove (pstr->wcs, pstr->wcs + offset,
680 (pstr->valid_len - offset) * sizeof (wint_t));
681#endif /* RE_ENABLE_I18N */
682 if (BE (pstr->mbs_allocated, 0))
683 memmove (pstr->mbs, pstr->mbs + offset,
684 pstr->valid_len - offset);
685 pstr->valid_len -= offset;
686 pstr->valid_raw_len -= offset;
687#if DEBUG
688 assert (pstr->valid_len > 0);
689#endif
690 }
691 }
692 else
693 {
694#ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 int prev_valid_len = pstr->valid_len;
697
698 if (BE (pstr->offsets_needed, 0))
699 {
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
703 }
704#endif
705 pstr->valid_len = 0;
706#ifdef RE_ENABLE_I18N
707 if (pstr->mb_cur_max > 1)
708 {
709 int wcs_idx;
710 wint_t wc = WEOF;
711
712 if (pstr->is_utf8)
713 {
714 const unsigned char *raw, *p, *end;
715
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 end = raw + (offset - pstr->mb_cur_max);
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
722 p = raw + offset - 1;
723#ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p) && BE (pstr->trans == NULL, 1))
727 {
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
730 wc = (wchar_t) *p;
731 }
732 else
733#endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
736 {
737 mbstate_t cur_state;
738 wchar_t wc2;
739 int mlen = raw + pstr->len - p;
740 unsigned char buf[6];
741 size_t mbclen;
742
743 if (BE (pstr->trans != NULL, 0))
744 {
745 int i = mlen < 6 ? mlen : 6;
746 while (--i >= 0)
747 buf[i] = pstr->trans[p[i]];
748 }
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state, 0, sizeof (cur_state));
752 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
753 &cur_state);
754 if (raw + offset - p <= mbclen
755 && mbclen < (size_t) -2)
756 {
757 memset (&pstr->cur_state, '\0',
758 sizeof (mbstate_t));
759 pstr->valid_len = mbclen - (raw + offset - p);
760 wc = wc2;
761 }
762 break;
763 }
764 }
765
766 if (wc == WEOF)
767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 if (wc == WEOF)
769 pstr->tip_context
770 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 else
772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 && IS_WIDE_WORD_CHAR (wc))
774 ? CONTEXT_WORD
775 : ((IS_WIDE_NEWLINE (wc)
776 && pstr->newline_anchor)
777 ? CONTEXT_NEWLINE : 0));
778 if (BE (pstr->valid_len, 0))
779 {
780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 pstr->wcs[wcs_idx] = WEOF;
782 if (pstr->mbs_allocated)
783 memset (pstr->mbs, 255, pstr->valid_len);
784 }
785 pstr->valid_raw_len = pstr->valid_len;
786 }
787 else
788#endif /* RE_ENABLE_I18N */
789 {
790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 pstr->valid_raw_len = 0;
792 if (pstr->trans)
793 c = pstr->trans[c];
794 pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 ? CONTEXT_WORD
796 : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 ? CONTEXT_NEWLINE : 0));
798 }
799 }
800 if (!BE (pstr->mbs_allocated, 0))
801 pstr->mbs += offset;
802 }
803 pstr->raw_mbs_idx = idx;
804 pstr->len -= offset;
805 pstr->stop -= offset;
806
807 /* Then build the buffers. */
808#ifdef RE_ENABLE_I18N
809 if (pstr->mb_cur_max > 1)
810 {
811 if (pstr->icase)
812 {
813 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 if (BE (ret != REG_NOERROR, 0))
815 return ret;
816 }
817 else
818 build_wcs_buffer (pstr);
819 }
820 else
821#endif /* RE_ENABLE_I18N */
822 if (BE (pstr->mbs_allocated, 0))
823 {
824 if (pstr->icase)
825 build_upper_buffer (pstr);
826 else if (pstr->trans != NULL)
827 re_string_translate_buffer (pstr);
828 }
829 else
830 pstr->valid_len = pstr->len;
831
832 pstr->cur_idx = 0;
833 return REG_NOERROR;
834}
835
836static unsigned char
837internal_function __attribute ((pure))
838re_string_peek_byte_case (const re_string_t *pstr, int idx)
839{
840 int ch, off;
841
842 /* Handle the common (easiest) cases first. */
843 if (BE (!pstr->mbs_allocated, 1))
844 return re_string_peek_byte (pstr, idx);
845
846#ifdef RE_ENABLE_I18N
847 if (pstr->mb_cur_max > 1
848 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
849 return re_string_peek_byte (pstr, idx);
850#endif
851
852 off = pstr->cur_idx + idx;
853#ifdef RE_ENABLE_I18N
854 if (pstr->offsets_needed)
855 off = pstr->offsets[off];
856#endif
857
858 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
859
860#ifdef RE_ENABLE_I18N
861 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
862 this function returns CAPITAL LETTER I instead of first byte of
863 DOTLESS SMALL LETTER I. The latter would confuse the parser,
864 since peek_byte_case doesn't advance cur_idx in any way. */
865 if (pstr->offsets_needed && !isascii (ch))
866 return re_string_peek_byte (pstr, idx);
867#endif
868
869 return ch;
870}
871
872static unsigned char
873internal_function __attribute ((pure))
874re_string_fetch_byte_case (re_string_t *pstr)
875{
876 if (BE (!pstr->mbs_allocated, 1))
877 return re_string_fetch_byte (pstr);
878
879#ifdef RE_ENABLE_I18N
880 if (pstr->offsets_needed)
881 {
882 int off, ch;
883
884 /* For tr_TR.UTF-8 [[:islower:]] there is
885 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886 in that case the whole multi-byte character and return
887 the original letter. On the other side, with
888 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 anything else would complicate things too much. */
890
891 if (!re_string_first_byte (pstr, pstr->cur_idx))
892 return re_string_fetch_byte (pstr);
893
894 off = pstr->offsets[pstr->cur_idx];
895 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897 if (! isascii (ch))
898 return re_string_fetch_byte (pstr);
899
900 re_string_skip_bytes (pstr,
901 re_string_char_size_at (pstr, pstr->cur_idx));
902 return ch;
903 }
904#endif
905
906 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907}
908
909static void
910internal_function
911re_string_destruct (re_string_t *pstr)
912{
913#ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
915 re_free (pstr->offsets);
916#endif /* RE_ENABLE_I18N */
917 if (pstr->mbs_allocated)
918 re_free (pstr->mbs);
919}
920
921/* Return the context at IDX in INPUT. */
922
923static unsigned int
924internal_function
925re_string_context_at (const re_string_t *input, int idx, int eflags)
926{
927 int c;
928 if (BE (idx < 0, 0))
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input->tip_context;
932 if (BE (idx == input->len, 0))
933 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935#ifdef RE_ENABLE_I18N
936 if (input->mb_cur_max > 1)
937 {
938 wint_t wc;
939 int wc_idx = idx;
940 while(input->wcs[wc_idx] == WEOF)
941 {
942#ifdef DEBUG
943 /* It must not happen. */
944 assert (wc_idx >= 0);
945#endif
946 --wc_idx;
947 if (wc_idx < 0)
948 return input->tip_context;
949 }
950 wc = input->wcs[wc_idx];
951 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 return CONTEXT_WORD;
953 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 ? CONTEXT_NEWLINE : 0);
955 }
956 else
957#endif
958 {
959 c = re_string_byte_at (input, idx);
960 if (bitset_contain (input->word_char, c))
961 return CONTEXT_WORD;
962 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963 }
964}
965\f
966/* Functions for set operation. */
967
968static reg_errcode_t
969internal_function
970re_node_set_alloc (re_node_set *set, int size)
971{
972 /*
973 * ADR: valgrind says size can be 0, which then doesn't
974 * free the block of size 0. Harumph. This seems
975 * to work ok, though.
976 */
977 if (size == 0)
978 {
979 memset(set, 0, sizeof(*set));
980 return REG_NOERROR;
981 }
982 set->alloc = size;
983 set->nelem = 0;
984 set->elems = re_malloc (int, size);
985 if (BE (set->elems == NULL, 0))
986 return REG_ESPACE;
987 return REG_NOERROR;
988}
989
990static reg_errcode_t
991internal_function
992re_node_set_init_1 (re_node_set *set, int elem)
993{
994 set->alloc = 1;
995 set->nelem = 1;
996 set->elems = re_malloc (int, 1);
997 if (BE (set->elems == NULL, 0))
998 {
999 set->alloc = set->nelem = 0;
1000 return REG_ESPACE;
1001 }
1002 set->elems[0] = elem;
1003 return REG_NOERROR;
1004}
1005
1006static reg_errcode_t
1007internal_function
1008re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1009{
1010 set->alloc = 2;
1011 set->elems = re_malloc (int, 2);
1012 if (BE (set->elems == NULL, 0))
1013 return REG_ESPACE;
1014 if (elem1 == elem2)
1015 {
1016 set->nelem = 1;
1017 set->elems[0] = elem1;
1018 }
1019 else
1020 {
1021 set->nelem = 2;
1022 if (elem1 < elem2)
1023 {
1024 set->elems[0] = elem1;
1025 set->elems[1] = elem2;
1026 }
1027 else
1028 {
1029 set->elems[0] = elem2;
1030 set->elems[1] = elem1;
1031 }
1032 }
1033 return REG_NOERROR;
1034}
1035
1036static reg_errcode_t
1037internal_function
1038re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1039{
1040 dest->nelem = src->nelem;
1041 if (src->nelem > 0)
1042 {
1043 dest->alloc = dest->nelem;
1044 dest->elems = re_malloc (int, dest->alloc);
1045 if (BE (dest->elems == NULL, 0))
1046 {
1047 dest->alloc = dest->nelem = 0;
1048 return REG_ESPACE;
1049 }
1050 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1051 }
1052 else
1053 re_node_set_init_empty (dest);
1054 return REG_NOERROR;
1055}
1056
1057/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1058 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1059 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1060
1061static reg_errcode_t
1062internal_function
1063re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1064 const re_node_set *src2)
1065{
1066 int i1, i2, is, id, delta, sbase;
1067 if (src1->nelem == 0 || src2->nelem == 0)
1068 return REG_NOERROR;
1069
1070 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1071 conservative estimate. */
1072 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1073 {
1074 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1075 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1076 if (BE (new_elems == NULL, 0))
1077 return REG_ESPACE;
1078 dest->elems = new_elems;
1079 dest->alloc = new_alloc;
1080 }
1081
1082 /* Find the items in the intersection of SRC1 and SRC2, and copy
1083 into the top of DEST those that are not already in DEST itself. */
1084 sbase = dest->nelem + src1->nelem + src2->nelem;
1085 i1 = src1->nelem - 1;
1086 i2 = src2->nelem - 1;
1087 id = dest->nelem - 1;
1088 for (;;)
1089 {
1090 if (src1->elems[i1] == src2->elems[i2])
1091 {
1092 /* Try to find the item in DEST. Maybe we could binary search? */
1093 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1094 --id;
1095
1096 if (id < 0 || dest->elems[id] != src1->elems[i1])
1097 dest->elems[--sbase] = src1->elems[i1];
1098
1099 if (--i1 < 0 || --i2 < 0)
1100 break;
1101 }
1102
1103 /* Lower the highest of the two items. */
1104 else if (src1->elems[i1] < src2->elems[i2])
1105 {
1106 if (--i2 < 0)
1107 break;
1108 }
1109 else
1110 {
1111 if (--i1 < 0)
1112 break;
1113 }
1114 }
1115
1116 id = dest->nelem - 1;
1117 is = dest->nelem + src1->nelem + src2->nelem - 1;
1118 delta = is - sbase + 1;
1119
1120 /* Now copy. When DELTA becomes zero, the remaining
1121 DEST elements are already in place; this is more or
1122 less the same loop that is in re_node_set_merge. */
1123 dest->nelem += delta;
1124 if (delta > 0 && id >= 0)
1125 for (;;)
1126 {
1127 if (dest->elems[is] > dest->elems[id])
1128 {
1129 /* Copy from the top. */
1130 dest->elems[id + delta--] = dest->elems[is--];
1131 if (delta == 0)
1132 break;
1133 }
1134 else
1135 {
1136 /* Slide from the bottom. */
1137 dest->elems[id + delta] = dest->elems[id];
1138 if (--id < 0)
1139 break;
1140 }
1141 }
1142
1143 /* Copy remaining SRC elements. */
1144 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1145
1146 return REG_NOERROR;
1147}
1148
1149/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1150 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1151
1152static reg_errcode_t
1153internal_function
1154re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1155 const re_node_set *src2)
1156{
1157 int i1, i2, id;
1158 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1159 {
1160 dest->alloc = src1->nelem + src2->nelem;
1161 dest->elems = re_malloc (int, dest->alloc);
1162 if (BE (dest->elems == NULL, 0))
1163 return REG_ESPACE;
1164 }
1165 else
1166 {
1167 if (src1 != NULL && src1->nelem > 0)
1168 return re_node_set_init_copy (dest, src1);
1169 else if (src2 != NULL && src2->nelem > 0)
1170 return re_node_set_init_copy (dest, src2);
1171 else
1172 re_node_set_init_empty (dest);
1173 return REG_NOERROR;
1174 }
1175 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1176 {
1177 if (src1->elems[i1] > src2->elems[i2])
1178 {
1179 dest->elems[id++] = src2->elems[i2++];
1180 continue;
1181 }
1182 if (src1->elems[i1] == src2->elems[i2])
1183 ++i2;
1184 dest->elems[id++] = src1->elems[i1++];
1185 }
1186 if (i1 < src1->nelem)
1187 {
1188 memcpy (dest->elems + id, src1->elems + i1,
1189 (src1->nelem - i1) * sizeof (int));
1190 id += src1->nelem - i1;
1191 }
1192 else if (i2 < src2->nelem)
1193 {
1194 memcpy (dest->elems + id, src2->elems + i2,
1195 (src2->nelem - i2) * sizeof (int));
1196 id += src2->nelem - i2;
1197 }
1198 dest->nelem = id;
1199 return REG_NOERROR;
1200}
1201
1202/* Calculate the union set of the sets DEST and SRC. And store it to
1203 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1204
1205static reg_errcode_t
1206internal_function
1207re_node_set_merge (re_node_set *dest, const re_node_set *src)
1208{
1209 int is, id, sbase, delta;
1210 if (src == NULL || src->nelem == 0)
1211 return REG_NOERROR;
1212 if (dest->alloc < 2 * src->nelem + dest->nelem)
1213 {
1214 int new_alloc = 2 * (src->nelem + dest->alloc);
1215 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1216 if (BE (new_buffer == NULL, 0))
1217 return REG_ESPACE;
1218 dest->elems = new_buffer;
1219 dest->alloc = new_alloc;
1220 }
1221
1222 if (BE (dest->nelem == 0, 0))
1223 {
1224 dest->nelem = src->nelem;
1225 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1226 return REG_NOERROR;
1227 }
1228
1229 /* Copy into the top of DEST the items of SRC that are not
1230 found in DEST. Maybe we could binary search in DEST? */
1231 for (sbase = dest->nelem + 2 * src->nelem,
1232 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1233 {
1234 if (dest->elems[id] == src->elems[is])
1235 is--, id--;
1236 else if (dest->elems[id] < src->elems[is])
1237 dest->elems[--sbase] = src->elems[is--];
1238 else /* if (dest->elems[id] > src->elems[is]) */
1239 --id;
1240 }
1241
1242 if (is >= 0)
1243 {
1244 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1245 sbase -= is + 1;
1246 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1247 }
1248
1249 id = dest->nelem - 1;
1250 is = dest->nelem + 2 * src->nelem - 1;
1251 delta = is - sbase + 1;
1252 if (delta == 0)
1253 return REG_NOERROR;
1254
1255 /* Now copy. When DELTA becomes zero, the remaining
1256 DEST elements are already in place. */
1257 dest->nelem += delta;
1258 for (;;)
1259 {
1260 if (dest->elems[is] > dest->elems[id])
1261 {
1262 /* Copy from the top. */
1263 dest->elems[id + delta--] = dest->elems[is--];
1264 if (delta == 0)
1265 break;
1266 }
1267 else
1268 {
1269 /* Slide from the bottom. */
1270 dest->elems[id + delta] = dest->elems[id];
1271 if (--id < 0)
1272 {
1273 /* Copy remaining SRC elements. */
1274 memcpy (dest->elems, dest->elems + sbase,
1275 delta * sizeof (int));
1276 break;
1277 }
1278 }
1279 }
1280
1281 return REG_NOERROR;
1282}
1283
1284/* Insert the new element ELEM to the re_node_set* SET.
1285 SET should not already have ELEM.
1286 return -1 if an error has occurred, return 1 otherwise. */
1287
1288static int
1289internal_function
1290re_node_set_insert (re_node_set *set, int elem)
1291{
1292 int idx;
1293 /* In case the set is empty. */
1294 if (set->alloc == 0)
1295 {
1296 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1297 return 1;
1298 else
1299 return -1;
1300 }
1301
1302 if (BE (set->nelem, 0) == 0)
1303 {
1304 /* We already guaranteed above that set->alloc != 0. */
1305 set->elems[0] = elem;
1306 ++set->nelem;
1307 return 1;
1308 }
1309
1310 /* Realloc if we need. */
1311 if (set->alloc == set->nelem)
1312 {
1313 int *new_elems;
1314 set->alloc = set->alloc * 2;
1315 new_elems = re_realloc (set->elems, int, set->alloc);
1316 if (BE (new_elems == NULL, 0))
1317 return -1;
1318 set->elems = new_elems;
1319 }
1320
1321 /* Move the elements which follows the new element. Test the
1322 first element separately to skip a check in the inner loop. */
1323 if (elem < set->elems[0])
1324 {
1325 idx = 0;
1326 for (idx = set->nelem; idx > 0; idx--)
1327 set->elems[idx] = set->elems[idx - 1];
1328 }
1329 else
1330 {
1331 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1332 set->elems[idx] = set->elems[idx - 1];
1333 }
1334
1335 /* Insert the new element. */
1336 set->elems[idx] = elem;
1337 ++set->nelem;
1338 return 1;
1339}
1340
1341/* Insert the new element ELEM to the re_node_set* SET.
1342 SET should not already have any element greater than or equal to ELEM.
1343 Return -1 if an error has occurred, return 1 otherwise. */
1344
1345static int
1346internal_function
1347re_node_set_insert_last (re_node_set *set, int elem)
1348{
1349 /* Realloc if we need. */
1350 if (set->alloc == set->nelem)
1351 {
1352 int *new_elems;
1353 set->alloc = (set->alloc + 1) * 2;
1354 new_elems = re_realloc (set->elems, int, set->alloc);
1355 if (BE (new_elems == NULL, 0))
1356 return -1;
1357 set->elems = new_elems;
1358 }
1359
1360 /* Insert the new element. */
1361 set->elems[set->nelem++] = elem;
1362 return 1;
1363}
1364
1365/* Compare two node sets SET1 and SET2.
1366 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1367
1368static int
1369internal_function __attribute ((pure))
1370re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1371{
1372 int i;
1373 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1374 return 0;
1375 for (i = set1->nelem ; --i >= 0 ; )
1376 if (set1->elems[i] != set2->elems[i])
1377 return 0;
1378 return 1;
1379}
1380
1381/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1382
1383static int
1384internal_function __attribute ((pure))
1385re_node_set_contains (const re_node_set *set, int elem)
1386{
1387 unsigned int idx, right, mid;
1388 if (set->nelem <= 0)
1389 return 0;
1390
1391 /* Binary search the element. */
1392 idx = 0;
1393 right = set->nelem - 1;
1394 while (idx < right)
1395 {
1396 mid = idx + (right - idx) / 2;
1397 if (set->elems[mid] < elem)
1398 idx = mid + 1;
1399 else
1400 right = mid;
1401 }
1402 return set->elems[idx] == elem ? idx + 1 : 0;
1403}
1404
1405static void
1406internal_function
1407re_node_set_remove_at (re_node_set *set, int idx)
1408{
1409 if (idx < 0 || idx >= set->nelem)
1410 return;
1411 --set->nelem;
1412 for (; idx < set->nelem; idx++)
1413 set->elems[idx] = set->elems[idx + 1];
1414}
1415\f
1416
1417/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1418 Or return -1, if an error has occurred. */
1419
1420static int
1421internal_function
1422re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1423{
1424 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1425 {
1426 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1427 int *new_nexts, *new_indices;
1428 re_node_set *new_edests, *new_eclosures;
1429 re_token_t *new_nodes;
1430
1431 /* Avoid overflows in realloc. */
1432 const size_t max_object_size = MAX (sizeof (re_token_t),
1433 MAX (sizeof (re_node_set),
1434 sizeof (int)));
1435 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1436 return -1;
1437
1438 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1439 if (BE (new_nodes == NULL, 0))
1440 return -1;
1441 dfa->nodes = new_nodes;
1442 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1443 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1444 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1445 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1446 if (BE (new_nexts == NULL || new_indices == NULL
1447 || new_edests == NULL || new_eclosures == NULL, 0))
1448 return -1;
1449 dfa->nexts = new_nexts;
1450 dfa->org_indices = new_indices;
1451 dfa->edests = new_edests;
1452 dfa->eclosures = new_eclosures;
1453 dfa->nodes_alloc = new_nodes_alloc;
1454 }
1455 dfa->nodes[dfa->nodes_len] = token;
1456 dfa->nodes[dfa->nodes_len].constraint = 0;
1457#ifdef RE_ENABLE_I18N
1458 dfa->nodes[dfa->nodes_len].accept_mb =
1459 (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1460#endif
1461 dfa->nexts[dfa->nodes_len] = -1;
1462 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1463 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1464 return dfa->nodes_len++;
1465}
1466
1467static inline unsigned int
1468internal_function
1469calc_state_hash (const re_node_set *nodes, unsigned int context)
1470{
1471 unsigned int hash = nodes->nelem + context;
1472 int i;
1473 for (i = 0 ; i < nodes->nelem ; i++)
1474 hash += nodes->elems[i];
1475 return hash;
1476}
1477
1478/* Search for the state whose node_set is equivalent to NODES.
1479 Return the pointer to the state, if we found it in the DFA.
1480 Otherwise create the new one and return it. In case of an error
1481 return NULL and set the error code in ERR.
1482 Note: - We assume NULL as the invalid state, then it is possible that
1483 return value is NULL and ERR is REG_NOERROR.
1484 - We never return non-NULL value in case of any errors, it is for
1485 optimization. */
1486
1487static re_dfastate_t *
1488internal_function
1489re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1490 const re_node_set *nodes)
1491{
1492 unsigned int hash;
1493 re_dfastate_t *new_state;
1494 struct re_state_table_entry *spot;
1495 int i;
1496 if (BE (nodes->nelem == 0, 0))
1497 {
1498 *err = REG_NOERROR;
1499 return NULL;
1500 }
1501 hash = calc_state_hash (nodes, 0);
1502 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1503
1504 for (i = 0 ; i < spot->num ; i++)
1505 {
1506 re_dfastate_t *state = spot->array[i];
1507 if (hash != state->hash)
1508 continue;
1509 if (re_node_set_compare (&state->nodes, nodes))
1510 return state;
1511 }
1512
1513 /* There are no appropriate state in the dfa, create the new one. */
1514 new_state = create_ci_newstate (dfa, nodes, hash);
1515 if (BE (new_state == NULL, 0))
1516 *err = REG_ESPACE;
1517
1518 return new_state;
1519}
1520
1521/* Search for the state whose node_set is equivalent to NODES and
1522 whose context is equivalent to CONTEXT.
1523 Return the pointer to the state, if we found it in the DFA.
1524 Otherwise create the new one and return it. In case of an error
1525 return NULL and set the error code in ERR.
1526 Note: - We assume NULL as the invalid state, then it is possible that
1527 return value is NULL and ERR is REG_NOERROR.
1528 - We never return non-NULL value in case of any errors, it is for
1529 optimization. */
1530
1531static re_dfastate_t *
1532internal_function
1533re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534 const re_node_set *nodes, unsigned int context)
1535{
1536 unsigned int hash;
1537 re_dfastate_t *new_state;
1538 struct re_state_table_entry *spot;
1539 int i;
1540 if (nodes->nelem == 0)
1541 {
1542 *err = REG_NOERROR;
1543 return NULL;
1544 }
1545 hash = calc_state_hash (nodes, context);
1546 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548 for (i = 0 ; i < spot->num ; i++)
1549 {
1550 re_dfastate_t *state = spot->array[i];
1551 if (state->hash == hash
1552 && state->context == context
1553 && re_node_set_compare (state->entrance_nodes, nodes))
1554 return state;
1555 }
1556 /* There are no appropriate state in `dfa', create the new one. */
1557 new_state = create_cd_newstate (dfa, nodes, context, hash);
1558 if (BE (new_state == NULL, 0))
1559 *err = REG_ESPACE;
1560
1561 return new_state;
1562}
1563
1564/* Finish initialization of the new state NEWSTATE, and using its hash value
1565 HASH put in the appropriate bucket of DFA's state table. Return value
1566 indicates the error code if failed. */
1567
1568static reg_errcode_t
1569register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570 unsigned int hash)
1571{
1572 struct re_state_table_entry *spot;
1573 reg_errcode_t err;
1574 int i;
1575
1576 newstate->hash = hash;
1577 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578 if (BE (err != REG_NOERROR, 0))
1579 return REG_ESPACE;
1580 for (i = 0; i < newstate->nodes.nelem; i++)
1581 {
1582 int elem = newstate->nodes.elems[i];
1583 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1585 return REG_ESPACE;
1586 }
1587
1588 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589 if (BE (spot->alloc <= spot->num, 0))
1590 {
1591 int new_alloc = 2 * spot->num + 2;
1592 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593 new_alloc);
1594 if (BE (new_array == NULL, 0))
1595 return REG_ESPACE;
1596 spot->array = new_array;
1597 spot->alloc = new_alloc;
1598 }
1599 spot->array[spot->num++] = newstate;
1600 return REG_NOERROR;
1601}
1602
1603static void
1604free_state (re_dfastate_t *state)
1605{
1606 re_node_set_free (&state->non_eps_nodes);
1607 re_node_set_free (&state->inveclosure);
1608 if (state->entrance_nodes != &state->nodes)
1609 {
1610 re_node_set_free (state->entrance_nodes);
1611 re_free (state->entrance_nodes);
1612 }
1613 re_node_set_free (&state->nodes);
1614 re_free (state->word_trtable);
1615 re_free (state->trtable);
1616 re_free (state);
1617}
1618
1619/* Create the new state which is independ of contexts.
1620 Return the new state if succeeded, otherwise return NULL. */
1621
1622static re_dfastate_t *
1623internal_function
1624create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625 unsigned int hash)
1626{
1627 int i;
1628 reg_errcode_t err;
1629 re_dfastate_t *newstate;
1630
1631 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632 if (BE (newstate == NULL, 0))
1633 return NULL;
1634 err = re_node_set_init_copy (&newstate->nodes, nodes);
1635 if (BE (err != REG_NOERROR, 0))
1636 {
1637 re_free (newstate);
1638 return NULL;
1639 }
1640
1641 newstate->entrance_nodes = &newstate->nodes;
1642 for (i = 0 ; i < nodes->nelem ; i++)
1643 {
1644 re_token_t *node = dfa->nodes + nodes->elems[i];
1645 re_token_type_t type = node->type;
1646 if (type == CHARACTER && !node->constraint)
1647 continue;
1648#ifdef RE_ENABLE_I18N
1649 newstate->accept_mb |= node->accept_mb;
1650#endif /* RE_ENABLE_I18N */
1651
1652 /* If the state has the halt node, the state is a halt state. */
1653 if (type == END_OF_RE)
1654 newstate->halt = 1;
1655 else if (type == OP_BACK_REF)
1656 newstate->has_backref = 1;
1657 else if (type == ANCHOR || node->constraint)
1658 newstate->has_constraint = 1;
1659 }
1660 err = register_state (dfa, newstate, hash);
1661 if (BE (err != REG_NOERROR, 0))
1662 {
1663 free_state (newstate);
1664 newstate = NULL;
1665 }
1666 return newstate;
1667}
1668
1669/* Create the new state which is depend on the context CONTEXT.
1670 Return the new state if succeeded, otherwise return NULL. */
1671
1672static re_dfastate_t *
1673internal_function
1674create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675 unsigned int context, unsigned int hash)
1676{
1677 int i, nctx_nodes = 0;
1678 reg_errcode_t err;
1679 re_dfastate_t *newstate;
1680
1681 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682 if (BE (newstate == NULL, 0))
1683 return NULL;
1684 err = re_node_set_init_copy (&newstate->nodes, nodes);
1685 if (BE (err != REG_NOERROR, 0))
1686 {
1687 re_free (newstate);
1688 return NULL;
1689 }
1690
1691 newstate->context = context;
1692 newstate->entrance_nodes = &newstate->nodes;
1693
1694 for (i = 0 ; i < nodes->nelem ; i++)
1695 {
1696 re_token_t *node = dfa->nodes + nodes->elems[i];
1697 re_token_type_t type = node->type;
1698 unsigned int constraint = node->constraint;
1699
1700 if (type == CHARACTER && !constraint)
1701 continue;
1702#ifdef RE_ENABLE_I18N
1703 newstate->accept_mb |= node->accept_mb;
1704#endif /* RE_ENABLE_I18N */
1705
1706 /* If the state has the halt node, the state is a halt state. */
1707 if (type == END_OF_RE)
1708 newstate->halt = 1;
1709 else if (type == OP_BACK_REF)
1710 newstate->has_backref = 1;
1711
1712 if (constraint)
1713 {
1714 if (newstate->entrance_nodes == &newstate->nodes)
1715 {
1716 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717 if (BE (newstate->entrance_nodes == NULL, 0))
1718 {
1719 free_state (newstate);
1720 return NULL;
1721 }
1722 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723 != REG_NOERROR)
1724 return NULL;
1725 nctx_nodes = 0;
1726 newstate->has_constraint = 1;
1727 }
1728
1729 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1730 {
1731 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1732 ++nctx_nodes;
1733 }
1734 }
1735 }
1736 err = register_state (dfa, newstate, hash);
1737 if (BE (err != REG_NOERROR, 0))
1738 {
1739 free_state (newstate);
1740 newstate = NULL;
1741 }
1742 return newstate;
1743}