1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 int n) internal_function;
22static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23static void match_ctx_free (re_match_context_t *cache) internal_function;
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
25 int str_idx, int from, int to)
26 internal_function;
27static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
28 internal_function;
29static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
30 int str_idx) internal_function;
31static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 int node, int str_idx)
33 internal_function;
34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, int last_node,
36 int last_str_idx)
37 internal_function;
38static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, int length,
40 int start, int range, int stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags);
43static int re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, int length1,
45 const char *string2, int length2,
46 int start, int range, struct re_registers *regs,
47 int stop, int ret_len);
48static int re_search_stub (struct re_pattern_buffer *bufp,
49 const char *string, int length, int start,
50 int range, int stop, struct re_registers *regs,
51 int ret_len);
52static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53 int nregs, int regs_allocated);
54static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
55static int check_matching (re_match_context_t *mctx, int fl_longest_match,
56 int *p_match_first) internal_function;
57static int check_halt_state_context (const re_match_context_t *mctx,
58 const re_dfastate_t *state, int idx)
59 internal_function;
60static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
61 regmatch_t *prev_idx_match, int cur_node,
62 int cur_idx, int nmatch) internal_function;
63static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
64 int str_idx, int dest_node, int nregs,
65 regmatch_t *regs,
66 re_node_set *eps_via_nodes)
67 internal_function;
68static reg_errcode_t set_regs (const regex_t *preg,
69 const re_match_context_t *mctx,
70 size_t nmatch, regmatch_t *pmatch,
71 int fl_backtrack) internal_function;
72static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
73 internal_function;
74
75#ifdef RE_ENABLE_I18N
76static int sift_states_iter_mb (const re_match_context_t *mctx,
77 re_sift_context_t *sctx,
78 int node_idx, int str_idx, int max_str_idx)
79 internal_function;
80#endif /* RE_ENABLE_I18N */
81static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
82 re_sift_context_t *sctx)
83 internal_function;
84static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
85 re_sift_context_t *sctx, int str_idx,
86 re_node_set *cur_dest)
87 internal_function;
88static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int str_idx,
91 re_node_set *dest_nodes)
92 internal_function;
93static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
94 re_node_set *dest_nodes,
95 const re_node_set *candidates)
96 internal_function;
97static int check_dst_limits (const re_match_context_t *mctx,
98 re_node_set *limits,
99 int dst_node, int dst_idx, int src_node,
100 int src_idx) internal_function;
101static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
102 int boundaries, int subexp_idx,
103 int from_node, int bkref_idx)
104 internal_function;
105static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
106 int limit, int subexp_idx,
107 int node, int str_idx,
108 int bkref_idx) internal_function;
109static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
110 re_node_set *dest_nodes,
111 const re_node_set *candidates,
112 re_node_set *limits,
113 struct re_backref_cache_entry *bkref_ents,
114 int str_idx) internal_function;
115static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
116 re_sift_context_t *sctx,
117 int str_idx, const re_node_set *candidates)
118 internal_function;
119static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
120 re_dfastate_t **dst,
121 re_dfastate_t **src, int num)
122 internal_function;
123static re_dfastate_t *find_recover_state (reg_errcode_t *err,
124 re_match_context_t *mctx) internal_function;
125static re_dfastate_t *transit_state (reg_errcode_t *err,
126 re_match_context_t *mctx,
127 re_dfastate_t *state) internal_function;
128static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *next_state)
131 internal_function;
132static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
133 re_node_set *cur_nodes,
134 int str_idx) internal_function;
135#if 0
136static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
137 re_match_context_t *mctx,
138 re_dfastate_t *pstate)
139 internal_function;
140#endif
141#ifdef RE_ENABLE_I18N
142static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
143 re_dfastate_t *pstate)
144 internal_function;
145#endif /* RE_ENABLE_I18N */
146static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
147 const re_node_set *nodes)
148 internal_function;
149static reg_errcode_t get_subexp (re_match_context_t *mctx,
150 int bkref_node, int bkref_str_idx)
151 internal_function;
152static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
153 const re_sub_match_top_t *sub_top,
154 re_sub_match_last_t *sub_last,
155 int bkref_node, int bkref_str)
156 internal_function;
157static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
158 int subexp_idx, int type) internal_function;
159static reg_errcode_t check_arrival (re_match_context_t *mctx,
160 state_array_t *path, int top_node,
161 int top_str, int last_node, int last_str,
162 int type) internal_function;
163static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
164 int str_idx,
165 re_node_set *cur_nodes,
166 re_node_set *next_nodes)
167 internal_function;
168static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type)
171 internal_function;
172static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
173 re_node_set *dst_nodes,
174 int target, int ex_subexp,
175 int type) internal_function;
176static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
177 re_node_set *cur_nodes, int cur_str,
178 int subexp_num, int type)
179 internal_function;
180static int build_trtable (const re_dfa_t *dfa,
181 re_dfastate_t *state) internal_function;
182#ifdef RE_ENABLE_I18N
183static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
184 const re_string_t *input, int idx)
185 internal_function;
186# ifdef _LIBC
187static unsigned int find_collation_sequence_value (const unsigned char *mbs,
188 size_t name_len)
189 internal_function;
190# endif /* _LIBC */
191#endif /* RE_ENABLE_I18N */
192static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
193 const re_dfastate_t *state,
194 re_node_set *states_node,
195 bitset_t *states_ch) internal_function;
196static int check_node_accept (const re_match_context_t *mctx,
197 const re_token_t *node, int idx)
198 internal_function;
199static reg_errcode_t extend_buffers (re_match_context_t *mctx)
200 internal_function;
201\f
202/* Entry point for POSIX code. */
203
204/* regexec searches for a given pattern, specified by PREG, in the
205 string STRING.
206
207 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
208 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
209 least NMATCH elements, and we set them to the offsets of the
210 corresponding matched substrings.
211
212 EFLAGS specifies `execution flags' which affect matching: if
213 REG_NOTBOL is set, then ^ does not match at the beginning of the
214 string; if REG_NOTEOL is set, then $ does not match at the end.
215
216 We return 0 if we find a match and REG_NOMATCH if not. */
217
218int
219regexec (
220 const regex_t *__restrict preg,
221 const char *__restrict string,
222 size_t nmatch,
223 regmatch_t pmatch[],
224 int eflags)
225{
226 reg_errcode_t err;
227 int start, length;
228
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230 return REG_BADPAT;
231
232 if (eflags & REG_STARTEND)
233 {
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
236 }
237 else
238 {
239 start = 0;
240 length = strlen (string);
241 }
242
243 __libc_lock_lock (dfa->lock);
244 if (preg->no_sub)
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
247 else
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
252}
253
254#ifdef _LIBC
255# include <shlib-compat.h>
256versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
257
258# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259__typeof__ (__regexec) __compat_regexec;
260
261int
262attribute_compat_text_section
263__compat_regexec (const regex_t *__restrict preg,
264 const char *__restrict string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
266{
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
269}
270compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271# endif
272#endif
273
274/* Entry points for GNU code. */
275
276/* re_match, re_search, re_match_2, re_search_2
277
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
281
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
284
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
288 way as re_match().)
289
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
292 concerned.
293
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
297 strings.)
298
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
302
303int
304re_match (struct re_pattern_buffer *bufp,
305 const char *string,
306 int length,
307 int start,
308 struct re_registers *regs)
309{
310 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
311}
312#ifdef _LIBC
313weak_alias (__re_match, re_match)
314#endif
315
316int
317re_search (struct re_pattern_buffer *bufp,
318 const char *string,
319 int length, int start, int range,
320 struct re_registers *regs)
321{
322 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
323}
324#ifdef _LIBC
325weak_alias (__re_search, re_search)
326#endif
327
328int
329re_match_2 (struct re_pattern_buffer *bufp,
330 const char *string1, int length1,
331 const char *string2, int length2, int start,
332 struct re_registers *regs, int stop)
333{
334 return re_search_2_stub (bufp, string1, length1, string2, length2,
335 start, 0, regs, stop, 1);
336}
337#ifdef _LIBC
338weak_alias (__re_match_2, re_match_2)
339#endif
340
341int
342re_search_2 (struct re_pattern_buffer *bufp,
343 const char *string1, int length1,
344 const char *string2, int length2, int start,
345 int range, struct re_registers *regs, int stop)
346{
347 return re_search_2_stub (bufp, string1, length1, string2, length2,
348 start, range, regs, stop, 0);
349}
350#ifdef _LIBC
351weak_alias (__re_search_2, re_search_2)
352#endif
353
354static int
355re_search_2_stub (struct re_pattern_buffer *bufp,
356 const char *string1, int length1,
357 const char *string2, int length2, int start,
358 int range, struct re_registers *regs,
359 int stop, int ret_len)
360{
361 const char *str;
362 int rval;
363 int len = length1 + length2;
364 int free_str = 0;
365
366 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
367 return -2;
368
369 /* Concatenate the strings. */
370 if (length2 > 0)
371 if (length1 > 0)
372 {
373 char *s = re_malloc (char, len);
374
375 if (BE (s == NULL, 0))
376 return -2;
377 memcpy (s, string1, length1);
378 memcpy (s + length1, string2, length2);
379 str = s;
380 free_str = 1;
381 }
382 else
383 str = string2;
384 else
385 str = string1;
386
387 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
388 if (free_str)
389 re_free ((char *) str);
390 return rval;
391}
392
393/* The parameters have the same meaning as those of re_search.
394 Additional parameters:
395 If RET_LEN is nonzero the length of the match is returned (re_match style);
396 otherwise the position of the match is returned. */
397
398static int
399re_search_stub (struct re_pattern_buffer *bufp,
400 const char *string, int length, int start,
401 int range, int stop,
402 struct re_registers *regs, int ret_len)
403{
404 reg_errcode_t result;
405 regmatch_t *pmatch;
406 int nregs, rval;
407 int eflags = 0;
408
409 /* Check for out-of-range. */
410 if (BE (start < 0 || start > length, 0))
411 return -1;
412 if (BE (start + range > length, 0))
413 range = length - start;
414 else if (BE (start + range < 0, 0))
415 range = -start;
416
417 __libc_lock_lock (dfa->lock);
418
419 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
420 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
421
422 /* Compile fastmap if we haven't yet. */
423 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
424 re_compile_fastmap (bufp);
425
426 if (BE (bufp->no_sub, 0))
427 regs = NULL;
428
429 /* We need at least 1 register. */
430 if (regs == NULL)
431 nregs = 1;
432 else if (BE (bufp->regs_allocated == REGS_FIXED &&
433 regs->num_regs < bufp->re_nsub + 1, 0))
434 {
435 nregs = regs->num_regs;
436 if (BE (nregs < 1, 0))
437 {
438 /* Nothing can be copied to regs. */
439 regs = NULL;
440 nregs = 1;
441 }
442 }
443 else
444 nregs = bufp->re_nsub + 1;
445 pmatch = re_malloc (regmatch_t, nregs);
446 if (BE (pmatch == NULL, 0))
447 {
448 rval = -2;
449 goto out;
450 }
451
452 result = re_search_internal (bufp, string, length, start, range, stop,
453 nregs, pmatch, eflags);
454
455 rval = 0;
456
457 /* I hope we needn't fill their regs with -1's when no match was found. */
458 if (result != REG_NOERROR)
459 rval = -1;
460 else if (regs != NULL)
461 {
462 /* If caller wants register contents data back, copy them. */
463 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
464 bufp->regs_allocated);
465 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
466 rval = -2;
467 }
468
469 if (BE (rval == 0, 1))
470 {
471 if (ret_len)
472 {
473 assert (pmatch[0].rm_so == start);
474 rval = pmatch[0].rm_eo - start;
475 }
476 else
477 rval = pmatch[0].rm_so;
478 }
479 re_free (pmatch);
480 out:
481 __libc_lock_unlock (dfa->lock);
482 return rval;
483}
484
485static unsigned
486re_copy_regs (struct re_registers *regs,
487 regmatch_t *pmatch,
488 int nregs, int regs_allocated)
489{
490 int rval = REGS_REALLOCATE;
491 int i;
492 int need_regs = nregs + 1;
493 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
494 uses. */
495
496 /* Have the register data arrays been allocated? */
497 if (regs_allocated == REGS_UNALLOCATED)
498 { /* No. So allocate them with malloc. */
499 regs->start = re_malloc (regoff_t, need_regs);
500 if (BE (regs->start == NULL, 0))
501 return REGS_UNALLOCATED;
502 regs->end = re_malloc (regoff_t, need_regs);
503 if (BE (regs->end == NULL, 0))
504 {
505 re_free (regs->start);
506 return REGS_UNALLOCATED;
507 }
508 regs->num_regs = need_regs;
509 }
510 else if (regs_allocated == REGS_REALLOCATE)
511 { /* Yes. If we need more elements than were already
512 allocated, reallocate them. If we need fewer, just
513 leave it alone. */
514 if (BE (need_regs > regs->num_regs, 0))
515 {
516 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
517 regoff_t *new_end;
518 if (BE (new_start == NULL, 0))
519 return REGS_UNALLOCATED;
520 new_end = re_realloc (regs->end, regoff_t, need_regs);
521 if (BE (new_end == NULL, 0))
522 {
523 re_free (new_start);
524 return REGS_UNALLOCATED;
525 }
526 regs->start = new_start;
527 regs->end = new_end;
528 regs->num_regs = need_regs;
529 }
530 }
531 else
532 {
533 assert (regs_allocated == REGS_FIXED);
534 /* This function may not be called with REGS_FIXED and nregs too big. */
535 assert (regs->num_regs >= nregs);
536 rval = REGS_FIXED;
537 }
538
539 /* Copy the regs. */
540 for (i = 0; i < nregs; ++i)
541 {
542 regs->start[i] = pmatch[i].rm_so;
543 regs->end[i] = pmatch[i].rm_eo;
544 }
545 for ( ; i < regs->num_regs; ++i)
546 regs->start[i] = regs->end[i] = -1;
547
548 return rval;
549}
550
551/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
552 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
553 this memory for recording register information. STARTS and ENDS
554 must be allocated using the malloc library routine, and must each
555 be at least NUM_REGS * sizeof (regoff_t) bytes long.
556
557 If NUM_REGS == 0, then subsequent matches should allocate their own
558 register data.
559
560 Unless this function is called, the first search or match using
561 PATTERN_BUFFER will allocate its own register data, without
562 freeing the old data. */
563
564void
565re_set_registers (struct re_pattern_buffer *bufp,
566 struct re_registers *regs,
567 unsigned num_regs,
568 regoff_t *starts,
569 regoff_t *ends)
570{
571 if (num_regs)
572 {
573 bufp->regs_allocated = REGS_REALLOCATE;
574 regs->num_regs = num_regs;
575 regs->start = starts;
576 regs->end = ends;
577 }
578 else
579 {
580 bufp->regs_allocated = REGS_UNALLOCATED;
581 regs->num_regs = 0;
582 regs->start = regs->end = (regoff_t *) 0;
583 }
584}
585#ifdef _LIBC
586weak_alias (__re_set_registers, re_set_registers)
587#endif
588\f
589/* Entry points compatible with 4.2 BSD regex library. We don't define
590 them unless specifically requested. */
591
592#if defined _REGEX_RE_COMP || defined _LIBC
593int
594# ifdef _LIBC
595weak_function
596# endif
597re_exec (s)
598 const char *s;
599{
600 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
601}
602#endif /* _REGEX_RE_COMP */
603\f
604/* Internal entry point. */
605
606/* Searches for a compiled pattern PREG in the string STRING, whose
607 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
608 mingings with regexec. START, and RANGE have the same meanings
609 with re_search.
610 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
611 otherwise return the error code.
612 Note: We assume front end functions already check ranges.
613 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
614
615static reg_errcode_t
616re_search_internal (const regex_t *preg,
617 const char *string,
618 int length, int start, int range, int stop,
619 size_t nmatch, regmatch_t pmatch[],
620 int eflags)
621{
622 reg_errcode_t err;
623 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
624 int left_lim, right_lim, incr;
625 int fl_longest_match, match_first, match_kind, match_last = -1;
626 int extra_nmatch;
627 int sb, ch;
628#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
629 re_match_context_t mctx = { .dfa = dfa };
630#else
631 re_match_context_t mctx;
632#endif
633 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
634 && range && !preg->can_be_null) ? preg->fastmap : NULL;
635 RE_TRANSLATE_TYPE t = preg->translate;
636
637#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
638 memset (&mctx, '\0', sizeof (re_match_context_t));
639 mctx.dfa = dfa;
640#endif
641
642 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
643 nmatch -= extra_nmatch;
644
645 /* Check if the DFA haven't been compiled. */
646 if (BE (preg->used == 0 || dfa->init_state == NULL
647 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
648 || dfa->init_state_begbuf == NULL, 0))
649 return REG_NOMATCH;
650
651#ifdef DEBUG
652 /* We assume front-end functions already check them. */
653 assert (start + range >= 0 && start + range <= length);
654#endif
655
656 /* If initial states with non-begbuf contexts have no elements,
657 the regex must be anchored. If preg->newline_anchor is set,
658 we'll never use init_state_nl, so do not check it. */
659 if (dfa->init_state->nodes.nelem == 0
660 && dfa->init_state_word->nodes.nelem == 0
661 && (dfa->init_state_nl->nodes.nelem == 0
662 || !preg->newline_anchor))
663 {
664 if (start != 0 && start + range != 0)
665 return REG_NOMATCH;
666 start = range = 0;
667 }
668
669 /* We must check the longest matching, if nmatch > 0. */
670 fl_longest_match = (nmatch != 0 || dfa->nbackref);
671
672 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
673 preg->translate, preg->syntax & RE_ICASE, dfa);
674 if (BE (err != REG_NOERROR, 0))
675 goto free_return;
676 mctx.input.stop = stop;
677 mctx.input.raw_stop = stop;
678 mctx.input.newline_anchor = preg->newline_anchor;
679
680 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
681 if (BE (err != REG_NOERROR, 0))
682 goto free_return;
683
684 /* We will log all the DFA states through which the dfa pass,
685 if nmatch > 1, or this dfa has "multibyte node", which is a
686 back-reference or a node which can accept multibyte character or
687 multi character collating element. */
688 if (nmatch > 1 || dfa->has_mb_node)
689 {
690 /* Avoid overflow. */
691 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
692 {
693 err = REG_ESPACE;
694 goto free_return;
695 }
696
697 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
698 if (BE (mctx.state_log == NULL, 0))
699 {
700 err = REG_ESPACE;
701 goto free_return;
702 }
703 }
704 else
705 mctx.state_log = NULL;
706
707 match_first = start;
708 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
709 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
710
711 /* Check incrementally whether of not the input string match. */
712 incr = (range < 0) ? -1 : 1;
713 left_lim = (range < 0) ? start + range : start;
714 right_lim = (range < 0) ? start : start + range;
715 sb = dfa->mb_cur_max == 1;
716 match_kind =
717 (fastmap
718 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
719 | (range >= 0 ? 2 : 0)
720 | (t != NULL ? 1 : 0))
721 : 8);
722
723 for (;; match_first += incr)
724 {
725 err = REG_NOMATCH;
726 if (match_first < left_lim || right_lim < match_first)
727 goto free_return;
728
729 /* Advance as rapidly as possible through the string, until we
730 find a plausible place to start matching. This may be done
731 with varying efficiency, so there are various possibilities:
732 only the most common of them are specialized, in order to
733 save on code size. We use a switch statement for speed. */
734 switch (match_kind)
735 {
736 case 8:
737 /* No fastmap. */
738 break;
739
740 case 7:
741 /* Fastmap with single-byte translation, match forward. */
742 while (BE (match_first < right_lim, 1)
743 && !fastmap[t[(unsigned char) string[match_first]]])
744 ++match_first;
745 goto forward_match_found_start_or_reached_end;
746
747 case 6:
748 /* Fastmap without translation, match forward. */
749 while (BE (match_first < right_lim, 1)
750 && !fastmap[(unsigned char) string[match_first]])
751 ++match_first;
752
753 forward_match_found_start_or_reached_end:
754 if (BE (match_first == right_lim, 0))
755 {
756 ch = match_first >= length
757 ? 0 : (unsigned char) string[match_first];
758 if (!fastmap[t ? t[ch] : ch])
759 goto free_return;
760 }
761 break;
762
763 case 4:
764 case 5:
765 /* Fastmap without multi-byte translation, match backwards. */
766 while (match_first >= left_lim)
767 {
768 ch = match_first >= length
769 ? 0 : (unsigned char) string[match_first];
770 if (fastmap[t ? t[ch] : ch])
771 break;
772 --match_first;
773 }
774 if (match_first < left_lim)
775 goto free_return;
776 break;
777
778 default:
779 /* In this case, we can't determine easily the current byte,
780 since it might be a component byte of a multibyte
781 character. Then we use the constructed buffer instead. */
782 for (;;)
783 {
784 /* If MATCH_FIRST is out of the valid range, reconstruct the
785 buffers. */
786 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
787 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
788 {
789 err = re_string_reconstruct (&mctx.input, match_first,
790 eflags);
791 if (BE (err != REG_NOERROR, 0))
792 goto free_return;
793
794 offset = match_first - mctx.input.raw_mbs_idx;
795 }
796 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
797 Note that MATCH_FIRST must not be smaller than 0. */
798 ch = (match_first >= length
799 ? 0 : re_string_byte_at (&mctx.input, offset));
800 if (fastmap[ch])
801 break;
802 match_first += incr;
803 if (match_first < left_lim || match_first > right_lim)
804 {
805 err = REG_NOMATCH;
806 goto free_return;
807 }
808 }
809 break;
810 }
811
812 /* Reconstruct the buffers so that the matcher can assume that
813 the matching starts from the beginning of the buffer. */
814 err = re_string_reconstruct (&mctx.input, match_first, eflags);
815 if (BE (err != REG_NOERROR, 0))
816 goto free_return;
817
818#ifdef RE_ENABLE_I18N
819 /* Don't consider this char as a possible match start if it part,
820 yet isn't the head, of a multibyte character. */
821 if (!sb && !re_string_first_byte (&mctx.input, 0))
822 continue;
823#endif
824
825 /* It seems to be appropriate one, then use the matcher. */
826 /* We assume that the matching starts from 0. */
827 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
828 match_last = check_matching (&mctx, fl_longest_match,
829 range >= 0 ? &match_first : NULL);
830 if (match_last != -1)
831 {
832 if (BE (match_last == -2, 0))
833 {
834 err = REG_ESPACE;
835 goto free_return;
836 }
837 else
838 {
839 mctx.match_last = match_last;
840 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
841 {
842 re_dfastate_t *pstate = mctx.state_log[match_last];
843 mctx.last_node = check_halt_state_context (&mctx, pstate,
844 match_last);
845 }
846 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
847 || dfa->nbackref)
848 {
849 err = prune_impossible_nodes (&mctx);
850 if (err == REG_NOERROR)
851 break;
852 if (BE (err != REG_NOMATCH, 0))
853 goto free_return;
854 match_last = -1;
855 }
856 else
857 break; /* We found a match. */
858 }
859 }
860
861 match_ctx_clean (&mctx);
862 }
863
864#ifdef DEBUG
865 assert (match_last != -1);
866 assert (err == REG_NOERROR);
867#endif
868
869 /* Set pmatch[] if we need. */
870 if (nmatch > 0)
871 {
872 int reg_idx;
873
874 /* Initialize registers. */
875 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
876 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
877
878 /* Set the points where matching start/end. */
879 pmatch[0].rm_so = 0;
880 pmatch[0].rm_eo = mctx.match_last;
881
882 if (!preg->no_sub && nmatch > 1)
883 {
884 err = set_regs (preg, &mctx, nmatch, pmatch,
885 dfa->has_plural_match && dfa->nbackref > 0);
886 if (BE (err != REG_NOERROR, 0))
887 goto free_return;
888 }
889
890 /* At last, add the offset to the each registers, since we slided
891 the buffers so that we could assume that the matching starts
892 from 0. */
893 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
894 if (pmatch[reg_idx].rm_so != -1)
895 {
896#ifdef RE_ENABLE_I18N
897 if (BE (mctx.input.offsets_needed != 0, 0))
898 {
899 pmatch[reg_idx].rm_so =
900 (pmatch[reg_idx].rm_so == mctx.input.valid_len
901 ? mctx.input.valid_raw_len
902 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
903 pmatch[reg_idx].rm_eo =
904 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
905 ? mctx.input.valid_raw_len
906 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
907 }
908#else
909 assert (mctx.input.offsets_needed == 0);
910#endif
911 pmatch[reg_idx].rm_so += match_first;
912 pmatch[reg_idx].rm_eo += match_first;
913 }
914 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
915 {
916 pmatch[nmatch + reg_idx].rm_so = -1;
917 pmatch[nmatch + reg_idx].rm_eo = -1;
918 }
919
920 if (dfa->subexp_map)
921 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
922 if (dfa->subexp_map[reg_idx] != reg_idx)
923 {
924 pmatch[reg_idx + 1].rm_so
925 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
926 pmatch[reg_idx + 1].rm_eo
927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
928 }
929 }
930
931 free_return:
932 re_free (mctx.state_log);
933 if (dfa->nbackref)
934 match_ctx_free (&mctx);
935 re_string_destruct (&mctx.input);
936 return err;
937}
938
939static reg_errcode_t
940prune_impossible_nodes (re_match_context_t *mctx)
941{
942 const re_dfa_t *const dfa = mctx->dfa;
943 int halt_node, match_last;
944 reg_errcode_t ret;
945 re_dfastate_t **sifted_states;
946 re_dfastate_t **lim_states = NULL;
947 re_sift_context_t sctx;
948#ifdef DEBUG
949 assert (mctx->state_log != NULL);
950#endif
951 match_last = mctx->match_last;
952 halt_node = mctx->last_node;
953
954 /* Avoid overflow. */
955 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
956 return REG_ESPACE;
957
958 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
959 if (BE (sifted_states == NULL, 0))
960 {
961 ret = REG_ESPACE;
962 goto free_return;
963 }
964 if (dfa->nbackref)
965 {
966 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
967 if (BE (lim_states == NULL, 0))
968 {
969 ret = REG_ESPACE;
970 goto free_return;
971 }
972 while (1)
973 {
974 memset (lim_states, '\0',
975 sizeof (re_dfastate_t *) * (match_last + 1));
976 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
977 match_last);
978 ret = sift_states_backward (mctx, &sctx);
979 re_node_set_free (&sctx.limits);
980 if (BE (ret != REG_NOERROR, 0))
981 goto free_return;
982 if (sifted_states[0] != NULL || lim_states[0] != NULL)
983 break;
984 do
985 {
986 --match_last;
987 if (match_last < 0)
988 {
989 ret = REG_NOMATCH;
990 goto free_return;
991 }
992 } while (mctx->state_log[match_last] == NULL
993 || !mctx->state_log[match_last]->halt);
994 halt_node = check_halt_state_context (mctx,
995 mctx->state_log[match_last],
996 match_last);
997 }
998 ret = merge_state_array (dfa, sifted_states, lim_states,
999 match_last + 1);
1000 re_free (lim_states);
1001 lim_states = NULL;
1002 if (BE (ret != REG_NOERROR, 0))
1003 goto free_return;
1004 }
1005 else
1006 {
1007 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1008 ret = sift_states_backward (mctx, &sctx);
1009 re_node_set_free (&sctx.limits);
1010 if (BE (ret != REG_NOERROR, 0))
1011 goto free_return;
1012 if (sifted_states[0] == NULL)
1013 {
1014 ret = REG_NOMATCH;
1015 goto free_return;
1016 }
1017 }
1018 re_free (mctx->state_log);
1019 mctx->state_log = sifted_states;
1020 sifted_states = NULL;
1021 mctx->last_node = halt_node;
1022 mctx->match_last = match_last;
1023 ret = REG_NOERROR;
1024 free_return:
1025 re_free (sifted_states);
1026 re_free (lim_states);
1027 return ret;
1028}
1029
1030/* Acquire an initial state and return it.
1031 We must select appropriate initial state depending on the context,
1032 since initial states may have constraints like "\<", "^", etc.. */
1033
1034static inline re_dfastate_t *
1035__attribute ((always_inline)) internal_function
1036acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1037 int idx)
1038{
1039 const re_dfa_t *const dfa = mctx->dfa;
1040 if (dfa->init_state->has_constraint)
1041 {
1042 unsigned int context;
1043 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1044 if (IS_WORD_CONTEXT (context))
1045 return dfa->init_state_word;
1046 else if (IS_ORDINARY_CONTEXT (context))
1047 return dfa->init_state;
1048 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1049 return dfa->init_state_begbuf;
1050 else if (IS_NEWLINE_CONTEXT (context))
1051 return dfa->init_state_nl;
1052 else if (IS_BEGBUF_CONTEXT (context))
1053 {
1054 /* It is relatively rare case, then calculate on demand. */
1055 return re_acquire_state_context (err, dfa,
1056 dfa->init_state->entrance_nodes,
1057 context);
1058 }
1059 else
1060 /* Must not happen? */
1061 return dfa->init_state;
1062 }
1063 else
1064 return dfa->init_state;
1065}
1066
1067/* Check whether the regular expression match input string INPUT or not,
1068 and return the index where the matching end, return -1 if not match,
1069 or return -2 in case of an error.
1070 FL_LONGEST_MATCH means we want the POSIX longest matching.
1071 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1072 next place where we may want to try matching.
1073 Note that the matcher assume that the matching starts from the current
1074 index of the buffer. */
1075
1076static int
1077internal_function
1078check_matching (re_match_context_t *mctx, int fl_longest_match,
1079 int *p_match_first)
1080{
1081 const re_dfa_t *const dfa = mctx->dfa;
1082 reg_errcode_t err;
1083 int match = 0;
1084 int match_last = -1;
1085 int cur_str_idx = re_string_cur_idx (&mctx->input);
1086 re_dfastate_t *cur_state;
1087 int at_init_state = p_match_first != NULL;
1088 int next_start_idx = cur_str_idx;
1089
1090 err = REG_NOERROR;
1091 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1092 /* An initial state must not be NULL (invalid). */
1093 if (BE (cur_state == NULL, 0))
1094 {
1095 assert (err == REG_ESPACE);
1096 return -2;
1097 }
1098
1099 if (mctx->state_log != NULL)
1100 {
1101 mctx->state_log[cur_str_idx] = cur_state;
1102
1103 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1104 later. E.g. Processing back references. */
1105 if (BE (dfa->nbackref, 0))
1106 {
1107 at_init_state = 0;
1108 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1109 if (BE (err != REG_NOERROR, 0))
1110 return err;
1111
1112 if (cur_state->has_backref)
1113 {
1114 err = transit_state_bkref (mctx, &cur_state->nodes);
1115 if (BE (err != REG_NOERROR, 0))
1116 return err;
1117 }
1118 }
1119 }
1120
1121 /* If the RE accepts NULL string. */
1122 if (BE (cur_state->halt, 0))
1123 {
1124 if (!cur_state->has_constraint
1125 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1126 {
1127 if (!fl_longest_match)
1128 return cur_str_idx;
1129 else
1130 {
1131 match_last = cur_str_idx;
1132 match = 1;
1133 }
1134 }
1135 }
1136
1137 while (!re_string_eoi (&mctx->input))
1138 {
1139 re_dfastate_t *old_state = cur_state;
1140 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1141
1142 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1143 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1144 && mctx->input.valid_len < mctx->input.len))
1145 {
1146 err = extend_buffers (mctx);
1147 if (BE (err != REG_NOERROR, 0))
1148 {
1149 assert (err == REG_ESPACE);
1150 return -2;
1151 }
1152 }
1153
1154 cur_state = transit_state (&err, mctx, cur_state);
1155 if (mctx->state_log != NULL)
1156 cur_state = merge_state_with_log (&err, mctx, cur_state);
1157
1158 if (cur_state == NULL)
1159 {
1160 /* Reached the invalid state or an error. Try to recover a valid
1161 state using the state log, if available and if we have not
1162 already found a valid (even if not the longest) match. */
1163 if (BE (err != REG_NOERROR, 0))
1164 return -2;
1165
1166 if (mctx->state_log == NULL
1167 || (match && !fl_longest_match)
1168 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1169 break;
1170 }
1171
1172 if (BE (at_init_state, 0))
1173 {
1174 if (old_state == cur_state)
1175 next_start_idx = next_char_idx;
1176 else
1177 at_init_state = 0;
1178 }
1179
1180 if (cur_state->halt)
1181 {
1182 /* Reached a halt state.
1183 Check the halt state can satisfy the current context. */
1184 if (!cur_state->has_constraint
1185 || check_halt_state_context (mctx, cur_state,
1186 re_string_cur_idx (&mctx->input)))
1187 {
1188 /* We found an appropriate halt state. */
1189 match_last = re_string_cur_idx (&mctx->input);
1190 match = 1;
1191
1192 /* We found a match, do not modify match_first below. */
1193 p_match_first = NULL;
1194 if (!fl_longest_match)
1195 break;
1196 }
1197 }
1198 }
1199
1200 if (p_match_first)
1201 *p_match_first += next_start_idx;
1202
1203 return match_last;
1204}
1205
1206/* Check NODE match the current context. */
1207
1208static int
1209internal_function
1210check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1211{
1212 re_token_type_t type = dfa->nodes[node].type;
1213 unsigned int constraint = dfa->nodes[node].constraint;
1214 if (type != END_OF_RE)
1215 return 0;
1216 if (!constraint)
1217 return 1;
1218 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1219 return 0;
1220 return 1;
1221}
1222
1223/* Check the halt state STATE match the current context.
1224 Return 0 if not match, if the node, STATE has, is a halt node and
1225 match the context, return the node. */
1226
1227static int
1228internal_function
1229check_halt_state_context (const re_match_context_t *mctx,
1230 const re_dfastate_t *state, int idx)
1231{
1232 int i;
1233 unsigned int context;
1234#ifdef DEBUG
1235 assert (state->halt);
1236#endif
1237 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1238 for (i = 0; i < state->nodes.nelem; ++i)
1239 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1240 return state->nodes.elems[i];
1241 return 0;
1242}
1243
1244/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1245 corresponding to the DFA).
1246 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1247 of errors. */
1248
1249static int
1250internal_function
1251proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1252 int *pidx, int node, re_node_set *eps_via_nodes,
1253 struct re_fail_stack_t *fs)
1254{
1255 const re_dfa_t *const dfa = mctx->dfa;
1256 int i, err;
1257 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1258 {
1259 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1260 re_node_set *edests = &dfa->edests[node];
1261 int dest_node;
1262 err = re_node_set_insert (eps_via_nodes, node);
1263 if (BE (err < 0, 0))
1264 return -2;
1265 /* Pick up a valid destination, or return -1 if none is found. */
1266 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1267 {
1268 int candidate = edests->elems[i];
1269 if (!re_node_set_contains (cur_nodes, candidate))
1270 continue;
1271 if (dest_node == -1)
1272 dest_node = candidate;
1273
1274 else
1275 {
1276 /* In order to avoid infinite loop like "(a*)*", return the second
1277 epsilon-transition if the first was already considered. */
1278 if (re_node_set_contains (eps_via_nodes, dest_node))
1279 return candidate;
1280
1281 /* Otherwise, push the second epsilon-transition on the fail stack. */
1282 else if (fs != NULL
1283 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1284 eps_via_nodes))
1285 return -2;
1286
1287 /* We know we are going to exit. */
1288 break;
1289 }
1290 }
1291 return dest_node;
1292 }
1293 else
1294 {
1295 int naccepted = 0;
1296 re_token_type_t type = dfa->nodes[node].type;
1297
1298#ifdef RE_ENABLE_I18N
1299 if (dfa->nodes[node].accept_mb)
1300 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1301 else
1302#endif /* RE_ENABLE_I18N */
1303 if (type == OP_BACK_REF)
1304 {
1305 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1306 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1307 if (fs != NULL)
1308 {
1309 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1310 return -1;
1311 else if (naccepted)
1312 {
1313 char *buf = (char *) re_string_get_buffer (&mctx->input);
1314 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1315 naccepted) != 0)
1316 return -1;
1317 }
1318 }
1319
1320 if (naccepted == 0)
1321 {
1322 int dest_node;
1323 err = re_node_set_insert (eps_via_nodes, node);
1324 if (BE (err < 0, 0))
1325 return -2;
1326 dest_node = dfa->edests[node].elems[0];
1327 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1328 dest_node))
1329 return dest_node;
1330 }
1331 }
1332
1333 if (naccepted != 0
1334 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1335 {
1336 int dest_node = dfa->nexts[node];
1337 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1338 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1339 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1340 dest_node)))
1341 return -1;
1342 re_node_set_empty (eps_via_nodes);
1343 return dest_node;
1344 }
1345 }
1346 return -1;
1347}
1348
1349static reg_errcode_t
1350internal_function
1351push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1352 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1353{
1354 reg_errcode_t err;
1355 int num = fs->num++;
1356 if (fs->num == fs->alloc)
1357 {
1358 struct re_fail_stack_ent_t *new_array;
1359 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1360 * fs->alloc * 2));
1361 if (new_array == NULL)
1362 return REG_ESPACE;
1363 fs->alloc *= 2;
1364 fs->stack = new_array;
1365 }
1366 fs->stack[num].idx = str_idx;
1367 fs->stack[num].node = dest_node;
1368 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1369 if (fs->stack[num].regs == NULL)
1370 return REG_ESPACE;
1371 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1372 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1373 return err;
1374}
1375
1376static int
1377internal_function
1378pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1379 regmatch_t *regs, re_node_set *eps_via_nodes)
1380{
1381 int num = --fs->num;
1382 assert (num >= 0);
1383 *pidx = fs->stack[num].idx;
1384 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1385 re_node_set_free (eps_via_nodes);
1386 re_free (fs->stack[num].regs);
1387 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1388 return fs->stack[num].node;
1389}
1390
1391/* Set the positions where the subexpressions are starts/ends to registers
1392 PMATCH.
1393 Note: We assume that pmatch[0] is already set, and
1394 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1395
1396static reg_errcode_t
1397internal_function
1398set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1399 regmatch_t *pmatch, int fl_backtrack)
1400{
1401 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1402 int idx, cur_node;
1403 re_node_set eps_via_nodes;
1404 struct re_fail_stack_t *fs;
1405 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1406 regmatch_t *prev_idx_match;
1407 int prev_idx_match_malloced = 0;
1408
1409#ifdef DEBUG
1410 assert (nmatch > 1);
1411 assert (mctx->state_log != NULL);
1412#endif
1413 if (fl_backtrack)
1414 {
1415 fs = &fs_body;
1416 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1417 if (fs->stack == NULL)
1418 return REG_ESPACE;
1419 }
1420 else
1421 fs = NULL;
1422
1423 cur_node = dfa->init_node;
1424 re_node_set_init_empty (&eps_via_nodes);
1425
1426#ifdef HAVE_ALLOCA
1427 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1428 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1429 else
1430#endif
1431 {
1432 prev_idx_match = re_malloc (regmatch_t, nmatch);
1433 if (prev_idx_match == NULL)
1434 {
1435 free_fail_stack_return (fs);
1436 return REG_ESPACE;
1437 }
1438 prev_idx_match_malloced = 1;
1439 }
1440 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1441
1442 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1443 {
1444 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1445
1446 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1447 {
1448 int reg_idx;
1449 if (fs)
1450 {
1451 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1452 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1453 break;
1454 if (reg_idx == nmatch)
1455 {
1456 re_node_set_free (&eps_via_nodes);
1457 if (prev_idx_match_malloced)
1458 re_free (prev_idx_match);
1459 return free_fail_stack_return (fs);
1460 }
1461 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1462 &eps_via_nodes);
1463 }
1464 else
1465 {
1466 re_node_set_free (&eps_via_nodes);
1467 if (prev_idx_match_malloced)
1468 re_free (prev_idx_match);
1469 return REG_NOERROR;
1470 }
1471 }
1472
1473 /* Proceed to next node. */
1474 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1475 &eps_via_nodes, fs);
1476
1477 if (BE (cur_node < 0, 0))
1478 {
1479 if (BE (cur_node == -2, 0))
1480 {
1481 re_node_set_free (&eps_via_nodes);
1482 if (prev_idx_match_malloced)
1483 re_free (prev_idx_match);
1484 free_fail_stack_return (fs);
1485 return REG_ESPACE;
1486 }
1487 if (fs)
1488 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1489 &eps_via_nodes);
1490 else
1491 {
1492 re_node_set_free (&eps_via_nodes);
1493 if (prev_idx_match_malloced)
1494 re_free (prev_idx_match);
1495 return REG_NOMATCH;
1496 }
1497 }
1498 }
1499 re_node_set_free (&eps_via_nodes);
1500 if (prev_idx_match_malloced)
1501 re_free (prev_idx_match);
1502 return free_fail_stack_return (fs);
1503}
1504
1505static reg_errcode_t
1506internal_function
1507free_fail_stack_return (struct re_fail_stack_t *fs)
1508{
1509 if (fs)
1510 {
1511 int fs_idx;
1512 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1513 {
1514 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1515 re_free (fs->stack[fs_idx].regs);
1516 }
1517 re_free (fs->stack);
1518 }
1519 return REG_NOERROR;
1520}
1521
1522static void
1523internal_function
1524update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1525 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1526{
1527 int type = dfa->nodes[cur_node].type;
1528 if (type == OP_OPEN_SUBEXP)
1529 {
1530 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1531
1532 /* We are at the first node of this sub expression. */
1533 if (reg_num < nmatch)
1534 {
1535 pmatch[reg_num].rm_so = cur_idx;
1536 pmatch[reg_num].rm_eo = -1;
1537 }
1538 }
1539 else if (type == OP_CLOSE_SUBEXP)
1540 {
1541 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542 if (reg_num < nmatch)
1543 {
1544 /* We are at the last node of this sub expression. */
1545 if (pmatch[reg_num].rm_so < cur_idx)
1546 {
1547 pmatch[reg_num].rm_eo = cur_idx;
1548 /* This is a non-empty match or we are not inside an optional
1549 subexpression. Accept this right away. */
1550 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1551 }
1552 else
1553 {
1554 if (dfa->nodes[cur_node].opt_subexp
1555 && prev_idx_match[reg_num].rm_so != -1)
1556 /* We transited through an empty match for an optional
1557 subexpression, like (a?)*, and this is not the subexp's
1558 first match. Copy back the old content of the registers
1559 so that matches of an inner subexpression are undone as
1560 well, like in ((a?))*. */
1561 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1562 else
1563 /* We completed a subexpression, but it may be part of
1564 an optional one, so do not update PREV_IDX_MATCH. */
1565 pmatch[reg_num].rm_eo = cur_idx;
1566 }
1567 }
1568 }
1569}
1570
1571/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1572 and sift the nodes in each states according to the following rules.
1573 Updated state_log will be wrote to STATE_LOG.
1574
1575 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1576 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1577 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1578 the LAST_NODE, we throw away the node `a'.
1579 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1580 string `s' and transit to `b':
1581 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1582 away the node `a'.
1583 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1584 thrown away, we throw away the node `a'.
1585 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1586 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1587 node `a'.
1588 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1589 we throw away the node `a'. */
1590
1591#define STATE_NODE_CONTAINS(state,node) \
1592 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1593
1594static reg_errcode_t
1595internal_function
1596sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1597{
1598 reg_errcode_t err;
1599 int null_cnt = 0;
1600 int str_idx = sctx->last_str_idx;
1601 re_node_set cur_dest;
1602
1603#ifdef DEBUG
1604 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1605#endif
1606
1607 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1608 transit to the last_node and the last_node itself. */
1609 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1610 if (BE (err != REG_NOERROR, 0))
1611 return err;
1612 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1613 if (BE (err != REG_NOERROR, 0))
1614 goto free_return;
1615
1616 /* Then check each states in the state_log. */
1617 while (str_idx > 0)
1618 {
1619 /* Update counters. */
1620 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1621 if (null_cnt > mctx->max_mb_elem_len)
1622 {
1623 memset (sctx->sifted_states, '\0',
1624 sizeof (re_dfastate_t *) * str_idx);
1625 re_node_set_free (&cur_dest);
1626 return REG_NOERROR;
1627 }
1628 re_node_set_empty (&cur_dest);
1629 --str_idx;
1630
1631 if (mctx->state_log[str_idx])
1632 {
1633 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1634 if (BE (err != REG_NOERROR, 0))
1635 goto free_return;
1636 }
1637
1638 /* Add all the nodes which satisfy the following conditions:
1639 - It can epsilon transit to a node in CUR_DEST.
1640 - It is in CUR_SRC.
1641 And update state_log. */
1642 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1643 if (BE (err != REG_NOERROR, 0))
1644 goto free_return;
1645 }
1646 err = REG_NOERROR;
1647 free_return:
1648 re_node_set_free (&cur_dest);
1649 return err;
1650}
1651
1652static reg_errcode_t
1653internal_function
1654build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1655 int str_idx, re_node_set *cur_dest)
1656{
1657 const re_dfa_t *const dfa = mctx->dfa;
1658 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1659 int i;
1660
1661 /* Then build the next sifted state.
1662 We build the next sifted state on `cur_dest', and update
1663 `sifted_states[str_idx]' with `cur_dest'.
1664 Note:
1665 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1666 `cur_src' points the node_set of the old `state_log[str_idx]'
1667 (with the epsilon nodes pre-filtered out). */
1668 for (i = 0; i < cur_src->nelem; i++)
1669 {
1670 int prev_node = cur_src->elems[i];
1671 int naccepted = 0;
1672 int ret;
1673
1674#ifdef DEBUG
1675 re_token_type_t type = dfa->nodes[prev_node].type;
1676 assert (!IS_EPSILON_NODE (type));
1677#endif
1678#ifdef RE_ENABLE_I18N
1679 /* If the node may accept `multi byte'. */
1680 if (dfa->nodes[prev_node].accept_mb)
1681 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1682 str_idx, sctx->last_str_idx);
1683#endif /* RE_ENABLE_I18N */
1684
1685 /* We don't check backreferences here.
1686 See update_cur_sifted_state(). */
1687 if (!naccepted
1688 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1689 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1690 dfa->nexts[prev_node]))
1691 naccepted = 1;
1692
1693 if (naccepted == 0)
1694 continue;
1695
1696 if (sctx->limits.nelem)
1697 {
1698 int to_idx = str_idx + naccepted;
1699 if (check_dst_limits (mctx, &sctx->limits,
1700 dfa->nexts[prev_node], to_idx,
1701 prev_node, str_idx))
1702 continue;
1703 }
1704 ret = re_node_set_insert (cur_dest, prev_node);
1705 if (BE (ret == -1, 0))
1706 return REG_ESPACE;
1707 }
1708
1709 return REG_NOERROR;
1710}
1711
1712/* Helper functions. */
1713
1714static reg_errcode_t
1715internal_function
1716clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1717{
1718 int top = mctx->state_log_top;
1719
1720 if (next_state_log_idx >= mctx->input.bufs_len
1721 || (next_state_log_idx >= mctx->input.valid_len
1722 && mctx->input.valid_len < mctx->input.len))
1723 {
1724 reg_errcode_t err;
1725 err = extend_buffers (mctx);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 }
1729
1730 if (top < next_state_log_idx)
1731 {
1732 memset (mctx->state_log + top + 1, '\0',
1733 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1734 mctx->state_log_top = next_state_log_idx;
1735 }
1736 return REG_NOERROR;
1737}
1738
1739static reg_errcode_t
1740internal_function
1741merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1742 re_dfastate_t **src, int num)
1743{
1744 int st_idx;
1745 reg_errcode_t err;
1746 for (st_idx = 0; st_idx < num; ++st_idx)
1747 {
1748 if (dst[st_idx] == NULL)
1749 dst[st_idx] = src[st_idx];
1750 else if (src[st_idx] != NULL)
1751 {
1752 re_node_set merged_set;
1753 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1754 &src[st_idx]->nodes);
1755 if (BE (err != REG_NOERROR, 0))
1756 return err;
1757 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1758 re_node_set_free (&merged_set);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1761 }
1762 }
1763 return REG_NOERROR;
1764}
1765
1766static reg_errcode_t
1767internal_function
1768update_cur_sifted_state (const re_match_context_t *mctx,
1769 re_sift_context_t *sctx, int str_idx,
1770 re_node_set *dest_nodes)
1771{
1772 const re_dfa_t *const dfa = mctx->dfa;
1773 reg_errcode_t err = REG_NOERROR;
1774 const re_node_set *candidates;
1775 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1776 : &mctx->state_log[str_idx]->nodes);
1777
1778 if (dest_nodes->nelem == 0)
1779 sctx->sifted_states[str_idx] = NULL;
1780 else
1781 {
1782 if (candidates)
1783 {
1784 /* At first, add the nodes which can epsilon transit to a node in
1785 DEST_NODE. */
1786 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1787 if (BE (err != REG_NOERROR, 0))
1788 return err;
1789
1790 /* Then, check the limitations in the current sift_context. */
1791 if (sctx->limits.nelem)
1792 {
1793 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1794 mctx->bkref_ents, str_idx);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1797 }
1798 }
1799
1800 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1801 if (BE (err != REG_NOERROR, 0))
1802 return err;
1803 }
1804
1805 if (candidates && mctx->state_log[str_idx]->has_backref)
1806 {
1807 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1808 if (BE (err != REG_NOERROR, 0))
1809 return err;
1810 }
1811 return REG_NOERROR;
1812}
1813
1814static reg_errcode_t
1815internal_function
1816add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1817 const re_node_set *candidates)
1818{
1819 reg_errcode_t err = REG_NOERROR;
1820 int i;
1821
1822 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1823 if (BE (err != REG_NOERROR, 0))
1824 return err;
1825
1826 if (!state->inveclosure.alloc)
1827 {
1828 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1829 if (BE (err != REG_NOERROR, 0))
1830 return REG_ESPACE;
1831 for (i = 0; i < dest_nodes->nelem; i++)
1832 {
1833 err = re_node_set_merge (&state->inveclosure,
1834 dfa->inveclosures + dest_nodes->elems[i]);
1835 if (BE (err != REG_NOERROR, 0))
1836 return REG_ESPACE;
1837 }
1838 }
1839 return re_node_set_add_intersect (dest_nodes, candidates,
1840 &state->inveclosure);
1841}
1842
1843static reg_errcode_t
1844internal_function
1845sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1846 const re_node_set *candidates)
1847{
1848 int ecl_idx;
1849 reg_errcode_t err;
1850 re_node_set *inv_eclosure = dfa->inveclosures + node;
1851 re_node_set except_nodes;
1852 re_node_set_init_empty (&except_nodes);
1853 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854 {
1855 int cur_node = inv_eclosure->elems[ecl_idx];
1856 if (cur_node == node)
1857 continue;
1858 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1859 {
1860 int edst1 = dfa->edests[cur_node].elems[0];
1861 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1862 ? dfa->edests[cur_node].elems[1] : -1);
1863 if ((!re_node_set_contains (inv_eclosure, edst1)
1864 && re_node_set_contains (dest_nodes, edst1))
1865 || (edst2 > 0
1866 && !re_node_set_contains (inv_eclosure, edst2)
1867 && re_node_set_contains (dest_nodes, edst2)))
1868 {
1869 err = re_node_set_add_intersect (&except_nodes, candidates,
1870 dfa->inveclosures + cur_node);
1871 if (BE (err != REG_NOERROR, 0))
1872 {
1873 re_node_set_free (&except_nodes);
1874 return err;
1875 }
1876 }
1877 }
1878 }
1879 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880 {
1881 int cur_node = inv_eclosure->elems[ecl_idx];
1882 if (!re_node_set_contains (&except_nodes, cur_node))
1883 {
1884 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1885 re_node_set_remove_at (dest_nodes, idx);
1886 }
1887 }
1888 re_node_set_free (&except_nodes);
1889 return REG_NOERROR;
1890}
1891
1892static int
1893internal_function
1894check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1895 int dst_node, int dst_idx, int src_node, int src_idx)
1896{
1897 const re_dfa_t *const dfa = mctx->dfa;
1898 int lim_idx, src_pos, dst_pos;
1899
1900 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1901 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1902 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1903 {
1904 int subexp_idx;
1905 struct re_backref_cache_entry *ent;
1906 ent = mctx->bkref_ents + limits->elems[lim_idx];
1907 subexp_idx = dfa->nodes[ent->node].opr.idx;
1908
1909 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1910 subexp_idx, dst_node, dst_idx,
1911 dst_bkref_idx);
1912 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1913 subexp_idx, src_node, src_idx,
1914 src_bkref_idx);
1915
1916 /* In case of:
1917 <src> <dst> ( <subexp> )
1918 ( <subexp> ) <src> <dst>
1919 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1920 if (src_pos == dst_pos)
1921 continue; /* This is unrelated limitation. */
1922 else
1923 return 1;
1924 }
1925 return 0;
1926}
1927
1928static int
1929internal_function
1930check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1931 int subexp_idx, int from_node, int bkref_idx)
1932{
1933 const re_dfa_t *const dfa = mctx->dfa;
1934 const re_node_set *eclosures = dfa->eclosures + from_node;
1935 int node_idx;
1936
1937 /* Else, we are on the boundary: examine the nodes on the epsilon
1938 closure. */
1939 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1940 {
1941 int node = eclosures->elems[node_idx];
1942 switch (dfa->nodes[node].type)
1943 {
1944 case OP_BACK_REF:
1945 if (bkref_idx != -1)
1946 {
1947 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1948 do
1949 {
1950 int dst, cpos;
1951
1952 if (ent->node != node)
1953 continue;
1954
1955 if (subexp_idx < BITSET_WORD_BITS
1956 && !(ent->eps_reachable_subexps_map
1957 & ((bitset_word_t) 1 << subexp_idx)))
1958 continue;
1959
1960 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961 OP_CLOSE_SUBEXP cases below. But, if the
1962 destination node is the same node as the source
1963 node, don't recurse because it would cause an
1964 infinite loop: a regex that exhibits this behavior
1965 is ()\1*\1* */
1966 dst = dfa->edests[node].elems[0];
1967 if (dst == from_node)
1968 {
1969 if (boundaries & 1)
1970 return -1;
1971 else /* if (boundaries & 2) */
1972 return 0;
1973 }
1974
1975 cpos =
1976 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1977 dst, bkref_idx);
1978 if (cpos == -1 /* && (boundaries & 1) */)
1979 return -1;
1980 if (cpos == 0 && (boundaries & 2))
1981 return 0;
1982
1983 if (subexp_idx < BITSET_WORD_BITS)
1984 ent->eps_reachable_subexps_map
1985 &= ~((bitset_word_t) 1 << subexp_idx);
1986 }
1987 while (ent++->more);
1988 }
1989 break;
1990
1991 case OP_OPEN_SUBEXP:
1992 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1993 return -1;
1994 break;
1995
1996 case OP_CLOSE_SUBEXP:
1997 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1998 return 0;
1999 break;
2000
2001 default:
2002 break;
2003 }
2004 }
2005
2006 return (boundaries & 2) ? 1 : 0;
2007}
2008
2009static int
2010internal_function
2011check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2012 int subexp_idx, int from_node, int str_idx,
2013 int bkref_idx)
2014{
2015 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2016 int boundaries;
2017
2018 /* If we are outside the range of the subexpression, return -1 or 1. */
2019 if (str_idx < lim->subexp_from)
2020 return -1;
2021
2022 if (lim->subexp_to < str_idx)
2023 return 1;
2024
2025 /* If we are within the subexpression, return 0. */
2026 boundaries = (str_idx == lim->subexp_from);
2027 boundaries |= (str_idx == lim->subexp_to) << 1;
2028 if (boundaries == 0)
2029 return 0;
2030
2031 /* Else, examine epsilon closure. */
2032 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2033 from_node, bkref_idx);
2034}
2035
2036/* Check the limitations of sub expressions LIMITS, and remove the nodes
2037 which are against limitations from DEST_NODES. */
2038
2039static reg_errcode_t
2040internal_function
2041check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2042 const re_node_set *candidates, re_node_set *limits,
2043 struct re_backref_cache_entry *bkref_ents, int str_idx)
2044{
2045 reg_errcode_t err;
2046 int node_idx, lim_idx;
2047
2048 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2049 {
2050 int subexp_idx;
2051 struct re_backref_cache_entry *ent;
2052 ent = bkref_ents + limits->elems[lim_idx];
2053
2054 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2055 continue; /* This is unrelated limitation. */
2056
2057 subexp_idx = dfa->nodes[ent->node].opr.idx;
2058 if (ent->subexp_to == str_idx)
2059 {
2060 int ops_node = -1;
2061 int cls_node = -1;
2062 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2063 {
2064 int node = dest_nodes->elems[node_idx];
2065 re_token_type_t type = dfa->nodes[node].type;
2066 if (type == OP_OPEN_SUBEXP
2067 && subexp_idx == dfa->nodes[node].opr.idx)
2068 ops_node = node;
2069 else if (type == OP_CLOSE_SUBEXP
2070 && subexp_idx == dfa->nodes[node].opr.idx)
2071 cls_node = node;
2072 }
2073
2074 /* Check the limitation of the open subexpression. */
2075 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2076 if (ops_node >= 0)
2077 {
2078 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2079 candidates);
2080 if (BE (err != REG_NOERROR, 0))
2081 return err;
2082 }
2083
2084 /* Check the limitation of the close subexpression. */
2085 if (cls_node >= 0)
2086 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2087 {
2088 int node = dest_nodes->elems[node_idx];
2089 if (!re_node_set_contains (dfa->inveclosures + node,
2090 cls_node)
2091 && !re_node_set_contains (dfa->eclosures + node,
2092 cls_node))
2093 {
2094 /* It is against this limitation.
2095 Remove it form the current sifted state. */
2096 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2097 candidates);
2098 if (BE (err != REG_NOERROR, 0))
2099 return err;
2100 --node_idx;
2101 }
2102 }
2103 }
2104 else /* (ent->subexp_to != str_idx) */
2105 {
2106 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2107 {
2108 int node = dest_nodes->elems[node_idx];
2109 re_token_type_t type = dfa->nodes[node].type;
2110 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2111 {
2112 if (subexp_idx != dfa->nodes[node].opr.idx)
2113 continue;
2114 /* It is against this limitation.
2115 Remove it form the current sifted state. */
2116 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2117 candidates);
2118 if (BE (err != REG_NOERROR, 0))
2119 return err;
2120 }
2121 }
2122 }
2123 }
2124 return REG_NOERROR;
2125}
2126
2127static reg_errcode_t
2128internal_function
2129sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2130 int str_idx, const re_node_set *candidates)
2131{
2132 const re_dfa_t *const dfa = mctx->dfa;
2133 reg_errcode_t err;
2134 int node_idx, node;
2135 re_sift_context_t local_sctx;
2136 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2137
2138 if (first_idx == -1)
2139 return REG_NOERROR;
2140
2141 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2142
2143 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2144 {
2145 int enabled_idx;
2146 re_token_type_t type;
2147 struct re_backref_cache_entry *entry;
2148 node = candidates->elems[node_idx];
2149 type = dfa->nodes[node].type;
2150 /* Avoid infinite loop for the REs like "()\1+". */
2151 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2152 continue;
2153 if (type != OP_BACK_REF)
2154 continue;
2155
2156 entry = mctx->bkref_ents + first_idx;
2157 enabled_idx = first_idx;
2158 do
2159 {
2160 int subexp_len;
2161 int to_idx;
2162 int dst_node;
2163 int ret;
2164 re_dfastate_t *cur_state;
2165
2166 if (entry->node != node)
2167 continue;
2168 subexp_len = entry->subexp_to - entry->subexp_from;
2169 to_idx = str_idx + subexp_len;
2170 dst_node = (subexp_len ? dfa->nexts[node]
2171 : dfa->edests[node].elems[0]);
2172
2173 if (to_idx > sctx->last_str_idx
2174 || sctx->sifted_states[to_idx] == NULL
2175 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2176 || check_dst_limits (mctx, &sctx->limits, node,
2177 str_idx, dst_node, to_idx))
2178 continue;
2179
2180 if (local_sctx.sifted_states == NULL)
2181 {
2182 local_sctx = *sctx;
2183 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2184 if (BE (err != REG_NOERROR, 0))
2185 goto free_return;
2186 }
2187 local_sctx.last_node = node;
2188 local_sctx.last_str_idx = str_idx;
2189 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2190 if (BE (ret < 0, 0))
2191 {
2192 err = REG_ESPACE;
2193 goto free_return;
2194 }
2195 cur_state = local_sctx.sifted_states[str_idx];
2196 err = sift_states_backward (mctx, &local_sctx);
2197 if (BE (err != REG_NOERROR, 0))
2198 goto free_return;
2199 if (sctx->limited_states != NULL)
2200 {
2201 err = merge_state_array (dfa, sctx->limited_states,
2202 local_sctx.sifted_states,
2203 str_idx + 1);
2204 if (BE (err != REG_NOERROR, 0))
2205 goto free_return;
2206 }
2207 local_sctx.sifted_states[str_idx] = cur_state;
2208 re_node_set_remove (&local_sctx.limits, enabled_idx);
2209
2210 /* mctx->bkref_ents may have changed, reload the pointer. */
2211 entry = mctx->bkref_ents + enabled_idx;
2212 }
2213 while (enabled_idx++, entry++->more);
2214 }
2215 err = REG_NOERROR;
2216 free_return:
2217 if (local_sctx.sifted_states != NULL)
2218 {
2219 re_node_set_free (&local_sctx.limits);
2220 }
2221
2222 return err;
2223}
2224
2225
2226#ifdef RE_ENABLE_I18N
2227static int
2228internal_function
2229sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2230 int node_idx, int str_idx, int max_str_idx)
2231{
2232 const re_dfa_t *const dfa = mctx->dfa;
2233 int naccepted;
2234 /* Check the node can accept `multi byte'. */
2235 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2236 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2237 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2238 dfa->nexts[node_idx]))
2239 /* The node can't accept the `multi byte', or the
2240 destination was already thrown away, then the node
2241 couldn't accept the current input `multi byte'. */
2242 naccepted = 0;
2243 /* Otherwise, it is sure that the node could accept
2244 `naccepted' bytes input. */
2245 return naccepted;
2246}
2247#endif /* RE_ENABLE_I18N */
2248
2249\f
2250/* Functions for state transition. */
2251
2252/* Return the next state to which the current state STATE will transit by
2253 accepting the current input byte, and update STATE_LOG if necessary.
2254 If STATE can accept a multibyte char/collating element/back reference
2255 update the destination of STATE_LOG. */
2256
2257static re_dfastate_t *
2258internal_function
2259transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2260 re_dfastate_t *state)
2261{
2262 re_dfastate_t **trtable;
2263 unsigned char ch;
2264
2265#ifdef RE_ENABLE_I18N
2266 /* If the current state can accept multibyte. */
2267 if (BE (state->accept_mb, 0))
2268 {
2269 *err = transit_state_mb (mctx, state);
2270 if (BE (*err != REG_NOERROR, 0))
2271 return NULL;
2272 }
2273#endif /* RE_ENABLE_I18N */
2274
2275 /* Then decide the next state with the single byte. */
2276#if 0
2277 if (0)
2278 /* don't use transition table */
2279 return transit_state_sb (err, mctx, state);
2280#endif
2281
2282 /* Use transition table */
2283 ch = re_string_fetch_byte (&mctx->input);
2284 for (;;)
2285 {
2286 trtable = state->trtable;
2287 if (BE (trtable != NULL, 1))
2288 return trtable[ch];
2289
2290 trtable = state->word_trtable;
2291 if (BE (trtable != NULL, 1))
2292 {
2293 unsigned int context;
2294 context
2295 = re_string_context_at (&mctx->input,
2296 re_string_cur_idx (&mctx->input) - 1,
2297 mctx->eflags);
2298 if (IS_WORD_CONTEXT (context))
2299 return trtable[ch + SBC_MAX];
2300 else
2301 return trtable[ch];
2302 }
2303
2304 if (!build_trtable (mctx->dfa, state))
2305 {
2306 *err = REG_ESPACE;
2307 return NULL;
2308 }
2309
2310 /* Retry, we now have a transition table. */
2311 }
2312}
2313
2314/* Update the state_log if we need */
2315static re_dfastate_t *
2316internal_function
2317merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2318 re_dfastate_t *next_state)
2319{
2320 const re_dfa_t *const dfa = mctx->dfa;
2321 int cur_idx = re_string_cur_idx (&mctx->input);
2322
2323 if (cur_idx > mctx->state_log_top)
2324 {
2325 mctx->state_log[cur_idx] = next_state;
2326 mctx->state_log_top = cur_idx;
2327 }
2328 else if (mctx->state_log[cur_idx] == NULL)
2329 {
2330 mctx->state_log[cur_idx] = next_state;
2331 }
2332 else
2333 {
2334 re_dfastate_t *pstate;
2335 unsigned int context;
2336 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2337 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338 the destination of a multibyte char/collating element/
2339 back reference. Then the next state is the union set of
2340 these destinations and the results of the transition table. */
2341 pstate = mctx->state_log[cur_idx];
2342 log_nodes = pstate->entrance_nodes;
2343 if (next_state != NULL)
2344 {
2345 table_nodes = next_state->entrance_nodes;
2346 *err = re_node_set_init_union (&next_nodes, table_nodes,
2347 log_nodes);
2348 if (BE (*err != REG_NOERROR, 0))
2349 return NULL;
2350 }
2351 else
2352 next_nodes = *log_nodes;
2353 /* Note: We already add the nodes of the initial state,
2354 then we don't need to add them here. */
2355
2356 context = re_string_context_at (&mctx->input,
2357 re_string_cur_idx (&mctx->input) - 1,
2358 mctx->eflags);
2359 next_state = mctx->state_log[cur_idx]
2360 = re_acquire_state_context (err, dfa, &next_nodes, context);
2361 /* We don't need to check errors here, since the return value of
2362 this function is next_state and ERR is already set. */
2363
2364 if (table_nodes != NULL)
2365 re_node_set_free (&next_nodes);
2366 }
2367
2368 if (BE (dfa->nbackref, 0) && next_state != NULL)
2369 {
2370 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371 later. We must check them here, since the back references in the
2372 next state might use them. */
2373 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2374 cur_idx);
2375 if (BE (*err != REG_NOERROR, 0))
2376 return NULL;
2377
2378 /* If the next state has back references. */
2379 if (next_state->has_backref)
2380 {
2381 *err = transit_state_bkref (mctx, &next_state->nodes);
2382 if (BE (*err != REG_NOERROR, 0))
2383 return NULL;
2384 next_state = mctx->state_log[cur_idx];
2385 }
2386 }
2387
2388 return next_state;
2389}
2390
2391/* Skip bytes in the input that correspond to part of a
2392 multi-byte match, then look in the log for a state
2393 from which to restart matching. */
2394static re_dfastate_t *
2395internal_function
2396find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2397{
2398 re_dfastate_t *cur_state;
2399 do
2400 {
2401 int max = mctx->state_log_top;
2402 int cur_str_idx = re_string_cur_idx (&mctx->input);
2403
2404 do
2405 {
2406 if (++cur_str_idx > max)
2407 return NULL;
2408 re_string_skip_bytes (&mctx->input, 1);
2409 }
2410 while (mctx->state_log[cur_str_idx] == NULL);
2411
2412 cur_state = merge_state_with_log (err, mctx, NULL);
2413 }
2414 while (*err == REG_NOERROR && cur_state == NULL);
2415 return cur_state;
2416}
2417
2418/* Helper functions for transit_state. */
2419
2420/* From the node set CUR_NODES, pick up the nodes whose types are
2421 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422 expression. And register them to use them later for evaluating the
2423 correspoding back references. */
2424
2425static reg_errcode_t
2426internal_function
2427check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2428 int str_idx)
2429{
2430 const re_dfa_t *const dfa = mctx->dfa;
2431 int node_idx;
2432 reg_errcode_t err;
2433
2434 /* TODO: This isn't efficient.
2435 Because there might be more than one nodes whose types are
2436 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2437 nodes.
2438 E.g. RE: (a){2} */
2439 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2440 {
2441 int node = cur_nodes->elems[node_idx];
2442 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2443 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2444 && (dfa->used_bkref_map
2445 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2446 {
2447 err = match_ctx_add_subtop (mctx, node, str_idx);
2448 if (BE (err != REG_NOERROR, 0))
2449 return err;
2450 }
2451 }
2452 return REG_NOERROR;
2453}
2454
2455#if 0
2456/* Return the next state to which the current state STATE will transit by
2457 accepting the current input byte. */
2458
2459static re_dfastate_t *
2460transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2461 re_dfastate_t *state)
2462{
2463 const re_dfa_t *const dfa = mctx->dfa;
2464 re_node_set next_nodes;
2465 re_dfastate_t *next_state;
2466 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2467 unsigned int context;
2468
2469 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2470 if (BE (*err != REG_NOERROR, 0))
2471 return NULL;
2472 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2473 {
2474 int cur_node = state->nodes.elems[node_cnt];
2475 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2476 {
2477 *err = re_node_set_merge (&next_nodes,
2478 dfa->eclosures + dfa->nexts[cur_node]);
2479 if (BE (*err != REG_NOERROR, 0))
2480 {
2481 re_node_set_free (&next_nodes);
2482 return NULL;
2483 }
2484 }
2485 }
2486 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2487 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2488 /* We don't need to check errors here, since the return value of
2489 this function is next_state and ERR is already set. */
2490
2491 re_node_set_free (&next_nodes);
2492 re_string_skip_bytes (&mctx->input, 1);
2493 return next_state;
2494}
2495#endif
2496
2497#ifdef RE_ENABLE_I18N
2498static reg_errcode_t
2499internal_function
2500transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2501{
2502 const re_dfa_t *const dfa = mctx->dfa;
2503 reg_errcode_t err;
2504 int i;
2505
2506 for (i = 0; i < pstate->nodes.nelem; ++i)
2507 {
2508 re_node_set dest_nodes, *new_nodes;
2509 int cur_node_idx = pstate->nodes.elems[i];
2510 int naccepted, dest_idx;
2511 unsigned int context;
2512 re_dfastate_t *dest_state;
2513
2514 if (!dfa->nodes[cur_node_idx].accept_mb)
2515 continue;
2516
2517 if (dfa->nodes[cur_node_idx].constraint)
2518 {
2519 context = re_string_context_at (&mctx->input,
2520 re_string_cur_idx (&mctx->input),
2521 mctx->eflags);
2522 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2523 context))
2524 continue;
2525 }
2526
2527 /* How many bytes the node can accept? */
2528 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2529 re_string_cur_idx (&mctx->input));
2530 if (naccepted == 0)
2531 continue;
2532
2533 /* The node can accepts `naccepted' bytes. */
2534 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2535 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2536 : mctx->max_mb_elem_len);
2537 err = clean_state_log_if_needed (mctx, dest_idx);
2538 if (BE (err != REG_NOERROR, 0))
2539 return err;
2540#ifdef DEBUG
2541 assert (dfa->nexts[cur_node_idx] != -1);
2542#endif
2543 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2544
2545 dest_state = mctx->state_log[dest_idx];
2546 if (dest_state == NULL)
2547 dest_nodes = *new_nodes;
2548 else
2549 {
2550 err = re_node_set_init_union (&dest_nodes,
2551 dest_state->entrance_nodes, new_nodes);
2552 if (BE (err != REG_NOERROR, 0))
2553 return err;
2554 }
2555 context = re_string_context_at (&mctx->input, dest_idx - 1,
2556 mctx->eflags);
2557 mctx->state_log[dest_idx]
2558 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2559 if (dest_state != NULL)
2560 re_node_set_free (&dest_nodes);
2561 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2562 return err;
2563 }
2564 return REG_NOERROR;
2565}
2566#endif /* RE_ENABLE_I18N */
2567
2568static reg_errcode_t
2569internal_function
2570transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2571{
2572 const re_dfa_t *const dfa = mctx->dfa;
2573 reg_errcode_t err;
2574 int i;
2575 int cur_str_idx = re_string_cur_idx (&mctx->input);
2576
2577 for (i = 0; i < nodes->nelem; ++i)
2578 {
2579 int dest_str_idx, prev_nelem, bkc_idx;
2580 int node_idx = nodes->elems[i];
2581 unsigned int context;
2582 const re_token_t *node = dfa->nodes + node_idx;
2583 re_node_set *new_dest_nodes;
2584
2585 /* Check whether `node' is a backreference or not. */
2586 if (node->type != OP_BACK_REF)
2587 continue;
2588
2589 if (node->constraint)
2590 {
2591 context = re_string_context_at (&mctx->input, cur_str_idx,
2592 mctx->eflags);
2593 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2594 continue;
2595 }
2596
2597 /* `node' is a backreference.
2598 Check the substring which the substring matched. */
2599 bkc_idx = mctx->nbkref_ents;
2600 err = get_subexp (mctx, node_idx, cur_str_idx);
2601 if (BE (err != REG_NOERROR, 0))
2602 goto free_return;
2603
2604 /* And add the epsilon closures (which is `new_dest_nodes') of
2605 the backreference to appropriate state_log. */
2606#ifdef DEBUG
2607 assert (dfa->nexts[node_idx] != -1);
2608#endif
2609 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2610 {
2611 int subexp_len;
2612 re_dfastate_t *dest_state;
2613 struct re_backref_cache_entry *bkref_ent;
2614 bkref_ent = mctx->bkref_ents + bkc_idx;
2615 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2616 continue;
2617 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2618 new_dest_nodes = (subexp_len == 0
2619 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2620 : dfa->eclosures + dfa->nexts[node_idx]);
2621 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2622 - bkref_ent->subexp_from);
2623 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2624 mctx->eflags);
2625 dest_state = mctx->state_log[dest_str_idx];
2626 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2627 : mctx->state_log[cur_str_idx]->nodes.nelem);
2628 /* Add `new_dest_node' to state_log. */
2629 if (dest_state == NULL)
2630 {
2631 mctx->state_log[dest_str_idx]
2632 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2633 context);
2634 if (BE (mctx->state_log[dest_str_idx] == NULL
2635 && err != REG_NOERROR, 0))
2636 goto free_return;
2637 }
2638 else
2639 {
2640 re_node_set dest_nodes;
2641 err = re_node_set_init_union (&dest_nodes,
2642 dest_state->entrance_nodes,
2643 new_dest_nodes);
2644 if (BE (err != REG_NOERROR, 0))
2645 {
2646 re_node_set_free (&dest_nodes);
2647 goto free_return;
2648 }
2649 mctx->state_log[dest_str_idx]
2650 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2651 re_node_set_free (&dest_nodes);
2652 if (BE (mctx->state_log[dest_str_idx] == NULL
2653 && err != REG_NOERROR, 0))
2654 goto free_return;
2655 }
2656 /* We need to check recursively if the backreference can epsilon
2657 transit. */
2658 if (subexp_len == 0
2659 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2660 {
2661 err = check_subexp_matching_top (mctx, new_dest_nodes,
2662 cur_str_idx);
2663 if (BE (err != REG_NOERROR, 0))
2664 goto free_return;
2665 err = transit_state_bkref (mctx, new_dest_nodes);
2666 if (BE (err != REG_NOERROR, 0))
2667 goto free_return;
2668 }
2669 }
2670 }
2671 err = REG_NOERROR;
2672 free_return:
2673 return err;
2674}
2675
2676/* Enumerate all the candidates which the backreference BKREF_NODE can match
2677 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678 Note that we might collect inappropriate candidates here.
2679 However, the cost of checking them strictly here is too high, then we
2680 delay these checking for prune_impossible_nodes(). */
2681
2682static reg_errcode_t
2683internal_function
2684get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2685{
2686 const re_dfa_t *const dfa = mctx->dfa;
2687 int subexp_num, sub_top_idx;
2688 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2689 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2690 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2691 if (cache_idx != -1)
2692 {
2693 const struct re_backref_cache_entry *entry
2694 = mctx->bkref_ents + cache_idx;
2695 do
2696 if (entry->node == bkref_node)
2697 return REG_NOERROR; /* We already checked it. */
2698 while (entry++->more);
2699 }
2700
2701 subexp_num = dfa->nodes[bkref_node].opr.idx;
2702
2703 /* For each sub expression */
2704 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2705 {
2706 reg_errcode_t err;
2707 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2708 re_sub_match_last_t *sub_last;
2709 int sub_last_idx, sl_str, bkref_str_off;
2710
2711 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2712 continue; /* It isn't related. */
2713
2714 sl_str = sub_top->str_idx;
2715 bkref_str_off = bkref_str_idx;
2716 /* At first, check the last node of sub expressions we already
2717 evaluated. */
2718 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2719 {
2720 int sl_str_diff;
2721 sub_last = sub_top->lasts[sub_last_idx];
2722 sl_str_diff = sub_last->str_idx - sl_str;
2723 /* The matched string by the sub expression match with the substring
2724 at the back reference? */
2725 if (sl_str_diff > 0)
2726 {
2727 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2728 {
2729 /* Not enough chars for a successful match. */
2730 if (bkref_str_off + sl_str_diff > mctx->input.len)
2731 break;
2732
2733 err = clean_state_log_if_needed (mctx,
2734 bkref_str_off
2735 + sl_str_diff);
2736 if (BE (err != REG_NOERROR, 0))
2737 return err;
2738 buf = (const char *) re_string_get_buffer (&mctx->input);
2739 }
2740 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2741 /* We don't need to search this sub expression any more. */
2742 break;
2743 }
2744 bkref_str_off += sl_str_diff;
2745 sl_str += sl_str_diff;
2746 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2747 bkref_str_idx);
2748
2749 /* Reload buf, since the preceding call might have reallocated
2750 the buffer. */
2751 buf = (const char *) re_string_get_buffer (&mctx->input);
2752
2753 if (err == REG_NOMATCH)
2754 continue;
2755 if (BE (err != REG_NOERROR, 0))
2756 return err;
2757 }
2758
2759 if (sub_last_idx < sub_top->nlasts)
2760 continue;
2761 if (sub_last_idx > 0)
2762 ++sl_str;
2763 /* Then, search for the other last nodes of the sub expression. */
2764 for (; sl_str <= bkref_str_idx; ++sl_str)
2765 {
2766 int cls_node, sl_str_off;
2767 const re_node_set *nodes;
2768 sl_str_off = sl_str - sub_top->str_idx;
2769 /* The matched string by the sub expression match with the substring
2770 at the back reference? */
2771 if (sl_str_off > 0)
2772 {
2773 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2774 {
2775 /* If we are at the end of the input, we cannot match. */
2776 if (bkref_str_off >= mctx->input.len)
2777 break;
2778
2779 err = extend_buffers (mctx);
2780 if (BE (err != REG_NOERROR, 0))
2781 return err;
2782
2783 buf = (const char *) re_string_get_buffer (&mctx->input);
2784 }
2785 if (buf [bkref_str_off++] != buf[sl_str - 1])
2786 break; /* We don't need to search this sub expression
2787 any more. */
2788 }
2789 if (mctx->state_log[sl_str] == NULL)
2790 continue;
2791 /* Does this state have a ')' of the sub expression? */
2792 nodes = &mctx->state_log[sl_str]->nodes;
2793 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2794 OP_CLOSE_SUBEXP);
2795 if (cls_node == -1)
2796 continue; /* No. */
2797 if (sub_top->path == NULL)
2798 {
2799 sub_top->path = calloc (sizeof (state_array_t),
2800 sl_str - sub_top->str_idx + 1);
2801 if (sub_top->path == NULL)
2802 return REG_ESPACE;
2803 }
2804 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805 in the current context? */
2806 err = check_arrival (mctx, sub_top->path, sub_top->node,
2807 sub_top->str_idx, cls_node, sl_str,
2808 OP_CLOSE_SUBEXP);
2809 if (err == REG_NOMATCH)
2810 continue;
2811 if (BE (err != REG_NOERROR, 0))
2812 return err;
2813 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2814 if (BE (sub_last == NULL, 0))
2815 return REG_ESPACE;
2816 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2817 bkref_str_idx);
2818 if (err == REG_NOMATCH)
2819 continue;
2820 }
2821 }
2822 return REG_NOERROR;
2823}
2824
2825/* Helper functions for get_subexp(). */
2826
2827/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828 If it can arrive, register the sub expression expressed with SUB_TOP
2829 and SUB_LAST. */
2830
2831static reg_errcode_t
2832internal_function
2833get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2834 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2835{
2836 reg_errcode_t err;
2837 int to_idx;
2838 /* Can the subexpression arrive the back reference? */
2839 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2840 sub_last->str_idx, bkref_node, bkref_str,
2841 OP_OPEN_SUBEXP);
2842 if (err != REG_NOERROR)
2843 return err;
2844 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2845 sub_last->str_idx);
2846 if (BE (err != REG_NOERROR, 0))
2847 return err;
2848 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2849 return clean_state_log_if_needed (mctx, to_idx);
2850}
2851
2852/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853 Search '(' if FL_OPEN, or search ')' otherwise.
2854 TODO: This function isn't efficient...
2855 Because there might be more than one nodes whose types are
2856 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857 nodes.
2858 E.g. RE: (a){2} */
2859
2860static int
2861internal_function
2862find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2863 int subexp_idx, int type)
2864{
2865 int cls_idx;
2866 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2867 {
2868 int cls_node = nodes->elems[cls_idx];
2869 const re_token_t *node = dfa->nodes + cls_node;
2870 if (node->type == type
2871 && node->opr.idx == subexp_idx)
2872 return cls_node;
2873 }
2874 return -1;
2875}
2876
2877/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2879 heavily reused.
2880 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2881
2882static reg_errcode_t
2883internal_function
2884check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2885 int top_str, int last_node, int last_str, int type)
2886{
2887 const re_dfa_t *const dfa = mctx->dfa;
2888 reg_errcode_t err = REG_NOERROR;
2889 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2890 re_dfastate_t *cur_state = NULL;
2891 re_node_set *cur_nodes, next_nodes;
2892 re_dfastate_t **backup_state_log;
2893 unsigned int context;
2894
2895 subexp_num = dfa->nodes[top_node].opr.idx;
2896 /* Extend the buffer if we need. */
2897 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2898 {
2899 re_dfastate_t **new_array;
2900 int old_alloc = path->alloc;
2901 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2902 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2903 if (BE (new_array == NULL, 0))
2904 {
2905 path->alloc = old_alloc;
2906 return REG_ESPACE;
2907 }
2908 path->array = new_array;
2909 memset (new_array + old_alloc, '\0',
2910 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2911 }
2912
2913 str_idx = path->next_idx ? path->next_idx : top_str;
2914
2915 /* Temporary modify MCTX. */
2916 backup_state_log = mctx->state_log;
2917 backup_cur_idx = mctx->input.cur_idx;
2918 mctx->state_log = path->array;
2919 mctx->input.cur_idx = str_idx;
2920
2921 /* Setup initial node set. */
2922 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2923 if (str_idx == top_str)
2924 {
2925 err = re_node_set_init_1 (&next_nodes, top_node);
2926 if (BE (err != REG_NOERROR, 0))
2927 return err;
2928 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2929 if (BE (err != REG_NOERROR, 0))
2930 {
2931 re_node_set_free (&next_nodes);
2932 return err;
2933 }
2934 }
2935 else
2936 {
2937 cur_state = mctx->state_log[str_idx];
2938 if (cur_state && cur_state->has_backref)
2939 {
2940 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2941 if (BE (err != REG_NOERROR, 0))
2942 return err;
2943 }
2944 else
2945 re_node_set_init_empty (&next_nodes);
2946 }
2947 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2948 {
2949 if (next_nodes.nelem)
2950 {
2951 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2952 subexp_num, type);
2953 if (BE (err != REG_NOERROR, 0))
2954 {
2955 re_node_set_free (&next_nodes);
2956 return err;
2957 }
2958 }
2959 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2960 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2961 {
2962 re_node_set_free (&next_nodes);
2963 return err;
2964 }
2965 mctx->state_log[str_idx] = cur_state;
2966 }
2967
2968 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2969 {
2970 re_node_set_empty (&next_nodes);
2971 if (mctx->state_log[str_idx + 1])
2972 {
2973 err = re_node_set_merge (&next_nodes,
2974 &mctx->state_log[str_idx + 1]->nodes);
2975 if (BE (err != REG_NOERROR, 0))
2976 {
2977 re_node_set_free (&next_nodes);
2978 return err;
2979 }
2980 }
2981 if (cur_state)
2982 {
2983 err = check_arrival_add_next_nodes (mctx, str_idx,
2984 &cur_state->non_eps_nodes,
2985 &next_nodes);
2986 if (BE (err != REG_NOERROR, 0))
2987 {
2988 re_node_set_free (&next_nodes);
2989 return err;
2990 }
2991 }
2992 ++str_idx;
2993 if (next_nodes.nelem)
2994 {
2995 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2996 if (BE (err != REG_NOERROR, 0))
2997 {
2998 re_node_set_free (&next_nodes);
2999 return err;
3000 }
3001 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3002 subexp_num, type);
3003 if (BE (err != REG_NOERROR, 0))
3004 {
3005 re_node_set_free (&next_nodes);
3006 return err;
3007 }
3008 }
3009 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3010 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3011 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3012 {
3013 re_node_set_free (&next_nodes);
3014 return err;
3015 }
3016 mctx->state_log[str_idx] = cur_state;
3017 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3018 }
3019 re_node_set_free (&next_nodes);
3020 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3021 : &mctx->state_log[last_str]->nodes);
3022 path->next_idx = str_idx;
3023
3024 /* Fix MCTX. */
3025 mctx->state_log = backup_state_log;
3026 mctx->input.cur_idx = backup_cur_idx;
3027
3028 /* Then check the current node set has the node LAST_NODE. */
3029 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3030 return REG_NOERROR;
3031
3032 return REG_NOMATCH;
3033}
3034
3035/* Helper functions for check_arrival. */
3036
3037/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3038 to NEXT_NODES.
3039 TODO: This function is similar to the functions transit_state*(),
3040 however this function has many additional works.
3041 Can't we unify them? */
3042
3043static reg_errcode_t
3044internal_function
3045check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3046 re_node_set *cur_nodes, re_node_set *next_nodes)
3047{
3048 const re_dfa_t *const dfa = mctx->dfa;
3049 int result;
3050 int cur_idx;
3051#ifdef RE_ENABLE_I18N
3052 reg_errcode_t err = REG_NOERROR;
3053#endif
3054 re_node_set union_set;
3055 re_node_set_init_empty (&union_set);
3056 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3057 {
3058 int naccepted = 0;
3059 int cur_node = cur_nodes->elems[cur_idx];
3060#ifdef DEBUG
3061 re_token_type_t type = dfa->nodes[cur_node].type;
3062 assert (!IS_EPSILON_NODE (type));
3063#endif
3064#ifdef RE_ENABLE_I18N
3065 /* If the node may accept `multi byte'. */
3066 if (dfa->nodes[cur_node].accept_mb)
3067 {
3068 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3069 str_idx);
3070 if (naccepted > 1)
3071 {
3072 re_dfastate_t *dest_state;
3073 int next_node = dfa->nexts[cur_node];
3074 int next_idx = str_idx + naccepted;
3075 dest_state = mctx->state_log[next_idx];
3076 re_node_set_empty (&union_set);
3077 if (dest_state)
3078 {
3079 err = re_node_set_merge (&union_set, &dest_state->nodes);
3080 if (BE (err != REG_NOERROR, 0))
3081 {
3082 re_node_set_free (&union_set);
3083 return err;
3084 }
3085 }
3086 result = re_node_set_insert (&union_set, next_node);
3087 if (BE (result < 0, 0))
3088 {
3089 re_node_set_free (&union_set);
3090 return REG_ESPACE;
3091 }
3092 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3093 &union_set);
3094 if (BE (mctx->state_log[next_idx] == NULL
3095 && err != REG_NOERROR, 0))
3096 {
3097 re_node_set_free (&union_set);
3098 return err;
3099 }
3100 }
3101 }
3102#endif /* RE_ENABLE_I18N */
3103 if (naccepted
3104 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3105 {
3106 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3107 if (BE (result < 0, 0))
3108 {
3109 re_node_set_free (&union_set);
3110 return REG_ESPACE;
3111 }
3112 }
3113 }
3114 re_node_set_free (&union_set);
3115 return REG_NOERROR;
3116}
3117
3118/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3119 CUR_NODES, however exclude the nodes which are:
3120 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3121 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3122*/
3123
3124static reg_errcode_t
3125internal_function
3126check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3127 int ex_subexp, int type)
3128{
3129 reg_errcode_t err;
3130 int idx, outside_node;
3131 re_node_set new_nodes;
3132#ifdef DEBUG
3133 assert (cur_nodes->nelem);
3134#endif
3135 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3136 if (BE (err != REG_NOERROR, 0))
3137 return err;
3138 /* Create a new node set NEW_NODES with the nodes which are epsilon
3139 closures of the node in CUR_NODES. */
3140
3141 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3142 {
3143 int cur_node = cur_nodes->elems[idx];
3144 const re_node_set *eclosure = dfa->eclosures + cur_node;
3145 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3146 if (outside_node == -1)
3147 {
3148 /* There are no problematic nodes, just merge them. */
3149 err = re_node_set_merge (&new_nodes, eclosure);
3150 if (BE (err != REG_NOERROR, 0))
3151 {
3152 re_node_set_free (&new_nodes);
3153 return err;
3154 }
3155 }
3156 else
3157 {
3158 /* There are problematic nodes, re-calculate incrementally. */
3159 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3160 ex_subexp, type);
3161 if (BE (err != REG_NOERROR, 0))
3162 {
3163 re_node_set_free (&new_nodes);
3164 return err;
3165 }
3166 }
3167 }
3168 re_node_set_free (cur_nodes);
3169 *cur_nodes = new_nodes;
3170 return REG_NOERROR;
3171}
3172
3173/* Helper function for check_arrival_expand_ecl.
3174 Check incrementally the epsilon closure of TARGET, and if it isn't
3175 problematic append it to DST_NODES. */
3176
3177static reg_errcode_t
3178internal_function
3179check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3180 int target, int ex_subexp, int type)
3181{
3182 int cur_node;
3183 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3184 {
3185 int err;
3186
3187 if (dfa->nodes[cur_node].type == type
3188 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3189 {
3190 if (type == OP_CLOSE_SUBEXP)
3191 {
3192 err = re_node_set_insert (dst_nodes, cur_node);
3193 if (BE (err == -1, 0))
3194 return REG_ESPACE;
3195 }
3196 break;
3197 }
3198 err = re_node_set_insert (dst_nodes, cur_node);
3199 if (BE (err == -1, 0))
3200 return REG_ESPACE;
3201 if (dfa->edests[cur_node].nelem == 0)
3202 break;
3203 if (dfa->edests[cur_node].nelem == 2)
3204 {
3205 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3206 dfa->edests[cur_node].elems[1],
3207 ex_subexp, type);
3208 if (BE (err != REG_NOERROR, 0))
3209 return err;
3210 }
3211 cur_node = dfa->edests[cur_node].elems[0];
3212 }
3213 return REG_NOERROR;
3214}
3215
3216
3217/* For all the back references in the current state, calculate the
3218 destination of the back references by the appropriate entry
3219 in MCTX->BKREF_ENTS. */
3220
3221static reg_errcode_t
3222internal_function
3223expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3224 int cur_str, int subexp_num, int type)
3225{
3226 const re_dfa_t *const dfa = mctx->dfa;
3227 reg_errcode_t err;
3228 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3229 struct re_backref_cache_entry *ent;
3230
3231 if (cache_idx_start == -1)
3232 return REG_NOERROR;
3233
3234 restart:
3235 ent = mctx->bkref_ents + cache_idx_start;
3236 do
3237 {
3238 int to_idx, next_node;
3239
3240 /* Is this entry ENT is appropriate? */
3241 if (!re_node_set_contains (cur_nodes, ent->node))
3242 continue; /* No. */
3243
3244 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3245 /* Calculate the destination of the back reference, and append it
3246 to MCTX->STATE_LOG. */
3247 if (to_idx == cur_str)
3248 {
3249 /* The backreference did epsilon transit, we must re-check all the
3250 node in the current state. */
3251 re_node_set new_dests;
3252 reg_errcode_t err2, err3;
3253 next_node = dfa->edests[ent->node].elems[0];
3254 if (re_node_set_contains (cur_nodes, next_node))
3255 continue;
3256 err = re_node_set_init_1 (&new_dests, next_node);
3257 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3258 err3 = re_node_set_merge (cur_nodes, &new_dests);
3259 re_node_set_free (&new_dests);
3260 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3261 || err3 != REG_NOERROR, 0))
3262 {
3263 err = (err != REG_NOERROR ? err
3264 : (err2 != REG_NOERROR ? err2 : err3));
3265 return err;
3266 }
3267 /* TODO: It is still inefficient... */
3268 goto restart;
3269 }
3270 else
3271 {
3272 re_node_set union_set;
3273 next_node = dfa->nexts[ent->node];
3274 if (mctx->state_log[to_idx])
3275 {
3276 int ret;
3277 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3278 next_node))
3279 continue;
3280 err = re_node_set_init_copy (&union_set,
3281 &mctx->state_log[to_idx]->nodes);
3282 ret = re_node_set_insert (&union_set, next_node);
3283 if (BE (err != REG_NOERROR || ret < 0, 0))
3284 {
3285 re_node_set_free (&union_set);
3286 err = err != REG_NOERROR ? err : REG_ESPACE;
3287 return err;
3288 }
3289 }
3290 else
3291 {
3292 err = re_node_set_init_1 (&union_set, next_node);
3293 if (BE (err != REG_NOERROR, 0))
3294 return err;
3295 }
3296 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3297 re_node_set_free (&union_set);
3298 if (BE (mctx->state_log[to_idx] == NULL
3299 && err != REG_NOERROR, 0))
3300 return err;
3301 }
3302 }
3303 while (ent++->more);
3304 return REG_NOERROR;
3305}
3306
3307/* Build transition table for the state.
3308 Return 1 if succeeded, otherwise return NULL. */
3309
3310static int
3311internal_function
3312build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3313{
3314 reg_errcode_t err;
3315 int i, j, ch, need_word_trtable = 0;
3316 bitset_word_t elem, mask;
3317 bool dests_node_malloced = false;
3318 bool dest_states_malloced = false;
3319 int ndests; /* Number of the destination states from `state'. */
3320 re_dfastate_t **trtable;
3321 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3322 re_node_set follows, *dests_node;
3323 bitset_t *dests_ch;
3324 bitset_t acceptable;
3325
3326 struct dests_alloc
3327 {
3328 re_node_set dests_node[SBC_MAX];
3329 bitset_t dests_ch[SBC_MAX];
3330 } *dests_alloc;
3331
3332 /* We build DFA states which corresponds to the destination nodes
3333 from `state'. `dests_node[i]' represents the nodes which i-th
3334 destination state contains, and `dests_ch[i]' represents the
3335 characters which i-th destination state accepts. */
3336#ifdef HAVE_ALLOCA
3337 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3338 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3339 else
3340#endif
3341 {
3342 dests_alloc = re_malloc (struct dests_alloc, 1);
3343 if (BE (dests_alloc == NULL, 0))
3344 return 0;
3345 dests_node_malloced = true;
3346 }
3347 dests_node = dests_alloc->dests_node;
3348 dests_ch = dests_alloc->dests_ch;
3349
3350 /* Initialize transiton table. */
3351 state->word_trtable = state->trtable = NULL;
3352
3353 /* At first, group all nodes belonging to `state' into several
3354 destinations. */
3355 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3356 if (BE (ndests <= 0, 0))
3357 {
3358 if (dests_node_malloced)
3359 free (dests_alloc);
3360 /* Return 0 in case of an error, 1 otherwise. */
3361 if (ndests == 0)
3362 {
3363 state->trtable = (re_dfastate_t **)
3364 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365 return 1;
3366 }
3367 return 0;
3368 }
3369
3370 err = re_node_set_alloc (&follows, ndests + 1);
3371 if (BE (err != REG_NOERROR, 0))
3372 goto out_free;
3373
3374 /* Avoid arithmetic overflow in size calculation. */
3375 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3376 / (3 * sizeof (re_dfastate_t *)))
3377 < ndests),
3378 0))
3379 goto out_free;
3380
3381#ifdef HAVE_ALLOCA
3382 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3383 + ndests * 3 * sizeof (re_dfastate_t *)))
3384 dest_states = (re_dfastate_t **)
3385 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3386 else
3387#endif
3388 {
3389 dest_states = (re_dfastate_t **)
3390 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3391 if (BE (dest_states == NULL, 0))
3392 {
3393out_free:
3394 if (dest_states_malloced)
3395 free (dest_states);
3396 re_node_set_free (&follows);
3397 for (i = 0; i < ndests; ++i)
3398 re_node_set_free (dests_node + i);
3399 if (dests_node_malloced)
3400 free (dests_alloc);
3401 return 0;
3402 }
3403 dest_states_malloced = true;
3404 }
3405 dest_states_word = dest_states + ndests;
3406 dest_states_nl = dest_states_word + ndests;
3407 bitset_empty (acceptable);
3408
3409 /* Then build the states for all destinations. */
3410 for (i = 0; i < ndests; ++i)
3411 {
3412 int next_node;
3413 re_node_set_empty (&follows);
3414 /* Merge the follows of this destination states. */
3415 for (j = 0; j < dests_node[i].nelem; ++j)
3416 {
3417 next_node = dfa->nexts[dests_node[i].elems[j]];
3418 if (next_node != -1)
3419 {
3420 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3421 if (BE (err != REG_NOERROR, 0))
3422 goto out_free;
3423 }
3424 }
3425 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3426 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3427 goto out_free;
3428 /* If the new state has context constraint,
3429 build appropriate states for these contexts. */
3430 if (dest_states[i]->has_constraint)
3431 {
3432 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3433 CONTEXT_WORD);
3434 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3435 goto out_free;
3436
3437 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3438 need_word_trtable = 1;
3439
3440 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3441 CONTEXT_NEWLINE);
3442 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3443 goto out_free;
3444 }
3445 else
3446 {
3447 dest_states_word[i] = dest_states[i];
3448 dest_states_nl[i] = dest_states[i];
3449 }
3450 bitset_merge (acceptable, dests_ch[i]);
3451 }
3452
3453 if (!BE (need_word_trtable, 0))
3454 {
3455 /* We don't care about whether the following character is a word
3456 character, or we are in a single-byte character set so we can
3457 discern by looking at the character code: allocate a
3458 256-entry transition table. */
3459 trtable = state->trtable =
3460 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3461 if (BE (trtable == NULL, 0))
3462 goto out_free;
3463
3464 /* For all characters ch...: */
3465 for (i = 0; i < BITSET_WORDS; ++i)
3466 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3467 elem;
3468 mask <<= 1, elem >>= 1, ++ch)
3469 if (BE (elem & 1, 0))
3470 {
3471 /* There must be exactly one destination which accepts
3472 character ch. See group_nodes_into_DFAstates. */
3473 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3474 ;
3475
3476 /* j-th destination accepts the word character ch. */
3477 if (dfa->word_char[i] & mask)
3478 trtable[ch] = dest_states_word[j];
3479 else
3480 trtable[ch] = dest_states[j];
3481 }
3482 }
3483 else
3484 {
3485 /* We care about whether the following character is a word
3486 character, and we are in a multi-byte character set: discern
3487 by looking at the character code: build two 256-entry
3488 transition tables, one starting at trtable[0] and one
3489 starting at trtable[SBC_MAX]. */
3490 trtable = state->word_trtable =
3491 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3492 if (BE (trtable == NULL, 0))
3493 goto out_free;
3494
3495 /* For all characters ch...: */
3496 for (i = 0; i < BITSET_WORDS; ++i)
3497 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3498 elem;
3499 mask <<= 1, elem >>= 1, ++ch)
3500 if (BE (elem & 1, 0))
3501 {
3502 /* There must be exactly one destination which accepts
3503 character ch. See group_nodes_into_DFAstates. */
3504 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3505 ;
3506
3507 /* j-th destination accepts the word character ch. */
3508 trtable[ch] = dest_states[j];
3509 trtable[ch + SBC_MAX] = dest_states_word[j];
3510 }
3511 }
3512
3513 /* new line */
3514 if (bitset_contain (acceptable, NEWLINE_CHAR))
3515 {
3516 /* The current state accepts newline character. */
3517 for (j = 0; j < ndests; ++j)
3518 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3519 {
3520 /* k-th destination accepts newline character. */
3521 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3522 if (need_word_trtable)
3523 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3524 /* There must be only one destination which accepts
3525 newline. See group_nodes_into_DFAstates. */
3526 break;
3527 }
3528 }
3529
3530 if (dest_states_malloced)
3531 free (dest_states);
3532
3533 re_node_set_free (&follows);
3534 for (i = 0; i < ndests; ++i)
3535 re_node_set_free (dests_node + i);
3536
3537 if (dests_node_malloced)
3538 free (dests_alloc);
3539
3540 return 1;
3541}
3542
3543/* Group all nodes belonging to STATE into several destinations.
3544 Then for all destinations, set the nodes belonging to the destination
3545 to DESTS_NODE[i] and set the characters accepted by the destination
3546 to DEST_CH[i]. This function return the number of destinations. */
3547
3548static int
3549internal_function
3550group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3551 re_node_set *dests_node, bitset_t *dests_ch)
3552{
3553 reg_errcode_t err;
3554 int result;
3555 int i, j, k;
3556 int ndests; /* Number of the destinations from `state'. */
3557 bitset_t accepts; /* Characters a node can accept. */
3558 const re_node_set *cur_nodes = &state->nodes;
3559 bitset_empty (accepts);
3560 ndests = 0;
3561
3562 /* For all the nodes belonging to `state', */
3563 for (i = 0; i < cur_nodes->nelem; ++i)
3564 {
3565 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3566 re_token_type_t type = node->type;
3567 unsigned int constraint = node->constraint;
3568
3569 /* Enumerate all single byte character this node can accept. */
3570 if (type == CHARACTER)
3571 bitset_set (accepts, node->opr.c);
3572 else if (type == SIMPLE_BRACKET)
3573 {
3574 bitset_merge (accepts, node->opr.sbcset);
3575 }
3576 else if (type == OP_PERIOD)
3577 {
3578#ifdef RE_ENABLE_I18N
3579 if (dfa->mb_cur_max > 1)
3580 bitset_merge (accepts, dfa->sb_char);
3581 else
3582#endif
3583 bitset_set_all (accepts);
3584 if (!(dfa->syntax & RE_DOT_NEWLINE))
3585 bitset_clear (accepts, '\n');
3586 if (dfa->syntax & RE_DOT_NOT_NULL)
3587 bitset_clear (accepts, '\0');
3588 }
3589#ifdef RE_ENABLE_I18N
3590 else if (type == OP_UTF8_PERIOD)
3591 {
3592 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3593 if (!(dfa->syntax & RE_DOT_NEWLINE))
3594 bitset_clear (accepts, '\n');
3595 if (dfa->syntax & RE_DOT_NOT_NULL)
3596 bitset_clear (accepts, '\0');
3597 }
3598#endif
3599 else
3600 continue;
3601
3602 /* Check the `accepts' and sift the characters which are not
3603 match it the context. */
3604 if (constraint)
3605 {
3606 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3607 {
3608 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3609 bitset_empty (accepts);
3610 if (accepts_newline)
3611 bitset_set (accepts, NEWLINE_CHAR);
3612 else
3613 continue;
3614 }
3615 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3616 {
3617 bitset_empty (accepts);
3618 continue;
3619 }
3620
3621 if (constraint & NEXT_WORD_CONSTRAINT)
3622 {
3623 bitset_word_t any_set = 0;
3624 if (type == CHARACTER && !node->word_char)
3625 {
3626 bitset_empty (accepts);
3627 continue;
3628 }
3629#ifdef RE_ENABLE_I18N
3630 if (dfa->mb_cur_max > 1)
3631 for (j = 0; j < BITSET_WORDS; ++j)
3632 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3633 else
3634#endif
3635 for (j = 0; j < BITSET_WORDS; ++j)
3636 any_set |= (accepts[j] &= dfa->word_char[j]);
3637 if (!any_set)
3638 continue;
3639 }
3640 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3641 {
3642 bitset_word_t any_set = 0;
3643 if (type == CHARACTER && node->word_char)
3644 {
3645 bitset_empty (accepts);
3646 continue;
3647 }
3648#ifdef RE_ENABLE_I18N
3649 if (dfa->mb_cur_max > 1)
3650 for (j = 0; j < BITSET_WORDS; ++j)
3651 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3652 else
3653#endif
3654 for (j = 0; j < BITSET_WORDS; ++j)
3655 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3656 if (!any_set)
3657 continue;
3658 }
3659 }
3660
3661 /* Then divide `accepts' into DFA states, or create a new
3662 state. Above, we make sure that accepts is not empty. */
3663 for (j = 0; j < ndests; ++j)
3664 {
3665 bitset_t intersec; /* Intersection sets, see below. */
3666 bitset_t remains;
3667 /* Flags, see below. */
3668 bitset_word_t has_intersec, not_subset, not_consumed;
3669
3670 /* Optimization, skip if this state doesn't accept the character. */
3671 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3672 continue;
3673
3674 /* Enumerate the intersection set of this state and `accepts'. */
3675 has_intersec = 0;
3676 for (k = 0; k < BITSET_WORDS; ++k)
3677 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3678 /* And skip if the intersection set is empty. */
3679 if (!has_intersec)
3680 continue;
3681
3682 /* Then check if this state is a subset of `accepts'. */
3683 not_subset = not_consumed = 0;
3684 for (k = 0; k < BITSET_WORDS; ++k)
3685 {
3686 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3687 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3688 }
3689
3690 /* If this state isn't a subset of `accepts', create a
3691 new group state, which has the `remains'. */
3692 if (not_subset)
3693 {
3694 bitset_copy (dests_ch[ndests], remains);
3695 bitset_copy (dests_ch[j], intersec);
3696 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3697 if (BE (err != REG_NOERROR, 0))
3698 goto error_return;
3699 ++ndests;
3700 }
3701
3702 /* Put the position in the current group. */
3703 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3704 if (BE (result < 0, 0))
3705 goto error_return;
3706
3707 /* If all characters are consumed, go to next node. */
3708 if (!not_consumed)
3709 break;
3710 }
3711 /* Some characters remain, create a new group. */
3712 if (j == ndests)
3713 {
3714 bitset_copy (dests_ch[ndests], accepts);
3715 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3716 if (BE (err != REG_NOERROR, 0))
3717 goto error_return;
3718 ++ndests;
3719 bitset_empty (accepts);
3720 }
3721 }
3722 return ndests;
3723 error_return:
3724 for (j = 0; j < ndests; ++j)
3725 re_node_set_free (dests_node + j);
3726 return -1;
3727}
3728
3729#ifdef RE_ENABLE_I18N
3730/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3731 Return the number of the bytes the node accepts.
3732 STR_IDX is the current index of the input string.
3733
3734 This function handles the nodes which can accept one character, or
3735 one collating element like '.', '[a-z]', opposite to the other nodes
3736 can only accept one byte. */
3737
3738static int
3739internal_function
3740check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3741 const re_string_t *input, int str_idx)
3742{
3743 const re_token_t *node = dfa->nodes + node_idx;
3744 int char_len, elem_len;
3745 int i;
3746 wint_t wc;
3747
3748 if (BE (node->type == OP_UTF8_PERIOD, 0))
3749 {
3750 unsigned char c = re_string_byte_at (input, str_idx), d;
3751 if (BE (c < 0xc2, 1))
3752 return 0;
3753
3754 if (str_idx + 2 > input->len)
3755 return 0;
3756
3757 d = re_string_byte_at (input, str_idx + 1);
3758 if (c < 0xe0)
3759 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3760 else if (c < 0xf0)
3761 {
3762 char_len = 3;
3763 if (c == 0xe0 && d < 0xa0)
3764 return 0;
3765 }
3766 else if (c < 0xf8)
3767 {
3768 char_len = 4;
3769 if (c == 0xf0 && d < 0x90)
3770 return 0;
3771 }
3772 else if (c < 0xfc)
3773 {
3774 char_len = 5;
3775 if (c == 0xf8 && d < 0x88)
3776 return 0;
3777 }
3778 else if (c < 0xfe)
3779 {
3780 char_len = 6;
3781 if (c == 0xfc && d < 0x84)
3782 return 0;
3783 }
3784 else
3785 return 0;
3786
3787 if (str_idx + char_len > input->len)
3788 return 0;
3789
3790 for (i = 1; i < char_len; ++i)
3791 {
3792 d = re_string_byte_at (input, str_idx + i);
3793 if (d < 0x80 || d > 0xbf)
3794 return 0;
3795 }
3796 return char_len;
3797 }
3798
3799 char_len = re_string_char_size_at (input, str_idx);
3800 if (node->type == OP_PERIOD)
3801 {
3802 if (char_len <= 1)
3803 return 0;
3804 /* FIXME: I don't think this if is needed, as both '\n'
3805 and '\0' are char_len == 1. */
3806 /* '.' accepts any one character except the following two cases. */
3807 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3808 re_string_byte_at (input, str_idx) == '\n') ||
3809 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3810 re_string_byte_at (input, str_idx) == '\0'))
3811 return 0;
3812 return char_len;
3813 }
3814
3815 elem_len = re_string_elem_size_at (input, str_idx);
3816 wc = __btowc(*(input->mbs+str_idx));
3817 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3818 return 0;
3819
3820 if (node->type == COMPLEX_BRACKET)
3821 {
3822 const re_charset_t *cset = node->opr.mbcset;
3823# ifdef _LIBC
3824 const unsigned char *pin
3825 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3826 int j;
3827 uint32_t nrules;
3828# endif /* _LIBC */
3829 int match_len = 0;
3830 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3831 ? re_string_wchar_at (input, str_idx) : 0);
3832
3833 /* match with multibyte character? */
3834 for (i = 0; i < cset->nmbchars; ++i)
3835 if (wc == cset->mbchars[i])
3836 {
3837 match_len = char_len;
3838 goto check_node_accept_bytes_match;
3839 }
3840 /* match with character_class? */
3841 for (i = 0; i < cset->nchar_classes; ++i)
3842 {
3843 wctype_t wt = cset->char_classes[i];
3844 if (__iswctype (wc, wt))
3845 {
3846 match_len = char_len;
3847 goto check_node_accept_bytes_match;
3848 }
3849 }
3850
3851# ifdef _LIBC
3852 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3853 if (nrules != 0)
3854 {
3855 unsigned int in_collseq = 0;
3856 const int32_t *table, *indirect;
3857 const unsigned char *weights, *extra;
3858 const char *collseqwc;
3859 /* This #include defines a local function! */
3860# include <locale/weight.h>
3861
3862 /* match with collating_symbol? */
3863 if (cset->ncoll_syms)
3864 extra = (const unsigned char *)
3865 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3866 for (i = 0; i < cset->ncoll_syms; ++i)
3867 {
3868 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3869 /* Compare the length of input collating element and
3870 the length of current collating element. */
3871 if (*coll_sym != elem_len)
3872 continue;
3873 /* Compare each bytes. */
3874 for (j = 0; j < *coll_sym; j++)
3875 if (pin[j] != coll_sym[1 + j])
3876 break;
3877 if (j == *coll_sym)
3878 {
3879 /* Match if every bytes is equal. */
3880 match_len = j;
3881 goto check_node_accept_bytes_match;
3882 }
3883 }
3884
3885 if (cset->nranges)
3886 {
3887 if (elem_len <= char_len)
3888 {
3889 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3890 in_collseq = __collseq_table_lookup (collseqwc, wc);
3891 }
3892 else
3893 in_collseq = find_collation_sequence_value (pin, elem_len);
3894 }
3895 /* match with range expression? */
3896 for (i = 0; i < cset->nranges; ++i)
3897 if (cset->range_starts[i] <= in_collseq
3898 && in_collseq <= cset->range_ends[i])
3899 {
3900 match_len = elem_len;
3901 goto check_node_accept_bytes_match;
3902 }
3903
3904 /* match with equivalence_class? */
3905 if (cset->nequiv_classes)
3906 {
3907 const unsigned char *cp = pin;
3908 table = (const int32_t *)
3909 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3910 weights = (const unsigned char *)
3911 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3912 extra = (const unsigned char *)
3913 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3914 indirect = (const int32_t *)
3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3916 int32_t idx = findidx (&cp);
3917 if (idx > 0)
3918 for (i = 0; i < cset->nequiv_classes; ++i)
3919 {
3920 int32_t equiv_class_idx = cset->equiv_classes[i];
3921 size_t weight_len = weights[idx & 0xffffff];
3922 if (weight_len == weights[equiv_class_idx & 0xffffff]
3923 && (idx >> 24) == (equiv_class_idx >> 24))
3924 {
3925 int cnt = 0;
3926
3927 idx &= 0xffffff;
3928 equiv_class_idx &= 0xffffff;
3929
3930 while (cnt <= weight_len
3931 && (weights[equiv_class_idx + 1 + cnt]
3932 == weights[idx + 1 + cnt]))
3933 ++cnt;
3934 if (cnt > weight_len)
3935 {
3936 match_len = elem_len;
3937 goto check_node_accept_bytes_match;
3938 }
3939 }
3940 }
3941 }
3942 }
3943 else
3944# endif /* _LIBC */
3945 {
3946 /* match with range expression? */
3947#if __GNUC__ >= 2
3948 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3949#else
3950 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3951 cmp_buf[2] = wc;
3952#endif
3953 for (i = 0; i < cset->nranges; ++i)
3954 {
3955 cmp_buf[0] = cset->range_starts[i];
3956 cmp_buf[4] = cset->range_ends[i];
3957 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3958 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3959 {
3960 match_len = char_len;
3961 goto check_node_accept_bytes_match;
3962 }
3963 }
3964 }
3965 check_node_accept_bytes_match:
3966 if (!cset->non_match)
3967 return match_len;
3968 else
3969 {
3970 if (match_len > 0)
3971 return 0;
3972 else
3973 return (elem_len > char_len) ? elem_len : char_len;
3974 }
3975 }
3976 return 0;
3977}
3978
3979# ifdef _LIBC
3980static unsigned int
3981internal_function
3982find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3983{
3984 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3985 if (nrules == 0)
3986 {
3987 if (mbs_len == 1)
3988 {
3989 /* No valid character. Match it as a single byte character. */
3990 const unsigned char *collseq = (const unsigned char *)
3991 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3992 return collseq[mbs[0]];
3993 }
3994 return UINT_MAX;
3995 }
3996 else
3997 {
3998 int32_t idx;
3999 const unsigned char *extra = (const unsigned char *)
4000 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4001 int32_t extrasize = (const unsigned char *)
4002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4003
4004 for (idx = 0; idx < extrasize;)
4005 {
4006 int mbs_cnt, found = 0;
4007 int32_t elem_mbs_len;
4008 /* Skip the name of collating element name. */
4009 idx = idx + extra[idx] + 1;
4010 elem_mbs_len = extra[idx++];
4011 if (mbs_len == elem_mbs_len)
4012 {
4013 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4014 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4015 break;
4016 if (mbs_cnt == elem_mbs_len)
4017 /* Found the entry. */
4018 found = 1;
4019 }
4020 /* Skip the byte sequence of the collating element. */
4021 idx += elem_mbs_len;
4022 /* Adjust for the alignment. */
4023 idx = (idx + 3) & ~3;
4024 /* Skip the collation sequence value. */
4025 idx += sizeof (uint32_t);
4026 /* Skip the wide char sequence of the collating element. */
4027 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4028 /* If we found the entry, return the sequence value. */
4029 if (found)
4030 return *(uint32_t *) (extra + idx);
4031 /* Skip the collation sequence value. */
4032 idx += sizeof (uint32_t);
4033 }
4034 return UINT_MAX;
4035 }
4036}
4037# endif /* _LIBC */
4038#endif /* RE_ENABLE_I18N */
4039
4040/* Check whether the node accepts the byte which is IDX-th
4041 byte of the INPUT. */
4042
4043static int
4044internal_function
4045check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4046 int idx)
4047{
4048 unsigned char ch;
4049 ch = re_string_byte_at (&mctx->input, idx);
4050 switch (node->type)
4051 {
4052 case CHARACTER:
4053 if (node->opr.c != ch)
4054 return 0;
4055 break;
4056
4057 case SIMPLE_BRACKET:
4058 if (!bitset_contain (node->opr.sbcset, ch))
4059 return 0;
4060 break;
4061
4062#ifdef RE_ENABLE_I18N
4063 case OP_UTF8_PERIOD:
4064 if (ch >= 0x80)
4065 return 0;
4066 /* FALLTHROUGH */
4067#endif
4068 case OP_PERIOD:
4069 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4070 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4071 return 0;
4072 break;
4073
4074 default:
4075 return 0;
4076 }
4077
4078 if (node->constraint)
4079 {
4080 /* The node has constraints. Check whether the current context
4081 satisfies the constraints. */
4082 unsigned int context = re_string_context_at (&mctx->input, idx,
4083 mctx->eflags);
4084 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4085 return 0;
4086 }
4087
4088 return 1;
4089}
4090
4091/* Extend the buffers, if the buffers have run out. */
4092
4093static reg_errcode_t
4094internal_function
4095extend_buffers (re_match_context_t *mctx)
4096{
4097 reg_errcode_t ret;
4098 re_string_t *pstr = &mctx->input;
4099
4100 /* Avoid overflow. */
4101 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4102 return REG_ESPACE;
4103
4104 /* Double the lengths of the buffers. */
4105 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4106 if (BE (ret != REG_NOERROR, 0))
4107 return ret;
4108
4109 if (mctx->state_log != NULL)
4110 {
4111 /* And double the length of state_log. */
4112 /* XXX We have no indication of the size of this buffer. If this
4113 allocation fail we have no indication that the state_log array
4114 does not have the right size. */
4115 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4116 pstr->bufs_len + 1);
4117 if (BE (new_array == NULL, 0))
4118 return REG_ESPACE;
4119 mctx->state_log = new_array;
4120 }
4121
4122 /* Then reconstruct the buffers. */
4123 if (pstr->icase)
4124 {
4125#ifdef RE_ENABLE_I18N
4126 if (pstr->mb_cur_max > 1)
4127 {
4128 ret = build_wcs_upper_buffer (pstr);
4129 if (BE (ret != REG_NOERROR, 0))
4130 return ret;
4131 }
4132 else
4133#endif /* RE_ENABLE_I18N */
4134 build_upper_buffer (pstr);
4135 }
4136 else
4137 {
4138#ifdef RE_ENABLE_I18N
4139 if (pstr->mb_cur_max > 1)
4140 build_wcs_buffer (pstr);
4141 else
4142#endif /* RE_ENABLE_I18N */
4143 {
4144 if (pstr->trans != NULL)
4145 re_string_translate_buffer (pstr);
4146 }
4147 }
4148 return REG_NOERROR;
4149}
4150
4151\f
4152/* Functions for matching context. */
4153
4154/* Initialize MCTX. */
4155
4156static reg_errcode_t
4157internal_function
4158match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4159{
4160 mctx->eflags = eflags;
4161 mctx->match_last = -1;
4162 if (n > 0)
4163 {
4164 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4165 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4166 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4167 return REG_ESPACE;
4168 }
4169 /* Already zero-ed by the caller.
4170 else
4171 mctx->bkref_ents = NULL;
4172 mctx->nbkref_ents = 0;
4173 mctx->nsub_tops = 0; */
4174 mctx->abkref_ents = n;
4175 mctx->max_mb_elem_len = 1;
4176 mctx->asub_tops = n;
4177 return REG_NOERROR;
4178}
4179
4180/* Clean the entries which depend on the current input in MCTX.
4181 This function must be invoked when the matcher changes the start index
4182 of the input, or changes the input string. */
4183
4184static void
4185internal_function
4186match_ctx_clean (re_match_context_t *mctx)
4187{
4188 int st_idx;
4189 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4190 {
4191 int sl_idx;
4192 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4193 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4194 {
4195 re_sub_match_last_t *last = top->lasts[sl_idx];
4196 re_free (last->path.array);
4197 re_free (last);
4198 }
4199 re_free (top->lasts);
4200 if (top->path)
4201 {
4202 re_free (top->path->array);
4203 re_free (top->path);
4204 }
4205 free (top);
4206 }
4207
4208 mctx->nsub_tops = 0;
4209 mctx->nbkref_ents = 0;
4210}
4211
4212/* Free all the memory associated with MCTX. */
4213
4214static void
4215internal_function
4216match_ctx_free (re_match_context_t *mctx)
4217{
4218 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4219 match_ctx_clean (mctx);
4220 re_free (mctx->sub_tops);
4221 re_free (mctx->bkref_ents);
4222}
4223
4224/* Add a new backreference entry to MCTX.
4225 Note that we assume that caller never call this function with duplicate
4226 entry, and call with STR_IDX which isn't smaller than any existing entry.
4227*/
4228
4229static reg_errcode_t
4230internal_function
4231match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4232 int to)
4233{
4234 if (mctx->nbkref_ents >= mctx->abkref_ents)
4235 {
4236 struct re_backref_cache_entry* new_entry;
4237 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4238 mctx->abkref_ents * 2);
4239 if (BE (new_entry == NULL, 0))
4240 {
4241 re_free (mctx->bkref_ents);
4242 return REG_ESPACE;
4243 }
4244 mctx->bkref_ents = new_entry;
4245 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4246 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4247 mctx->abkref_ents *= 2;
4248 }
4249 if (mctx->nbkref_ents > 0
4250 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4251 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4252
4253 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4254 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4255 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4256 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4257
4258 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4259 If bit N is clear, means that this entry won't epsilon-transition to
4260 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4261 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4262 such node.
4263
4264 A backreference does not epsilon-transition unless it is empty, so set
4265 to all zeros if FROM != TO. */
4266 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4267 = (from == to ? ~0 : 0);
4268
4269 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4270 if (mctx->max_mb_elem_len < to - from)
4271 mctx->max_mb_elem_len = to - from;
4272 return REG_NOERROR;
4273}
4274
4275/* Search for the first entry which has the same str_idx, or -1 if none is
4276 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4277
4278static int
4279internal_function
4280search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4281{
4282 int left, right, mid, last;
4283 last = right = mctx->nbkref_ents;
4284 for (left = 0; left < right;)
4285 {
4286 mid = left + (right - left) / 2;
4287 if (mctx->bkref_ents[mid].str_idx < str_idx)
4288 left = mid + 1;
4289 else
4290 right = mid;
4291 }
4292 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4293 return left;
4294 else
4295 return -1;
4296}
4297
4298/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4299 at STR_IDX. */
4300
4301static reg_errcode_t
4302internal_function
4303match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4304{
4305#ifdef DEBUG
4306 assert (mctx->sub_tops != NULL);
4307 assert (mctx->asub_tops > 0);
4308#endif
4309 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4310 {
4311 int new_asub_tops = mctx->asub_tops * 2;
4312 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4313 re_sub_match_top_t *,
4314 new_asub_tops);
4315 if (BE (new_array == NULL, 0))
4316 return REG_ESPACE;
4317 mctx->sub_tops = new_array;
4318 mctx->asub_tops = new_asub_tops;
4319 }
4320 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4321 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4322 return REG_ESPACE;
4323 mctx->sub_tops[mctx->nsub_tops]->node = node;
4324 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4325 return REG_NOERROR;
4326}
4327
4328/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4329 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4330
4331static re_sub_match_last_t *
4332internal_function
4333match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4334{
4335 re_sub_match_last_t *new_entry;
4336 if (BE (subtop->nlasts == subtop->alasts, 0))
4337 {
4338 int new_alasts = 2 * subtop->alasts + 1;
4339 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4340 re_sub_match_last_t *,
4341 new_alasts);
4342 if (BE (new_array == NULL, 0))
4343 return NULL;
4344 subtop->lasts = new_array;
4345 subtop->alasts = new_alasts;
4346 }
4347 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4348 if (BE (new_entry != NULL, 1))
4349 {
4350 subtop->lasts[subtop->nlasts] = new_entry;
4351 new_entry->node = node;
4352 new_entry->str_idx = str_idx;
4353 ++subtop->nlasts;
4354 }
4355 return new_entry;
4356}
4357
4358static void
4359internal_function
4360sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4361 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4362{
4363 sctx->sifted_states = sifted_sts;
4364 sctx->limited_states = limited_sts;
4365 sctx->last_node = last_node;
4366 sctx->last_str_idx = last_str_idx;
4367 re_node_set_init_empty (&sctx->limits);
4368}