1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags);
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len);
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len);
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated);
55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
56static int check_matching (re_match_context_t *mctx, int fl_longest_match,
57 int *p_match_first) internal_function;
58static int check_halt_state_context (const re_match_context_t *mctx,
59 const re_dfastate_t *state, int idx)
60 internal_function;
61static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
62 regmatch_t *prev_idx_match, int cur_node,
63 int cur_idx, int nmatch) internal_function;
64static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
65 int str_idx, int dest_node, int nregs,
66 regmatch_t *regs,
67 re_node_set *eps_via_nodes)
68 internal_function;
69static reg_errcode_t set_regs (const regex_t *preg,
70 const re_match_context_t *mctx,
71 size_t nmatch, regmatch_t *pmatch,
72 int fl_backtrack) internal_function;
73static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
74 internal_function;
75
76#ifdef RE_ENABLE_I18N
77static int sift_states_iter_mb (const re_match_context_t *mctx,
78 re_sift_context_t *sctx,
79 int node_idx, int str_idx, int max_str_idx)
80 internal_function;
81#endif /* RE_ENABLE_I18N */
82static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
83 re_sift_context_t *sctx)
84 internal_function;
85static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
86 re_sift_context_t *sctx, int str_idx,
87 re_node_set *cur_dest)
88 internal_function;
89static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
90 re_sift_context_t *sctx,
91 int str_idx,
92 re_node_set *dest_nodes)
93 internal_function;
94static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
95 re_node_set *dest_nodes,
96 const re_node_set *candidates)
97 internal_function;
98static int check_dst_limits (const re_match_context_t *mctx,
99 re_node_set *limits,
100 int dst_node, int dst_idx, int src_node,
101 int src_idx) internal_function;
102static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
103 int boundaries, int subexp_idx,
104 int from_node, int bkref_idx)
105 internal_function;
106static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
107 int limit, int subexp_idx,
108 int node, int str_idx,
109 int bkref_idx) internal_function;
110static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
111 re_node_set *dest_nodes,
112 const re_node_set *candidates,
113 re_node_set *limits,
114 struct re_backref_cache_entry *bkref_ents,
115 int str_idx) internal_function;
116static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
117 re_sift_context_t *sctx,
118 int str_idx, const re_node_set *candidates)
119 internal_function;
120static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
121 re_dfastate_t **dst,
122 re_dfastate_t **src, int num)
123 internal_function;
124static re_dfastate_t *find_recover_state (reg_errcode_t *err,
125 re_match_context_t *mctx) internal_function;
126static re_dfastate_t *transit_state (reg_errcode_t *err,
127 re_match_context_t *mctx,
128 re_dfastate_t *state) internal_function;
129static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
130 re_match_context_t *mctx,
131 re_dfastate_t *next_state)
132 internal_function;
133static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
134 re_node_set *cur_nodes,
135 int str_idx) internal_function;
136#if 0
137static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
138 re_match_context_t *mctx,
139 re_dfastate_t *pstate)
140 internal_function;
141#endif
142#ifdef RE_ENABLE_I18N
143static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144 re_dfastate_t *pstate)
145 internal_function;
146#endif /* RE_ENABLE_I18N */
147static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
148 const re_node_set *nodes)
149 internal_function;
150static reg_errcode_t get_subexp (re_match_context_t *mctx,
151 int bkref_node, int bkref_str_idx)
152 internal_function;
153static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
154 const re_sub_match_top_t *sub_top,
155 re_sub_match_last_t *sub_last,
156 int bkref_node, int bkref_str)
157 internal_function;
158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes)
168 internal_function;
169static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
170 re_node_set *cur_nodes,
171 int ex_subexp, int type)
172 internal_function;
173static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
174 re_node_set *dst_nodes,
175 int target, int ex_subexp,
176 int type) internal_function;
177static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
178 re_node_set *cur_nodes, int cur_str,
179 int subexp_num, int type)
180 internal_function;
181static int build_trtable (const re_dfa_t *dfa,
182 re_dfastate_t *state) internal_function;
183#ifdef RE_ENABLE_I18N
184static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
185 const re_string_t *input, int idx)
186 internal_function;
187# ifdef _LIBC
188static unsigned int find_collation_sequence_value (const unsigned char *mbs,
189 size_t name_len)
190 internal_function;
191# endif /* _LIBC */
192#endif /* RE_ENABLE_I18N */
193static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
194 const re_dfastate_t *state,
195 re_node_set *states_node,
196 bitset_t *states_ch) internal_function;
197static int check_node_accept (const re_match_context_t *mctx,
198 const re_token_t *node, int idx)
199 internal_function;
200static reg_errcode_t extend_buffers (re_match_context_t *mctx)
201 internal_function;
202\f
203/* Entry point for POSIX code. */
204
205/* regexec searches for a given pattern, specified by PREG, in the
206 string STRING.
207
208 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
209 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
210 least NMATCH elements, and we set them to the offsets of the
211 corresponding matched substrings.
212
213 EFLAGS specifies `execution flags' which affect matching: if
214 REG_NOTBOL is set, then ^ does not match at the beginning of the
215 string; if REG_NOTEOL is set, then $ does not match at the end.
216
217 We return 0 if we find a match and REG_NOMATCH if not. */
218
219int
220regexec (
221 const regex_t *__restrict preg,
222 const char *__restrict string,
223 size_t nmatch,
224 regmatch_t pmatch[],
225 int eflags)
226{
227 reg_errcode_t err;
228 int start, length;
229
230 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
231 return REG_BADPAT;
232
233 if (eflags & REG_STARTEND)
234 {
235 start = pmatch[0].rm_so;
236 length = pmatch[0].rm_eo;
237 }
238 else
239 {
240 start = 0;
241 length = strlen (string);
242 }
243
244 __libc_lock_lock (dfa->lock);
245 if (preg->no_sub)
246 err = re_search_internal (preg, string, length, start, length - start,
247 length, 0, NULL, eflags);
248 else
249 err = re_search_internal (preg, string, length, start, length - start,
250 length, nmatch, pmatch, eflags);
251 __libc_lock_unlock (dfa->lock);
252 return err != REG_NOERROR;
253}
254
255#ifdef _LIBC
256# include <shlib-compat.h>
257versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
258
259# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
260__typeof__ (__regexec) __compat_regexec;
261
262int
263attribute_compat_text_section
264__compat_regexec (const regex_t *__restrict preg,
265 const char *__restrict string, size_t nmatch,
266 regmatch_t pmatch[], int eflags)
267{
268 return regexec (preg, string, nmatch, pmatch,
269 eflags & (REG_NOTBOL | REG_NOTEOL));
270}
271compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
272# endif
273#endif
274
275/* Entry points for GNU code. */
276
277/* re_match, re_search, re_match_2, re_search_2
278
279 The former two functions operate on STRING with length LENGTH,
280 while the later two operate on concatenation of STRING1 and STRING2
281 with lengths LENGTH1 and LENGTH2, respectively.
282
283 re_match() matches the compiled pattern in BUFP against the string,
284 starting at index START.
285
286 re_search() first tries matching at index START, then it tries to match
287 starting from index START + 1, and so on. The last start position tried
288 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
289 way as re_match().)
290
291 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
292 the first STOP characters of the concatenation of the strings should be
293 concerned.
294
295 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
296 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
297 computed relative to the concatenation, not relative to the individual
298 strings.)
299
300 On success, re_match* functions return the length of the match, re_search*
301 return the position of the start of the match. Return value -1 means no
302 match was found and -2 indicates an internal error. */
303
304int
305re_match (struct re_pattern_buffer *bufp,
306 const char *string,
307 int length,
308 int start,
309 struct re_registers *regs)
310{
311 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
312}
313#ifdef _LIBC
314weak_alias (__re_match, re_match)
315#endif
316
317int
318re_search (struct re_pattern_buffer *bufp,
319 const char *string,
320 int length, int start, int range,
321 struct re_registers *regs)
322{
323 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
324}
325#ifdef _LIBC
326weak_alias (__re_search, re_search)
327#endif
328
329int
330re_match_2 (struct re_pattern_buffer *bufp,
331 const char *string1, int length1,
332 const char *string2, int length2, int start,
333 struct re_registers *regs, int stop)
334{
335 return re_search_2_stub (bufp, string1, length1, string2, length2,
336 start, 0, regs, stop, 1);
337}
338#ifdef _LIBC
339weak_alias (__re_match_2, re_match_2)
340#endif
341
342int
343re_search_2 (struct re_pattern_buffer *bufp,
344 const char *string1, int length1,
345 const char *string2, int length2, int start,
346 int range, struct re_registers *regs, int stop)
347{
348 return re_search_2_stub (bufp, string1, length1, string2, length2,
349 start, range, regs, stop, 0);
350}
351#ifdef _LIBC
352weak_alias (__re_search_2, re_search_2)
353#endif
354
355static int
356re_search_2_stub (struct re_pattern_buffer *bufp,
357 const char *string1, int length1,
358 const char *string2, int length2, int start,
359 int range, struct re_registers *regs,
360 int stop, int ret_len)
361{
362 const char *str;
363 int rval;
364 int len = length1 + length2;
365 int free_str = 0;
366
367 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
368 return -2;
369
370 /* Concatenate the strings. */
371 if (length2 > 0)
372 if (length1 > 0)
373 {
374 char *s = re_malloc (char, len);
375
376 if (BE (s == NULL, 0))
377 return -2;
378 memcpy (s, string1, length1);
379 memcpy (s + length1, string2, length2);
380 str = s;
381 free_str = 1;
382 }
383 else
384 str = string2;
385 else
386 str = string1;
387
388 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
389 if (free_str)
390 re_free ((char *) str);
391 return rval;
392}
393
394/* The parameters have the same meaning as those of re_search.
395 Additional parameters:
396 If RET_LEN is nonzero the length of the match is returned (re_match style);
397 otherwise the position of the match is returned. */
398
399static int
400re_search_stub (struct re_pattern_buffer *bufp,
401 const char *string, int length, int start,
402 int range, int stop,
403 struct re_registers *regs, int ret_len)
404{
405 reg_errcode_t result;
406 regmatch_t *pmatch;
407 int nregs, rval;
408 int eflags = 0;
409
410 /* Check for out-of-range. */
411 if (BE (start < 0 || start > length, 0))
412 return -1;
413 if (BE (start + range > length, 0))
414 range = length - start;
415 else if (BE (start + range < 0, 0))
416 range = -start;
417
418 __libc_lock_lock (dfa->lock);
419
420 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
421 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
422
423 /* Compile fastmap if we haven't yet. */
424 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
425 re_compile_fastmap (bufp);
426
427 if (BE (bufp->no_sub, 0))
428 regs = NULL;
429
430 /* We need at least 1 register. */
431 if (regs == NULL)
432 nregs = 1;
433 else if (BE (bufp->regs_allocated == REGS_FIXED &&
434 regs->num_regs < bufp->re_nsub + 1, 0))
435 {
436 nregs = regs->num_regs;
437 if (BE (nregs < 1, 0))
438 {
439 /* Nothing can be copied to regs. */
440 regs = NULL;
441 nregs = 1;
442 }
443 }
444 else
445 nregs = bufp->re_nsub + 1;
446 pmatch = re_malloc (regmatch_t, nregs);
447 if (BE (pmatch == NULL, 0))
448 {
449 rval = -2;
450 goto out;
451 }
452
453 result = re_search_internal (bufp, string, length, start, range, stop,
454 nregs, pmatch, eflags);
455
456 rval = 0;
457
458 /* I hope we needn't fill ther regs with -1's when no match was found. */
459 if (result != REG_NOERROR)
460 rval = -1;
461 else if (regs != NULL)
462 {
463 /* If caller wants register contents data back, copy them. */
464 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
465 bufp->regs_allocated);
466 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
467 rval = -2;
468 }
469
470 if (BE (rval == 0, 1))
471 {
472 if (ret_len)
473 {
474 assert (pmatch[0].rm_so == start);
475 rval = pmatch[0].rm_eo - start;
476 }
477 else
478 rval = pmatch[0].rm_so;
479 }
480 re_free (pmatch);
481 out:
482 __libc_lock_unlock (dfa->lock);
483 return rval;
484}
485
486static unsigned
487re_copy_regs (struct re_registers *regs,
488 regmatch_t *pmatch,
489 int nregs, int regs_allocated)
490{
491 int rval = REGS_REALLOCATE;
492 int i;
493 int need_regs = nregs + 1;
494 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
495 uses. */
496
497 /* Have the register data arrays been allocated? */
498 if (regs_allocated == REGS_UNALLOCATED)
499 { /* No. So allocate them with malloc. */
500 regs->start = re_malloc (regoff_t, need_regs);
501 if (BE (regs->start == NULL, 0))
502 return REGS_UNALLOCATED;
503 regs->end = re_malloc (regoff_t, need_regs);
504 if (BE (regs->end == NULL, 0))
505 {
506 re_free (regs->start);
507 return REGS_UNALLOCATED;
508 }
509 regs->num_regs = need_regs;
510 }
511 else if (regs_allocated == REGS_REALLOCATE)
512 { /* Yes. If we need more elements than were already
513 allocated, reallocate them. If we need fewer, just
514 leave it alone. */
515 if (BE (need_regs > regs->num_regs, 0))
516 {
517 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
518 regoff_t *new_end;
519 if (BE (new_start == NULL, 0))
520 return REGS_UNALLOCATED;
521 new_end = re_realloc (regs->end, regoff_t, need_regs);
522 if (BE (new_end == NULL, 0))
523 {
524 re_free (new_start);
525 return REGS_UNALLOCATED;
526 }
527 regs->start = new_start;
528 regs->end = new_end;
529 regs->num_regs = need_regs;
530 }
531 }
532 else
533 {
534 assert (regs_allocated == REGS_FIXED);
535 /* This function may not be called with REGS_FIXED and nregs too big. */
536 assert (regs->num_regs >= nregs);
537 rval = REGS_FIXED;
538 }
539
540 /* Copy the regs. */
541 for (i = 0; i < nregs; ++i)
542 {
543 regs->start[i] = pmatch[i].rm_so;
544 regs->end[i] = pmatch[i].rm_eo;
545 }
546 for ( ; i < regs->num_regs; ++i)
547 regs->start[i] = regs->end[i] = -1;
548
549 return rval;
550}
551
552/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
553 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
554 this memory for recording register information. STARTS and ENDS
555 must be allocated using the malloc library routine, and must each
556 be at least NUM_REGS * sizeof (regoff_t) bytes long.
557
558 If NUM_REGS == 0, then subsequent matches should allocate their own
559 register data.
560
561 Unless this function is called, the first search or match using
562 PATTERN_BUFFER will allocate its own register data, without
563 freeing the old data. */
564
565void
566re_set_registers (struct re_pattern_buffer *bufp,
567 struct re_registers *regs,
568 unsigned num_regs,
569 regoff_t *starts,
570 regoff_t *ends)
571{
572 if (num_regs)
573 {
574 bufp->regs_allocated = REGS_REALLOCATE;
575 regs->num_regs = num_regs;
576 regs->start = starts;
577 regs->end = ends;
578 }
579 else
580 {
581 bufp->regs_allocated = REGS_UNALLOCATED;
582 regs->num_regs = 0;
583 regs->start = regs->end = (regoff_t *) 0;
584 }
585}
586#ifdef _LIBC
587weak_alias (__re_set_registers, re_set_registers)
588#endif
589\f
590/* Entry points compatible with 4.2 BSD regex library. We don't define
591 them unless specifically requested. */
592
593#if defined _REGEX_RE_COMP || defined _LIBC
594int
595# ifdef _LIBC
596weak_function
597# endif
598re_exec (s)
599 const char *s;
600{
601 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
602}
603#endif /* _REGEX_RE_COMP */
604\f
605/* Internal entry point. */
606
607/* Searches for a compiled pattern PREG in the string STRING, whose
608 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
609 mingings with regexec. START, and RANGE have the same meanings
610 with re_search.
611 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
612 otherwise return the error code.
613 Note: We assume front end functions already check ranges.
614 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
615
616static reg_errcode_t
617re_search_internal (const regex_t *preg,
618 const char *string,
619 int length, int start, int range, int stop,
620 size_t nmatch, regmatch_t pmatch[],
621 int eflags)
622{
623 reg_errcode_t err;
624 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
625 int left_lim, right_lim, incr;
626 int fl_longest_match, match_first, match_kind, match_last = -1;
627 int extra_nmatch;
628 int sb, ch;
629#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
630 re_match_context_t mctx = { .dfa = dfa };
631#else
632 re_match_context_t mctx;
633#endif
634 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
635 && range && !preg->can_be_null) ? preg->fastmap : NULL;
636 RE_TRANSLATE_TYPE t = preg->translate;
637
638#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
639 memset (&mctx, '\0', sizeof (re_match_context_t));
640 mctx.dfa = dfa;
641#endif
642
643 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
644 nmatch -= extra_nmatch;
645
646 /* Check if the DFA haven't been compiled. */
647 if (BE (preg->used == 0 || dfa->init_state == NULL
648 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
649 || dfa->init_state_begbuf == NULL, 0))
650 return REG_NOMATCH;
651
652#ifdef DEBUG
653 /* We assume front-end functions already check them. */
654 assert (start + range >= 0 && start + range <= length);
655#endif
656
657 /* If initial states with non-begbuf contexts have no elements,
658 the regex must be anchored. If preg->newline_anchor is set,
659 we'll never use init_state_nl, so do not check it. */
660 if (dfa->init_state->nodes.nelem == 0
661 && dfa->init_state_word->nodes.nelem == 0
662 && (dfa->init_state_nl->nodes.nelem == 0
663 || !preg->newline_anchor))
664 {
665 if (start != 0 && start + range != 0)
666 return REG_NOMATCH;
667 start = range = 0;
668 }
669
670 /* We must check the longest matching, if nmatch > 0. */
671 fl_longest_match = (nmatch != 0 || dfa->nbackref);
672
673 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
674 preg->translate, preg->syntax & RE_ICASE, dfa);
675 if (BE (err != REG_NOERROR, 0))
676 goto free_return;
677 mctx.input.stop = stop;
678 mctx.input.raw_stop = stop;
679 mctx.input.newline_anchor = preg->newline_anchor;
680
681 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
682 if (BE (err != REG_NOERROR, 0))
683 goto free_return;
684
685 /* We will log all the DFA states through which the dfa pass,
686 if nmatch > 1, or this dfa has "multibyte node", which is a
687 back-reference or a node which can accept multibyte character or
688 multi character collating element. */
689 if (nmatch > 1 || dfa->has_mb_node)
690 {
691 /* Avoid overflow. */
692 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
693 {
694 err = REG_ESPACE;
695 goto free_return;
696 }
697
698 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
699 if (BE (mctx.state_log == NULL, 0))
700 {
701 err = REG_ESPACE;
702 goto free_return;
703 }
704 }
705 else
706 mctx.state_log = NULL;
707
708 match_first = start;
709 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
710 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
711
712 /* Check incrementally whether of not the input string match. */
713 incr = (range < 0) ? -1 : 1;
714 left_lim = (range < 0) ? start + range : start;
715 right_lim = (range < 0) ? start : start + range;
716 sb = dfa->mb_cur_max == 1;
717 match_kind =
718 (fastmap
719 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
720 | (range >= 0 ? 2 : 0)
721 | (t != NULL ? 1 : 0))
722 : 8);
723
724 for (;; match_first += incr)
725 {
726 err = REG_NOMATCH;
727 if (match_first < left_lim || right_lim < match_first)
728 goto free_return;
729
730 /* Advance as rapidly as possible through the string, until we
731 find a plausible place to start matching. This may be done
732 with varying efficiency, so there are various possibilities:
733 only the most common of them are specialized, in order to
734 save on code size. We use a switch statement for speed. */
735 switch (match_kind)
736 {
737 case 8:
738 /* No fastmap. */
739 break;
740
741 case 7:
742 /* Fastmap with single-byte translation, match forward. */
743 while (BE (match_first < right_lim, 1)
744 && !fastmap[t[(unsigned char) string[match_first]]])
745 ++match_first;
746 goto forward_match_found_start_or_reached_end;
747
748 case 6:
749 /* Fastmap without translation, match forward. */
750 while (BE (match_first < right_lim, 1)
751 && !fastmap[(unsigned char) string[match_first]])
752 ++match_first;
753
754 forward_match_found_start_or_reached_end:
755 if (BE (match_first == right_lim, 0))
756 {
757 ch = match_first >= length
758 ? 0 : (unsigned char) string[match_first];
759 if (!fastmap[t ? t[ch] : ch])
760 goto free_return;
761 }
762 break;
763
764 case 4:
765 case 5:
766 /* Fastmap without multi-byte translation, match backwards. */
767 while (match_first >= left_lim)
768 {
769 ch = match_first >= length
770 ? 0 : (unsigned char) string[match_first];
771 if (fastmap[t ? t[ch] : ch])
772 break;
773 --match_first;
774 }
775 if (match_first < left_lim)
776 goto free_return;
777 break;
778
779 default:
780 /* In this case, we can't determine easily the current byte,
781 since it might be a component byte of a multibyte
782 character. Then we use the constructed buffer instead. */
783 for (;;)
784 {
785 /* If MATCH_FIRST is out of the valid range, reconstruct the
786 buffers. */
787 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
788 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
789 {
790 err = re_string_reconstruct (&mctx.input, match_first,
791 eflags);
792 if (BE (err != REG_NOERROR, 0))
793 goto free_return;
794
795 offset = match_first - mctx.input.raw_mbs_idx;
796 }
797 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
798 Note that MATCH_FIRST must not be smaller than 0. */
799 ch = (match_first >= length
800 ? 0 : re_string_byte_at (&mctx.input, offset));
801 if (fastmap[ch])
802 break;
803 match_first += incr;
804 if (match_first < left_lim || match_first > right_lim)
805 {
806 err = REG_NOMATCH;
807 goto free_return;
808 }
809 }
810 break;
811 }
812
813 /* Reconstruct the buffers so that the matcher can assume that
814 the matching starts from the beginning of the buffer. */
815 err = re_string_reconstruct (&mctx.input, match_first, eflags);
816 if (BE (err != REG_NOERROR, 0))
817 goto free_return;
818
819#ifdef RE_ENABLE_I18N
820 /* Don't consider this char as a possible match start if it part,
821 yet isn't the head, of a multibyte character. */
822 if (!sb && !re_string_first_byte (&mctx.input, 0))
823 continue;
824#endif
825
826 /* It seems to be appropriate one, then use the matcher. */
827 /* We assume that the matching starts from 0. */
828 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
829 match_last = check_matching (&mctx, fl_longest_match,
830 range >= 0 ? &match_first : NULL);
831 if (match_last != -1)
832 {
833 if (BE (match_last == -2, 0))
834 {
835 err = REG_ESPACE;
836 goto free_return;
837 }
838 else
839 {
840 mctx.match_last = match_last;
841 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
842 {
843 re_dfastate_t *pstate = mctx.state_log[match_last];
844 mctx.last_node = check_halt_state_context (&mctx, pstate,
845 match_last);
846 }
847 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
848 || dfa->nbackref)
849 {
850 err = prune_impossible_nodes (&mctx);
851 if (err == REG_NOERROR)
852 break;
853 if (BE (err != REG_NOMATCH, 0))
854 goto free_return;
855 match_last = -1;
856 }
857 else
858 break; /* We found a match. */
859 }
860 }
861
862 match_ctx_clean (&mctx);
863 }
864
865#ifdef DEBUG
866 assert (match_last != -1);
867 assert (err == REG_NOERROR);
868#endif
869
870 /* Set pmatch[] if we need. */
871 if (nmatch > 0)
872 {
873 int reg_idx;
874
875 /* Initialize registers. */
876 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
877 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
878
879 /* Set the points where matching start/end. */
880 pmatch[0].rm_so = 0;
881 pmatch[0].rm_eo = mctx.match_last;
882
883 if (!preg->no_sub && nmatch > 1)
884 {
885 err = set_regs (preg, &mctx, nmatch, pmatch,
886 dfa->has_plural_match && dfa->nbackref > 0);
887 if (BE (err != REG_NOERROR, 0))
888 goto free_return;
889 }
890
891 /* At last, add the offset to the each registers, since we slided
892 the buffers so that we could assume that the matching starts
893 from 0. */
894 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
895 if (pmatch[reg_idx].rm_so != -1)
896 {
897#ifdef RE_ENABLE_I18N
898 if (BE (mctx.input.offsets_needed != 0, 0))
899 {
900 pmatch[reg_idx].rm_so =
901 (pmatch[reg_idx].rm_so == mctx.input.valid_len
902 ? mctx.input.valid_raw_len
903 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
904 pmatch[reg_idx].rm_eo =
905 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
906 ? mctx.input.valid_raw_len
907 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
908 }
909#else
910 assert (mctx.input.offsets_needed == 0);
911#endif
912 pmatch[reg_idx].rm_so += match_first;
913 pmatch[reg_idx].rm_eo += match_first;
914 }
915 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
916 {
917 pmatch[nmatch + reg_idx].rm_so = -1;
918 pmatch[nmatch + reg_idx].rm_eo = -1;
919 }
920
921 if (dfa->subexp_map)
922 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
923 if (dfa->subexp_map[reg_idx] != reg_idx)
924 {
925 pmatch[reg_idx + 1].rm_so
926 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
927 pmatch[reg_idx + 1].rm_eo
928 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
929 }
930 }
931
932 free_return:
933 re_free (mctx.state_log);
934 if (dfa->nbackref)
935 match_ctx_free (&mctx);
936 re_string_destruct (&mctx.input);
937 return err;
938}
939
940static reg_errcode_t
941prune_impossible_nodes (re_match_context_t *mctx)
942{
943 const re_dfa_t *const dfa = mctx->dfa;
944 int halt_node, match_last;
945 reg_errcode_t ret;
946 re_dfastate_t **sifted_states;
947 re_dfastate_t **lim_states = NULL;
948 re_sift_context_t sctx;
949#ifdef DEBUG
950 assert (mctx->state_log != NULL);
951#endif
952 match_last = mctx->match_last;
953 halt_node = mctx->last_node;
954
955 /* Avoid overflow. */
956 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
957 return REG_ESPACE;
958
959 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
960 if (BE (sifted_states == NULL, 0))
961 {
962 ret = REG_ESPACE;
963 goto free_return;
964 }
965 if (dfa->nbackref)
966 {
967 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
968 if (BE (lim_states == NULL, 0))
969 {
970 ret = REG_ESPACE;
971 goto free_return;
972 }
973 while (1)
974 {
975 memset (lim_states, '\0',
976 sizeof (re_dfastate_t *) * (match_last + 1));
977 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
978 match_last);
979 ret = sift_states_backward (mctx, &sctx);
980 re_node_set_free (&sctx.limits);
981 if (BE (ret != REG_NOERROR, 0))
982 goto free_return;
983 if (sifted_states[0] != NULL || lim_states[0] != NULL)
984 break;
985 do
986 {
987 --match_last;
988 if (match_last < 0)
989 {
990 ret = REG_NOMATCH;
991 goto free_return;
992 }
993 } while (mctx->state_log[match_last] == NULL
994 || !mctx->state_log[match_last]->halt);
995 halt_node = check_halt_state_context (mctx,
996 mctx->state_log[match_last],
997 match_last);
998 }
999 ret = merge_state_array (dfa, sifted_states, lim_states,
1000 match_last + 1);
1001 re_free (lim_states);
1002 lim_states = NULL;
1003 if (BE (ret != REG_NOERROR, 0))
1004 goto free_return;
1005 }
1006 else
1007 {
1008 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1009 ret = sift_states_backward (mctx, &sctx);
1010 re_node_set_free (&sctx.limits);
1011 if (BE (ret != REG_NOERROR, 0))
1012 goto free_return;
1013 if (sifted_states[0] == NULL)
1014 {
1015 ret = REG_NOMATCH;
1016 goto free_return;
1017 }
1018 }
1019 re_free (mctx->state_log);
1020 mctx->state_log = sifted_states;
1021 sifted_states = NULL;
1022 mctx->last_node = halt_node;
1023 mctx->match_last = match_last;
1024 ret = REG_NOERROR;
1025 free_return:
1026 re_free (sifted_states);
1027 re_free (lim_states);
1028 return ret;
1029}
1030
1031/* Acquire an initial state and return it.
1032 We must select appropriate initial state depending on the context,
1033 since initial states may have constraints like "\<", "^", etc.. */
1034
1035static inline re_dfastate_t *
1036__attribute ((always_inline)) internal_function
1037acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1038 int idx)
1039{
1040 const re_dfa_t *const dfa = mctx->dfa;
1041 if (dfa->init_state->has_constraint)
1042 {
1043 unsigned int context;
1044 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1045 if (IS_WORD_CONTEXT (context))
1046 return dfa->init_state_word;
1047 else if (IS_ORDINARY_CONTEXT (context))
1048 return dfa->init_state;
1049 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1050 return dfa->init_state_begbuf;
1051 else if (IS_NEWLINE_CONTEXT (context))
1052 return dfa->init_state_nl;
1053 else if (IS_BEGBUF_CONTEXT (context))
1054 {
1055 /* It is relatively rare case, then calculate on demand. */
1056 return re_acquire_state_context (err, dfa,
1057 dfa->init_state->entrance_nodes,
1058 context);
1059 }
1060 else
1061 /* Must not happen? */
1062 return dfa->init_state;
1063 }
1064 else
1065 return dfa->init_state;
1066}
1067
1068/* Check whether the regular expression match input string INPUT or not,
1069 and return the index where the matching end, return -1 if not match,
1070 or return -2 in case of an error.
1071 FL_LONGEST_MATCH means we want the POSIX longest matching.
1072 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1073 next place where we may want to try matching.
1074 Note that the matcher assume that the maching starts from the current
1075 index of the buffer. */
1076
1077static int
1078internal_function
1079check_matching (re_match_context_t *mctx, int fl_longest_match,
1080 int *p_match_first)
1081{
1082 const re_dfa_t *const dfa = mctx->dfa;
1083 reg_errcode_t err;
1084 int match = 0;
1085 int match_last = -1;
1086 int cur_str_idx = re_string_cur_idx (&mctx->input);
1087 re_dfastate_t *cur_state;
1088 int at_init_state = p_match_first != NULL;
1089 int next_start_idx = cur_str_idx;
1090
1091 err = REG_NOERROR;
1092 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1093 /* An initial state must not be NULL (invalid). */
1094 if (BE (cur_state == NULL, 0))
1095 {
1096 assert (err == REG_ESPACE);
1097 return -2;
1098 }
1099
1100 if (mctx->state_log != NULL)
1101 {
1102 mctx->state_log[cur_str_idx] = cur_state;
1103
1104 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1105 later. E.g. Processing back references. */
1106 if (BE (dfa->nbackref, 0))
1107 {
1108 at_init_state = 0;
1109 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1110 if (BE (err != REG_NOERROR, 0))
1111 return err;
1112
1113 if (cur_state->has_backref)
1114 {
1115 err = transit_state_bkref (mctx, &cur_state->nodes);
1116 if (BE (err != REG_NOERROR, 0))
1117 return err;
1118 }
1119 }
1120 }
1121
1122 /* If the RE accepts NULL string. */
1123 if (BE (cur_state->halt, 0))
1124 {
1125 if (!cur_state->has_constraint
1126 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1127 {
1128 if (!fl_longest_match)
1129 return cur_str_idx;
1130 else
1131 {
1132 match_last = cur_str_idx;
1133 match = 1;
1134 }
1135 }
1136 }
1137
1138 while (!re_string_eoi (&mctx->input))
1139 {
1140 re_dfastate_t *old_state = cur_state;
1141 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1142
1143 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1144 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1145 && mctx->input.valid_len < mctx->input.len))
1146 {
1147 err = extend_buffers (mctx);
1148 if (BE (err != REG_NOERROR, 0))
1149 {
1150 assert (err == REG_ESPACE);
1151 return -2;
1152 }
1153 }
1154
1155 cur_state = transit_state (&err, mctx, cur_state);
1156 if (mctx->state_log != NULL)
1157 cur_state = merge_state_with_log (&err, mctx, cur_state);
1158
1159 if (cur_state == NULL)
1160 {
1161 /* Reached the invalid state or an error. Try to recover a valid
1162 state using the state log, if available and if we have not
1163 already found a valid (even if not the longest) match. */
1164 if (BE (err != REG_NOERROR, 0))
1165 return -2;
1166
1167 if (mctx->state_log == NULL
1168 || (match && !fl_longest_match)
1169 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1170 break;
1171 }
1172
1173 if (BE (at_init_state, 0))
1174 {
1175 if (old_state == cur_state)
1176 next_start_idx = next_char_idx;
1177 else
1178 at_init_state = 0;
1179 }
1180
1181 if (cur_state->halt)
1182 {
1183 /* Reached a halt state.
1184 Check the halt state can satisfy the current context. */
1185 if (!cur_state->has_constraint
1186 || check_halt_state_context (mctx, cur_state,
1187 re_string_cur_idx (&mctx->input)))
1188 {
1189 /* We found an appropriate halt state. */
1190 match_last = re_string_cur_idx (&mctx->input);
1191 match = 1;
1192
1193 /* We found a match, do not modify match_first below. */
1194 p_match_first = NULL;
1195 if (!fl_longest_match)
1196 break;
1197 }
1198 }
1199 }
1200
1201 if (p_match_first)
1202 *p_match_first += next_start_idx;
1203
1204 return match_last;
1205}
1206
1207/* Check NODE match the current context. */
1208
1209static int
1210internal_function
1211check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1212{
1213 re_token_type_t type = dfa->nodes[node].type;
1214 unsigned int constraint = dfa->nodes[node].constraint;
1215 if (type != END_OF_RE)
1216 return 0;
1217 if (!constraint)
1218 return 1;
1219 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1220 return 0;
1221 return 1;
1222}
1223
1224/* Check the halt state STATE match the current context.
1225 Return 0 if not match, if the node, STATE has, is a halt node and
1226 match the context, return the node. */
1227
1228static int
1229internal_function
1230check_halt_state_context (const re_match_context_t *mctx,
1231 const re_dfastate_t *state, int idx)
1232{
1233 int i;
1234 unsigned int context;
1235#ifdef DEBUG
1236 assert (state->halt);
1237#endif
1238 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1239 for (i = 0; i < state->nodes.nelem; ++i)
1240 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1241 return state->nodes.elems[i];
1242 return 0;
1243}
1244
1245/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1246 corresponding to the DFA).
1247 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1248 of errors. */
1249
1250static int
1251internal_function
1252proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1253 int *pidx, int node, re_node_set *eps_via_nodes,
1254 struct re_fail_stack_t *fs)
1255{
1256 const re_dfa_t *const dfa = mctx->dfa;
1257 int i, err;
1258 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1259 {
1260 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1261 re_node_set *edests = &dfa->edests[node];
1262 int dest_node;
1263 err = re_node_set_insert (eps_via_nodes, node);
1264 if (BE (err < 0, 0))
1265 return -2;
1266 /* Pick up a valid destination, or return -1 if none is found. */
1267 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1268 {
1269 int candidate = edests->elems[i];
1270 if (!re_node_set_contains (cur_nodes, candidate))
1271 continue;
1272 if (dest_node == -1)
1273 dest_node = candidate;
1274
1275 else
1276 {
1277 /* In order to avoid infinite loop like "(a*)*", return the second
1278 epsilon-transition if the first was already considered. */
1279 if (re_node_set_contains (eps_via_nodes, dest_node))
1280 return candidate;
1281
1282 /* Otherwise, push the second epsilon-transition on the fail stack. */
1283 else if (fs != NULL
1284 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1285 eps_via_nodes))
1286 return -2;
1287
1288 /* We know we are going to exit. */
1289 break;
1290 }
1291 }
1292 return dest_node;
1293 }
1294 else
1295 {
1296 int naccepted = 0;
1297 re_token_type_t type = dfa->nodes[node].type;
1298
1299#ifdef RE_ENABLE_I18N
1300 if (dfa->nodes[node].accept_mb)
1301 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1302 else
1303#endif /* RE_ENABLE_I18N */
1304 if (type == OP_BACK_REF)
1305 {
1306 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1307 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1308 if (fs != NULL)
1309 {
1310 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1311 return -1;
1312 else if (naccepted)
1313 {
1314 char *buf = (char *) re_string_get_buffer (&mctx->input);
1315 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1316 naccepted) != 0)
1317 return -1;
1318 }
1319 }
1320
1321 if (naccepted == 0)
1322 {
1323 int dest_node;
1324 err = re_node_set_insert (eps_via_nodes, node);
1325 if (BE (err < 0, 0))
1326 return -2;
1327 dest_node = dfa->edests[node].elems[0];
1328 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1329 dest_node))
1330 return dest_node;
1331 }
1332 }
1333
1334 if (naccepted != 0
1335 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1336 {
1337 int dest_node = dfa->nexts[node];
1338 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1339 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1340 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1341 dest_node)))
1342 return -1;
1343 re_node_set_empty (eps_via_nodes);
1344 return dest_node;
1345 }
1346 }
1347 return -1;
1348}
1349
1350static reg_errcode_t
1351internal_function
1352push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1353 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1354{
1355 reg_errcode_t err;
1356 int num = fs->num++;
1357 if (fs->num == fs->alloc)
1358 {
1359 struct re_fail_stack_ent_t *new_array;
1360 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1361 * fs->alloc * 2));
1362 if (new_array == NULL)
1363 return REG_ESPACE;
1364 fs->alloc *= 2;
1365 fs->stack = new_array;
1366 }
1367 fs->stack[num].idx = str_idx;
1368 fs->stack[num].node = dest_node;
1369 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1370 if (fs->stack[num].regs == NULL)
1371 return REG_ESPACE;
1372 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1373 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1374 return err;
1375}
1376
1377static int
1378internal_function
1379pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1380 regmatch_t *regs, re_node_set *eps_via_nodes)
1381{
1382 int num = --fs->num;
1383 assert (num >= 0);
1384 *pidx = fs->stack[num].idx;
1385 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1386 re_node_set_free (eps_via_nodes);
1387 re_free (fs->stack[num].regs);
1388 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1389 return fs->stack[num].node;
1390}
1391
1392/* Set the positions where the subexpressions are starts/ends to registers
1393 PMATCH.
1394 Note: We assume that pmatch[0] is already set, and
1395 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1396
1397static reg_errcode_t
1398internal_function
1399set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1400 regmatch_t *pmatch, int fl_backtrack)
1401{
1402 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1403 int idx, cur_node;
1404 re_node_set eps_via_nodes;
1405 struct re_fail_stack_t *fs;
1406 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1407 regmatch_t *prev_idx_match;
1408 int prev_idx_match_malloced = 0;
1409
1410#ifdef DEBUG
1411 assert (nmatch > 1);
1412 assert (mctx->state_log != NULL);
1413#endif
1414 if (fl_backtrack)
1415 {
1416 fs = &fs_body;
1417 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1418 if (fs->stack == NULL)
1419 return REG_ESPACE;
1420 }
1421 else
1422 fs = NULL;
1423
1424 cur_node = dfa->init_node;
1425 re_node_set_init_empty (&eps_via_nodes);
1426
1427#ifdef HAVE_ALLOCA
1428 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1429 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1430 else
1431#endif
1432 {
1433 prev_idx_match = re_malloc (regmatch_t, nmatch);
1434 if (prev_idx_match == NULL)
1435 {
1436 free_fail_stack_return (fs);
1437 return REG_ESPACE;
1438 }
1439 prev_idx_match_malloced = 1;
1440 }
1441 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1442
1443 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1444 {
1445 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1446
1447 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1448 {
1449 int reg_idx;
1450 if (fs)
1451 {
1452 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1453 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1454 break;
1455 if (reg_idx == nmatch)
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
1460 return free_fail_stack_return (fs);
1461 }
1462 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1463 &eps_via_nodes);
1464 }
1465 else
1466 {
1467 re_node_set_free (&eps_via_nodes);
1468 if (prev_idx_match_malloced)
1469 re_free (prev_idx_match);
1470 return REG_NOERROR;
1471 }
1472 }
1473
1474 /* Proceed to next node. */
1475 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1476 &eps_via_nodes, fs);
1477
1478 if (BE (cur_node < 0, 0))
1479 {
1480 if (BE (cur_node == -2, 0))
1481 {
1482 re_node_set_free (&eps_via_nodes);
1483 if (prev_idx_match_malloced)
1484 re_free (prev_idx_match);
1485 free_fail_stack_return (fs);
1486 return REG_ESPACE;
1487 }
1488 if (fs)
1489 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1490 &eps_via_nodes);
1491 else
1492 {
1493 re_node_set_free (&eps_via_nodes);
1494 if (prev_idx_match_malloced)
1495 re_free (prev_idx_match);
1496 return REG_NOMATCH;
1497 }
1498 }
1499 }
1500 re_node_set_free (&eps_via_nodes);
1501 if (prev_idx_match_malloced)
1502 re_free (prev_idx_match);
1503 return free_fail_stack_return (fs);
1504}
1505
1506static reg_errcode_t
1507internal_function
1508free_fail_stack_return (struct re_fail_stack_t *fs)
1509{
1510 if (fs)
1511 {
1512 int fs_idx;
1513 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1514 {
1515 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1516 re_free (fs->stack[fs_idx].regs);
1517 }
1518 re_free (fs->stack);
1519 }
1520 return REG_NOERROR;
1521}
1522
1523static void
1524internal_function
1525update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1526 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1527{
1528 int type = dfa->nodes[cur_node].type;
1529 if (type == OP_OPEN_SUBEXP)
1530 {
1531 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1532
1533 /* We are at the first node of this sub expression. */
1534 if (reg_num < nmatch)
1535 {
1536 pmatch[reg_num].rm_so = cur_idx;
1537 pmatch[reg_num].rm_eo = -1;
1538 }
1539 }
1540 else if (type == OP_CLOSE_SUBEXP)
1541 {
1542 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1543 if (reg_num < nmatch)
1544 {
1545 /* We are at the last node of this sub expression. */
1546 if (pmatch[reg_num].rm_so < cur_idx)
1547 {
1548 pmatch[reg_num].rm_eo = cur_idx;
1549 /* This is a non-empty match or we are not inside an optional
1550 subexpression. Accept this right away. */
1551 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1552 }
1553 else
1554 {
1555 if (dfa->nodes[cur_node].opt_subexp
1556 && prev_idx_match[reg_num].rm_so != -1)
1557 /* We transited through an empty match for an optional
1558 subexpression, like (a?)*, and this is not the subexp's
1559 first match. Copy back the old content of the registers
1560 so that matches of an inner subexpression are undone as
1561 well, like in ((a?))*. */
1562 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1563 else
1564 /* We completed a subexpression, but it may be part of
1565 an optional one, so do not update PREV_IDX_MATCH. */
1566 pmatch[reg_num].rm_eo = cur_idx;
1567 }
1568 }
1569 }
1570}
1571
1572/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1573 and sift the nodes in each states according to the following rules.
1574 Updated state_log will be wrote to STATE_LOG.
1575
1576 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1577 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1578 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1579 the LAST_NODE, we throw away the node `a'.
1580 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1581 string `s' and transit to `b':
1582 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1583 away the node `a'.
1584 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1585 thrown away, we throw away the node `a'.
1586 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1587 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1588 node `a'.
1589 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1590 we throw away the node `a'. */
1591
1592#define STATE_NODE_CONTAINS(state,node) \
1593 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1594
1595static reg_errcode_t
1596internal_function
1597sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1598{
1599 reg_errcode_t err;
1600 int null_cnt = 0;
1601 int str_idx = sctx->last_str_idx;
1602 re_node_set cur_dest;
1603
1604#ifdef DEBUG
1605 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1606#endif
1607
1608 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1609 transit to the last_node and the last_node itself. */
1610 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1611 if (BE (err != REG_NOERROR, 0))
1612 return err;
1613 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1614 if (BE (err != REG_NOERROR, 0))
1615 goto free_return;
1616
1617 /* Then check each states in the state_log. */
1618 while (str_idx > 0)
1619 {
1620 /* Update counters. */
1621 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1622 if (null_cnt > mctx->max_mb_elem_len)
1623 {
1624 memset (sctx->sifted_states, '\0',
1625 sizeof (re_dfastate_t *) * str_idx);
1626 re_node_set_free (&cur_dest);
1627 return REG_NOERROR;
1628 }
1629 re_node_set_empty (&cur_dest);
1630 --str_idx;
1631
1632 if (mctx->state_log[str_idx])
1633 {
1634 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1635 if (BE (err != REG_NOERROR, 0))
1636 goto free_return;
1637 }
1638
1639 /* Add all the nodes which satisfy the following conditions:
1640 - It can epsilon transit to a node in CUR_DEST.
1641 - It is in CUR_SRC.
1642 And update state_log. */
1643 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1644 if (BE (err != REG_NOERROR, 0))
1645 goto free_return;
1646 }
1647 err = REG_NOERROR;
1648 free_return:
1649 re_node_set_free (&cur_dest);
1650 return err;
1651}
1652
1653static reg_errcode_t
1654internal_function
1655build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1656 int str_idx, re_node_set *cur_dest)
1657{
1658 const re_dfa_t *const dfa = mctx->dfa;
1659 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1660 int i;
1661
1662 /* Then build the next sifted state.
1663 We build the next sifted state on `cur_dest', and update
1664 `sifted_states[str_idx]' with `cur_dest'.
1665 Note:
1666 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1667 `cur_src' points the node_set of the old `state_log[str_idx]'
1668 (with the epsilon nodes pre-filtered out). */
1669 for (i = 0; i < cur_src->nelem; i++)
1670 {
1671 int prev_node = cur_src->elems[i];
1672 int naccepted = 0;
1673 int ret;
1674
1675#ifdef DEBUG
1676 re_token_type_t type = dfa->nodes[prev_node].type;
1677 assert (!IS_EPSILON_NODE (type));
1678#endif
1679#ifdef RE_ENABLE_I18N
1680 /* If the node may accept `multi byte'. */
1681 if (dfa->nodes[prev_node].accept_mb)
1682 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1683 str_idx, sctx->last_str_idx);
1684#endif /* RE_ENABLE_I18N */
1685
1686 /* We don't check backreferences here.
1687 See update_cur_sifted_state(). */
1688 if (!naccepted
1689 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1690 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1691 dfa->nexts[prev_node]))
1692 naccepted = 1;
1693
1694 if (naccepted == 0)
1695 continue;
1696
1697 if (sctx->limits.nelem)
1698 {
1699 int to_idx = str_idx + naccepted;
1700 if (check_dst_limits (mctx, &sctx->limits,
1701 dfa->nexts[prev_node], to_idx,
1702 prev_node, str_idx))
1703 continue;
1704 }
1705 ret = re_node_set_insert (cur_dest, prev_node);
1706 if (BE (ret == -1, 0))
1707 return REG_ESPACE;
1708 }
1709
1710 return REG_NOERROR;
1711}
1712
1713/* Helper functions. */
1714
1715static reg_errcode_t
1716internal_function
1717clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1718{
1719 int top = mctx->state_log_top;
1720
1721 if (next_state_log_idx >= mctx->input.bufs_len
1722 || (next_state_log_idx >= mctx->input.valid_len
1723 && mctx->input.valid_len < mctx->input.len))
1724 {
1725 reg_errcode_t err;
1726 err = extend_buffers (mctx);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1729 }
1730
1731 if (top < next_state_log_idx)
1732 {
1733 memset (mctx->state_log + top + 1, '\0',
1734 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1735 mctx->state_log_top = next_state_log_idx;
1736 }
1737 return REG_NOERROR;
1738}
1739
1740static reg_errcode_t
1741internal_function
1742merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1743 re_dfastate_t **src, int num)
1744{
1745 int st_idx;
1746 reg_errcode_t err;
1747 for (st_idx = 0; st_idx < num; ++st_idx)
1748 {
1749 if (dst[st_idx] == NULL)
1750 dst[st_idx] = src[st_idx];
1751 else if (src[st_idx] != NULL)
1752 {
1753 re_node_set merged_set;
1754 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1755 &src[st_idx]->nodes);
1756 if (BE (err != REG_NOERROR, 0))
1757 return err;
1758 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1759 re_node_set_free (&merged_set);
1760 if (BE (err != REG_NOERROR, 0))
1761 return err;
1762 }
1763 }
1764 return REG_NOERROR;
1765}
1766
1767static reg_errcode_t
1768internal_function
1769update_cur_sifted_state (const re_match_context_t *mctx,
1770 re_sift_context_t *sctx, int str_idx,
1771 re_node_set *dest_nodes)
1772{
1773 const re_dfa_t *const dfa = mctx->dfa;
1774 reg_errcode_t err = REG_NOERROR;
1775 const re_node_set *candidates;
1776 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1777 : &mctx->state_log[str_idx]->nodes);
1778
1779 if (dest_nodes->nelem == 0)
1780 sctx->sifted_states[str_idx] = NULL;
1781 else
1782 {
1783 if (candidates)
1784 {
1785 /* At first, add the nodes which can epsilon transit to a node in
1786 DEST_NODE. */
1787 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1788 if (BE (err != REG_NOERROR, 0))
1789 return err;
1790
1791 /* Then, check the limitations in the current sift_context. */
1792 if (sctx->limits.nelem)
1793 {
1794 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1795 mctx->bkref_ents, str_idx);
1796 if (BE (err != REG_NOERROR, 0))
1797 return err;
1798 }
1799 }
1800
1801 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1802 if (BE (err != REG_NOERROR, 0))
1803 return err;
1804 }
1805
1806 if (candidates && mctx->state_log[str_idx]->has_backref)
1807 {
1808 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1809 if (BE (err != REG_NOERROR, 0))
1810 return err;
1811 }
1812 return REG_NOERROR;
1813}
1814
1815static reg_errcode_t
1816internal_function
1817add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1818 const re_node_set *candidates)
1819{
1820 reg_errcode_t err = REG_NOERROR;
1821 int i;
1822
1823 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1824 if (BE (err != REG_NOERROR, 0))
1825 return err;
1826
1827 if (!state->inveclosure.alloc)
1828 {
1829 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1830 if (BE (err != REG_NOERROR, 0))
1831 return REG_ESPACE;
1832 for (i = 0; i < dest_nodes->nelem; i++)
1833 {
1834 err = re_node_set_merge (&state->inveclosure,
1835 dfa->inveclosures + dest_nodes->elems[i]);
1836 if (BE (err != REG_NOERROR, 0))
1837 return REG_ESPACE;
1838 }
1839 }
1840 return re_node_set_add_intersect (dest_nodes, candidates,
1841 &state->inveclosure);
1842}
1843
1844static reg_errcode_t
1845internal_function
1846sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1847 const re_node_set *candidates)
1848{
1849 int ecl_idx;
1850 reg_errcode_t err;
1851 re_node_set *inv_eclosure = dfa->inveclosures + node;
1852 re_node_set except_nodes;
1853 re_node_set_init_empty (&except_nodes);
1854 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1855 {
1856 int cur_node = inv_eclosure->elems[ecl_idx];
1857 if (cur_node == node)
1858 continue;
1859 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1860 {
1861 int edst1 = dfa->edests[cur_node].elems[0];
1862 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1863 ? dfa->edests[cur_node].elems[1] : -1);
1864 if ((!re_node_set_contains (inv_eclosure, edst1)
1865 && re_node_set_contains (dest_nodes, edst1))
1866 || (edst2 > 0
1867 && !re_node_set_contains (inv_eclosure, edst2)
1868 && re_node_set_contains (dest_nodes, edst2)))
1869 {
1870 err = re_node_set_add_intersect (&except_nodes, candidates,
1871 dfa->inveclosures + cur_node);
1872 if (BE (err != REG_NOERROR, 0))
1873 {
1874 re_node_set_free (&except_nodes);
1875 return err;
1876 }
1877 }
1878 }
1879 }
1880 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1881 {
1882 int cur_node = inv_eclosure->elems[ecl_idx];
1883 if (!re_node_set_contains (&except_nodes, cur_node))
1884 {
1885 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1886 re_node_set_remove_at (dest_nodes, idx);
1887 }
1888 }
1889 re_node_set_free (&except_nodes);
1890 return REG_NOERROR;
1891}
1892
1893static int
1894internal_function
1895check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1896 int dst_node, int dst_idx, int src_node, int src_idx)
1897{
1898 const re_dfa_t *const dfa = mctx->dfa;
1899 int lim_idx, src_pos, dst_pos;
1900
1901 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1902 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1903 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1904 {
1905 int subexp_idx;
1906 struct re_backref_cache_entry *ent;
1907 ent = mctx->bkref_ents + limits->elems[lim_idx];
1908 subexp_idx = dfa->nodes[ent->node].opr.idx;
1909
1910 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1911 subexp_idx, dst_node, dst_idx,
1912 dst_bkref_idx);
1913 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1914 subexp_idx, src_node, src_idx,
1915 src_bkref_idx);
1916
1917 /* In case of:
1918 <src> <dst> ( <subexp> )
1919 ( <subexp> ) <src> <dst>
1920 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1921 if (src_pos == dst_pos)
1922 continue; /* This is unrelated limitation. */
1923 else
1924 return 1;
1925 }
1926 return 0;
1927}
1928
1929static int
1930internal_function
1931check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1932 int subexp_idx, int from_node, int bkref_idx)
1933{
1934 const re_dfa_t *const dfa = mctx->dfa;
1935 const re_node_set *eclosures = dfa->eclosures + from_node;
1936 int node_idx;
1937
1938 /* Else, we are on the boundary: examine the nodes on the epsilon
1939 closure. */
1940 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1941 {
1942 int node = eclosures->elems[node_idx];
1943 switch (dfa->nodes[node].type)
1944 {
1945 case OP_BACK_REF:
1946 if (bkref_idx != -1)
1947 {
1948 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1949 do
1950 {
1951 int dst, cpos;
1952
1953 if (ent->node != node)
1954 continue;
1955
1956 if (subexp_idx < BITSET_WORD_BITS
1957 && !(ent->eps_reachable_subexps_map
1958 & ((bitset_word_t) 1 << subexp_idx)))
1959 continue;
1960
1961 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1962 OP_CLOSE_SUBEXP cases below. But, if the
1963 destination node is the same node as the source
1964 node, don't recurse because it would cause an
1965 infinite loop: a regex that exhibits this behavior
1966 is ()\1*\1* */
1967 dst = dfa->edests[node].elems[0];
1968 if (dst == from_node)
1969 {
1970 if (boundaries & 1)
1971 return -1;
1972 else /* if (boundaries & 2) */
1973 return 0;
1974 }
1975
1976 cpos =
1977 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1978 dst, bkref_idx);
1979 if (cpos == -1 /* && (boundaries & 1) */)
1980 return -1;
1981 if (cpos == 0 && (boundaries & 2))
1982 return 0;
1983
1984 if (subexp_idx < BITSET_WORD_BITS)
1985 ent->eps_reachable_subexps_map
1986 &= ~((bitset_word_t) 1 << subexp_idx);
1987 }
1988 while (ent++->more);
1989 }
1990 break;
1991
1992 case OP_OPEN_SUBEXP:
1993 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1994 return -1;
1995 break;
1996
1997 case OP_CLOSE_SUBEXP:
1998 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1999 return 0;
2000 break;
2001
2002 default:
2003 break;
2004 }
2005 }
2006
2007 return (boundaries & 2) ? 1 : 0;
2008}
2009
2010static int
2011internal_function
2012check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2013 int subexp_idx, int from_node, int str_idx,
2014 int bkref_idx)
2015{
2016 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2017 int boundaries;
2018
2019 /* If we are outside the range of the subexpression, return -1 or 1. */
2020 if (str_idx < lim->subexp_from)
2021 return -1;
2022
2023 if (lim->subexp_to < str_idx)
2024 return 1;
2025
2026 /* If we are within the subexpression, return 0. */
2027 boundaries = (str_idx == lim->subexp_from);
2028 boundaries |= (str_idx == lim->subexp_to) << 1;
2029 if (boundaries == 0)
2030 return 0;
2031
2032 /* Else, examine epsilon closure. */
2033 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2034 from_node, bkref_idx);
2035}
2036
2037/* Check the limitations of sub expressions LIMITS, and remove the nodes
2038 which are against limitations from DEST_NODES. */
2039
2040static reg_errcode_t
2041internal_function
2042check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2043 const re_node_set *candidates, re_node_set *limits,
2044 struct re_backref_cache_entry *bkref_ents, int str_idx)
2045{
2046 reg_errcode_t err;
2047 int node_idx, lim_idx;
2048
2049 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2050 {
2051 int subexp_idx;
2052 struct re_backref_cache_entry *ent;
2053 ent = bkref_ents + limits->elems[lim_idx];
2054
2055 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2056 continue; /* This is unrelated limitation. */
2057
2058 subexp_idx = dfa->nodes[ent->node].opr.idx;
2059 if (ent->subexp_to == str_idx)
2060 {
2061 int ops_node = -1;
2062 int cls_node = -1;
2063 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2064 {
2065 int node = dest_nodes->elems[node_idx];
2066 re_token_type_t type = dfa->nodes[node].type;
2067 if (type == OP_OPEN_SUBEXP
2068 && subexp_idx == dfa->nodes[node].opr.idx)
2069 ops_node = node;
2070 else if (type == OP_CLOSE_SUBEXP
2071 && subexp_idx == dfa->nodes[node].opr.idx)
2072 cls_node = node;
2073 }
2074
2075 /* Check the limitation of the open subexpression. */
2076 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2077 if (ops_node >= 0)
2078 {
2079 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2080 candidates);
2081 if (BE (err != REG_NOERROR, 0))
2082 return err;
2083 }
2084
2085 /* Check the limitation of the close subexpression. */
2086 if (cls_node >= 0)
2087 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2088 {
2089 int node = dest_nodes->elems[node_idx];
2090 if (!re_node_set_contains (dfa->inveclosures + node,
2091 cls_node)
2092 && !re_node_set_contains (dfa->eclosures + node,
2093 cls_node))
2094 {
2095 /* It is against this limitation.
2096 Remove it form the current sifted state. */
2097 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2098 candidates);
2099 if (BE (err != REG_NOERROR, 0))
2100 return err;
2101 --node_idx;
2102 }
2103 }
2104 }
2105 else /* (ent->subexp_to != str_idx) */
2106 {
2107 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2108 {
2109 int node = dest_nodes->elems[node_idx];
2110 re_token_type_t type = dfa->nodes[node].type;
2111 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2112 {
2113 if (subexp_idx != dfa->nodes[node].opr.idx)
2114 continue;
2115 /* It is against this limitation.
2116 Remove it form the current sifted state. */
2117 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2118 candidates);
2119 if (BE (err != REG_NOERROR, 0))
2120 return err;
2121 }
2122 }
2123 }
2124 }
2125 return REG_NOERROR;
2126}
2127
2128static reg_errcode_t
2129internal_function
2130sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2131 int str_idx, const re_node_set *candidates)
2132{
2133 const re_dfa_t *const dfa = mctx->dfa;
2134 reg_errcode_t err;
2135 int node_idx, node;
2136 re_sift_context_t local_sctx;
2137 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2138
2139 if (first_idx == -1)
2140 return REG_NOERROR;
2141
2142 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2143
2144 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2145 {
2146 int enabled_idx;
2147 re_token_type_t type;
2148 struct re_backref_cache_entry *entry;
2149 node = candidates->elems[node_idx];
2150 type = dfa->nodes[node].type;
2151 /* Avoid infinite loop for the REs like "()\1+". */
2152 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2153 continue;
2154 if (type != OP_BACK_REF)
2155 continue;
2156
2157 entry = mctx->bkref_ents + first_idx;
2158 enabled_idx = first_idx;
2159 do
2160 {
2161 int subexp_len;
2162 int to_idx;
2163 int dst_node;
2164 int ret;
2165 re_dfastate_t *cur_state;
2166
2167 if (entry->node != node)
2168 continue;
2169 subexp_len = entry->subexp_to - entry->subexp_from;
2170 to_idx = str_idx + subexp_len;
2171 dst_node = (subexp_len ? dfa->nexts[node]
2172 : dfa->edests[node].elems[0]);
2173
2174 if (to_idx > sctx->last_str_idx
2175 || sctx->sifted_states[to_idx] == NULL
2176 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2177 || check_dst_limits (mctx, &sctx->limits, node,
2178 str_idx, dst_node, to_idx))
2179 continue;
2180
2181 if (local_sctx.sifted_states == NULL)
2182 {
2183 local_sctx = *sctx;
2184 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2185 if (BE (err != REG_NOERROR, 0))
2186 goto free_return;
2187 }
2188 local_sctx.last_node = node;
2189 local_sctx.last_str_idx = str_idx;
2190 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2191 if (BE (ret < 0, 0))
2192 {
2193 err = REG_ESPACE;
2194 goto free_return;
2195 }
2196 cur_state = local_sctx.sifted_states[str_idx];
2197 err = sift_states_backward (mctx, &local_sctx);
2198 if (BE (err != REG_NOERROR, 0))
2199 goto free_return;
2200 if (sctx->limited_states != NULL)
2201 {
2202 err = merge_state_array (dfa, sctx->limited_states,
2203 local_sctx.sifted_states,
2204 str_idx + 1);
2205 if (BE (err != REG_NOERROR, 0))
2206 goto free_return;
2207 }
2208 local_sctx.sifted_states[str_idx] = cur_state;
2209 re_node_set_remove (&local_sctx.limits, enabled_idx);
2210
2211 /* mctx->bkref_ents may have changed, reload the pointer. */
2212 entry = mctx->bkref_ents + enabled_idx;
2213 }
2214 while (enabled_idx++, entry++->more);
2215 }
2216 err = REG_NOERROR;
2217 free_return:
2218 if (local_sctx.sifted_states != NULL)
2219 {
2220 re_node_set_free (&local_sctx.limits);
2221 }
2222
2223 return err;
2224}
2225
2226
2227#ifdef RE_ENABLE_I18N
2228static int
2229internal_function
2230sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2231 int node_idx, int str_idx, int max_str_idx)
2232{
2233 const re_dfa_t *const dfa = mctx->dfa;
2234 int naccepted;
2235 /* Check the node can accept `multi byte'. */
2236 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2237 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2238 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2239 dfa->nexts[node_idx]))
2240 /* The node can't accept the `multi byte', or the
2241 destination was already thrown away, then the node
2242 could't accept the current input `multi byte'. */
2243 naccepted = 0;
2244 /* Otherwise, it is sure that the node could accept
2245 `naccepted' bytes input. */
2246 return naccepted;
2247}
2248#endif /* RE_ENABLE_I18N */
2249
2250\f
2251/* Functions for state transition. */
2252
2253/* Return the next state to which the current state STATE will transit by
2254 accepting the current input byte, and update STATE_LOG if necessary.
2255 If STATE can accept a multibyte char/collating element/back reference
2256 update the destination of STATE_LOG. */
2257
2258static re_dfastate_t *
2259internal_function
2260transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2261 re_dfastate_t *state)
2262{
2263 re_dfastate_t **trtable;
2264 unsigned char ch;
2265
2266#ifdef RE_ENABLE_I18N
2267 /* If the current state can accept multibyte. */
2268 if (BE (state->accept_mb, 0))
2269 {
2270 *err = transit_state_mb (mctx, state);
2271 if (BE (*err != REG_NOERROR, 0))
2272 return NULL;
2273 }
2274#endif /* RE_ENABLE_I18N */
2275
2276 /* Then decide the next state with the single byte. */
2277#if 0
2278 if (0)
2279 /* don't use transition table */
2280 return transit_state_sb (err, mctx, state);
2281#endif
2282
2283 /* Use transition table */
2284 ch = re_string_fetch_byte (&mctx->input);
2285 for (;;)
2286 {
2287 trtable = state->trtable;
2288 if (BE (trtable != NULL, 1))
2289 return trtable[ch];
2290
2291 trtable = state->word_trtable;
2292 if (BE (trtable != NULL, 1))
2293 {
2294 unsigned int context;
2295 context
2296 = re_string_context_at (&mctx->input,
2297 re_string_cur_idx (&mctx->input) - 1,
2298 mctx->eflags);
2299 if (IS_WORD_CONTEXT (context))
2300 return trtable[ch + SBC_MAX];
2301 else
2302 return trtable[ch];
2303 }
2304
2305 if (!build_trtable (mctx->dfa, state))
2306 {
2307 *err = REG_ESPACE;
2308 return NULL;
2309 }
2310
2311 /* Retry, we now have a transition table. */
2312 }
2313}
2314
2315/* Update the state_log if we need */
2316static re_dfastate_t *
2317internal_function
2318merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2319 re_dfastate_t *next_state)
2320{
2321 const re_dfa_t *const dfa = mctx->dfa;
2322 int cur_idx = re_string_cur_idx (&mctx->input);
2323
2324 if (cur_idx > mctx->state_log_top)
2325 {
2326 mctx->state_log[cur_idx] = next_state;
2327 mctx->state_log_top = cur_idx;
2328 }
2329 else if (mctx->state_log[cur_idx] == NULL)
2330 {
2331 mctx->state_log[cur_idx] = next_state;
2332 }
2333 else
2334 {
2335 re_dfastate_t *pstate;
2336 unsigned int context;
2337 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2338 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2339 the destination of a multibyte char/collating element/
2340 back reference. Then the next state is the union set of
2341 these destinations and the results of the transition table. */
2342 pstate = mctx->state_log[cur_idx];
2343 log_nodes = pstate->entrance_nodes;
2344 if (next_state != NULL)
2345 {
2346 table_nodes = next_state->entrance_nodes;
2347 *err = re_node_set_init_union (&next_nodes, table_nodes,
2348 log_nodes);
2349 if (BE (*err != REG_NOERROR, 0))
2350 return NULL;
2351 }
2352 else
2353 next_nodes = *log_nodes;
2354 /* Note: We already add the nodes of the initial state,
2355 then we don't need to add them here. */
2356
2357 context = re_string_context_at (&mctx->input,
2358 re_string_cur_idx (&mctx->input) - 1,
2359 mctx->eflags);
2360 next_state = mctx->state_log[cur_idx]
2361 = re_acquire_state_context (err, dfa, &next_nodes, context);
2362 /* We don't need to check errors here, since the return value of
2363 this function is next_state and ERR is already set. */
2364
2365 if (table_nodes != NULL)
2366 re_node_set_free (&next_nodes);
2367 }
2368
2369 if (BE (dfa->nbackref, 0) && next_state != NULL)
2370 {
2371 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2372 later. We must check them here, since the back references in the
2373 next state might use them. */
2374 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2375 cur_idx);
2376 if (BE (*err != REG_NOERROR, 0))
2377 return NULL;
2378
2379 /* If the next state has back references. */
2380 if (next_state->has_backref)
2381 {
2382 *err = transit_state_bkref (mctx, &next_state->nodes);
2383 if (BE (*err != REG_NOERROR, 0))
2384 return NULL;
2385 next_state = mctx->state_log[cur_idx];
2386 }
2387 }
2388
2389 return next_state;
2390}
2391
2392/* Skip bytes in the input that correspond to part of a
2393 multi-byte match, then look in the log for a state
2394 from which to restart matching. */
2395static re_dfastate_t *
2396internal_function
2397find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2398{
2399 re_dfastate_t *cur_state;
2400 do
2401 {
2402 int max = mctx->state_log_top;
2403 int cur_str_idx = re_string_cur_idx (&mctx->input);
2404
2405 do
2406 {
2407 if (++cur_str_idx > max)
2408 return NULL;
2409 re_string_skip_bytes (&mctx->input, 1);
2410 }
2411 while (mctx->state_log[cur_str_idx] == NULL);
2412
2413 cur_state = merge_state_with_log (err, mctx, NULL);
2414 }
2415 while (*err == REG_NOERROR && cur_state == NULL);
2416 return cur_state;
2417}
2418
2419/* Helper functions for transit_state. */
2420
2421/* From the node set CUR_NODES, pick up the nodes whose types are
2422 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2423 expression. And register them to use them later for evaluating the
2424 correspoding back references. */
2425
2426static reg_errcode_t
2427internal_function
2428check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2429 int str_idx)
2430{
2431 const re_dfa_t *const dfa = mctx->dfa;
2432 int node_idx;
2433 reg_errcode_t err;
2434
2435 /* TODO: This isn't efficient.
2436 Because there might be more than one nodes whose types are
2437 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2438 nodes.
2439 E.g. RE: (a){2} */
2440 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2441 {
2442 int node = cur_nodes->elems[node_idx];
2443 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2444 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2445 && (dfa->used_bkref_map
2446 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2447 {
2448 err = match_ctx_add_subtop (mctx, node, str_idx);
2449 if (BE (err != REG_NOERROR, 0))
2450 return err;
2451 }
2452 }
2453 return REG_NOERROR;
2454}
2455
2456#if 0
2457/* Return the next state to which the current state STATE will transit by
2458 accepting the current input byte. */
2459
2460static re_dfastate_t *
2461transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2462 re_dfastate_t *state)
2463{
2464 const re_dfa_t *const dfa = mctx->dfa;
2465 re_node_set next_nodes;
2466 re_dfastate_t *next_state;
2467 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2468 unsigned int context;
2469
2470 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2471 if (BE (*err != REG_NOERROR, 0))
2472 return NULL;
2473 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2474 {
2475 int cur_node = state->nodes.elems[node_cnt];
2476 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2477 {
2478 *err = re_node_set_merge (&next_nodes,
2479 dfa->eclosures + dfa->nexts[cur_node]);
2480 if (BE (*err != REG_NOERROR, 0))
2481 {
2482 re_node_set_free (&next_nodes);
2483 return NULL;
2484 }
2485 }
2486 }
2487 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2488 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2489 /* We don't need to check errors here, since the return value of
2490 this function is next_state and ERR is already set. */
2491
2492 re_node_set_free (&next_nodes);
2493 re_string_skip_bytes (&mctx->input, 1);
2494 return next_state;
2495}
2496#endif
2497
2498#ifdef RE_ENABLE_I18N
2499static reg_errcode_t
2500internal_function
2501transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2502{
2503 const re_dfa_t *const dfa = mctx->dfa;
2504 reg_errcode_t err;
2505 int i;
2506
2507 for (i = 0; i < pstate->nodes.nelem; ++i)
2508 {
2509 re_node_set dest_nodes, *new_nodes;
2510 int cur_node_idx = pstate->nodes.elems[i];
2511 int naccepted, dest_idx;
2512 unsigned int context;
2513 re_dfastate_t *dest_state;
2514
2515 if (!dfa->nodes[cur_node_idx].accept_mb)
2516 continue;
2517
2518 if (dfa->nodes[cur_node_idx].constraint)
2519 {
2520 context = re_string_context_at (&mctx->input,
2521 re_string_cur_idx (&mctx->input),
2522 mctx->eflags);
2523 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2524 context))
2525 continue;
2526 }
2527
2528 /* How many bytes the node can accept? */
2529 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2530 re_string_cur_idx (&mctx->input));
2531 if (naccepted == 0)
2532 continue;
2533
2534 /* The node can accepts `naccepted' bytes. */
2535 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2536 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2537 : mctx->max_mb_elem_len);
2538 err = clean_state_log_if_needed (mctx, dest_idx);
2539 if (BE (err != REG_NOERROR, 0))
2540 return err;
2541#ifdef DEBUG
2542 assert (dfa->nexts[cur_node_idx] != -1);
2543#endif
2544 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2545
2546 dest_state = mctx->state_log[dest_idx];
2547 if (dest_state == NULL)
2548 dest_nodes = *new_nodes;
2549 else
2550 {
2551 err = re_node_set_init_union (&dest_nodes,
2552 dest_state->entrance_nodes, new_nodes);
2553 if (BE (err != REG_NOERROR, 0))
2554 return err;
2555 }
2556 context = re_string_context_at (&mctx->input, dest_idx - 1,
2557 mctx->eflags);
2558 mctx->state_log[dest_idx]
2559 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2560 if (dest_state != NULL)
2561 re_node_set_free (&dest_nodes);
2562 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2563 return err;
2564 }
2565 return REG_NOERROR;
2566}
2567#endif /* RE_ENABLE_I18N */
2568
2569static reg_errcode_t
2570internal_function
2571transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2572{
2573 const re_dfa_t *const dfa = mctx->dfa;
2574 reg_errcode_t err;
2575 int i;
2576 int cur_str_idx = re_string_cur_idx (&mctx->input);
2577
2578 for (i = 0; i < nodes->nelem; ++i)
2579 {
2580 int dest_str_idx, prev_nelem, bkc_idx;
2581 int node_idx = nodes->elems[i];
2582 unsigned int context;
2583 const re_token_t *node = dfa->nodes + node_idx;
2584 re_node_set *new_dest_nodes;
2585
2586 /* Check whether `node' is a backreference or not. */
2587 if (node->type != OP_BACK_REF)
2588 continue;
2589
2590 if (node->constraint)
2591 {
2592 context = re_string_context_at (&mctx->input, cur_str_idx,
2593 mctx->eflags);
2594 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2595 continue;
2596 }
2597
2598 /* `node' is a backreference.
2599 Check the substring which the substring matched. */
2600 bkc_idx = mctx->nbkref_ents;
2601 err = get_subexp (mctx, node_idx, cur_str_idx);
2602 if (BE (err != REG_NOERROR, 0))
2603 goto free_return;
2604
2605 /* And add the epsilon closures (which is `new_dest_nodes') of
2606 the backreference to appropriate state_log. */
2607#ifdef DEBUG
2608 assert (dfa->nexts[node_idx] != -1);
2609#endif
2610 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2611 {
2612 int subexp_len;
2613 re_dfastate_t *dest_state;
2614 struct re_backref_cache_entry *bkref_ent;
2615 bkref_ent = mctx->bkref_ents + bkc_idx;
2616 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2617 continue;
2618 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2619 new_dest_nodes = (subexp_len == 0
2620 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2621 : dfa->eclosures + dfa->nexts[node_idx]);
2622 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2623 - bkref_ent->subexp_from);
2624 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2625 mctx->eflags);
2626 dest_state = mctx->state_log[dest_str_idx];
2627 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2628 : mctx->state_log[cur_str_idx]->nodes.nelem);
2629 /* Add `new_dest_node' to state_log. */
2630 if (dest_state == NULL)
2631 {
2632 mctx->state_log[dest_str_idx]
2633 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2634 context);
2635 if (BE (mctx->state_log[dest_str_idx] == NULL
2636 && err != REG_NOERROR, 0))
2637 goto free_return;
2638 }
2639 else
2640 {
2641 re_node_set dest_nodes;
2642 err = re_node_set_init_union (&dest_nodes,
2643 dest_state->entrance_nodes,
2644 new_dest_nodes);
2645 if (BE (err != REG_NOERROR, 0))
2646 {
2647 re_node_set_free (&dest_nodes);
2648 goto free_return;
2649 }
2650 mctx->state_log[dest_str_idx]
2651 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2652 re_node_set_free (&dest_nodes);
2653 if (BE (mctx->state_log[dest_str_idx] == NULL
2654 && err != REG_NOERROR, 0))
2655 goto free_return;
2656 }
2657 /* We need to check recursively if the backreference can epsilon
2658 transit. */
2659 if (subexp_len == 0
2660 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2661 {
2662 err = check_subexp_matching_top (mctx, new_dest_nodes,
2663 cur_str_idx);
2664 if (BE (err != REG_NOERROR, 0))
2665 goto free_return;
2666 err = transit_state_bkref (mctx, new_dest_nodes);
2667 if (BE (err != REG_NOERROR, 0))
2668 goto free_return;
2669 }
2670 }
2671 }
2672 err = REG_NOERROR;
2673 free_return:
2674 return err;
2675}
2676
2677/* Enumerate all the candidates which the backreference BKREF_NODE can match
2678 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2679 Note that we might collect inappropriate candidates here.
2680 However, the cost of checking them strictly here is too high, then we
2681 delay these checking for prune_impossible_nodes(). */
2682
2683static reg_errcode_t
2684internal_function
2685get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2686{
2687 const re_dfa_t *const dfa = mctx->dfa;
2688 int subexp_num, sub_top_idx;
2689 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2690 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2691 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2692 if (cache_idx != -1)
2693 {
2694 const struct re_backref_cache_entry *entry
2695 = mctx->bkref_ents + cache_idx;
2696 do
2697 if (entry->node == bkref_node)
2698 return REG_NOERROR; /* We already checked it. */
2699 while (entry++->more);
2700 }
2701
2702 subexp_num = dfa->nodes[bkref_node].opr.idx;
2703
2704 /* For each sub expression */
2705 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2706 {
2707 reg_errcode_t err;
2708 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2709 re_sub_match_last_t *sub_last;
2710 int sub_last_idx, sl_str, bkref_str_off;
2711
2712 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2713 continue; /* It isn't related. */
2714
2715 sl_str = sub_top->str_idx;
2716 bkref_str_off = bkref_str_idx;
2717 /* At first, check the last node of sub expressions we already
2718 evaluated. */
2719 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2720 {
2721 int sl_str_diff;
2722 sub_last = sub_top->lasts[sub_last_idx];
2723 sl_str_diff = sub_last->str_idx - sl_str;
2724 /* The matched string by the sub expression match with the substring
2725 at the back reference? */
2726 if (sl_str_diff > 0)
2727 {
2728 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2729 {
2730 /* Not enough chars for a successful match. */
2731 if (bkref_str_off + sl_str_diff > mctx->input.len)
2732 break;
2733
2734 err = clean_state_log_if_needed (mctx,
2735 bkref_str_off
2736 + sl_str_diff);
2737 if (BE (err != REG_NOERROR, 0))
2738 return err;
2739 buf = (const char *) re_string_get_buffer (&mctx->input);
2740 }
2741 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2742 /* We don't need to search this sub expression any more. */
2743 break;
2744 }
2745 bkref_str_off += sl_str_diff;
2746 sl_str += sl_str_diff;
2747 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2748 bkref_str_idx);
2749
2750 /* Reload buf, since the preceding call might have reallocated
2751 the buffer. */
2752 buf = (const char *) re_string_get_buffer (&mctx->input);
2753
2754 if (err == REG_NOMATCH)
2755 continue;
2756 if (BE (err != REG_NOERROR, 0))
2757 return err;
2758 }
2759
2760 if (sub_last_idx < sub_top->nlasts)
2761 continue;
2762 if (sub_last_idx > 0)
2763 ++sl_str;
2764 /* Then, search for the other last nodes of the sub expression. */
2765 for (; sl_str <= bkref_str_idx; ++sl_str)
2766 {
2767 int cls_node, sl_str_off;
2768 const re_node_set *nodes;
2769 sl_str_off = sl_str - sub_top->str_idx;
2770 /* The matched string by the sub expression match with the substring
2771 at the back reference? */
2772 if (sl_str_off > 0)
2773 {
2774 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2775 {
2776 /* If we are at the end of the input, we cannot match. */
2777 if (bkref_str_off >= mctx->input.len)
2778 break;
2779
2780 err = extend_buffers (mctx);
2781 if (BE (err != REG_NOERROR, 0))
2782 return err;
2783
2784 buf = (const char *) re_string_get_buffer (&mctx->input);
2785 }
2786 if (buf [bkref_str_off++] != buf[sl_str - 1])
2787 break; /* We don't need to search this sub expression
2788 any more. */
2789 }
2790 if (mctx->state_log[sl_str] == NULL)
2791 continue;
2792 /* Does this state have a ')' of the sub expression? */
2793 nodes = &mctx->state_log[sl_str]->nodes;
2794 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2795 OP_CLOSE_SUBEXP);
2796 if (cls_node == -1)
2797 continue; /* No. */
2798 if (sub_top->path == NULL)
2799 {
2800 sub_top->path = calloc (sizeof (state_array_t),
2801 sl_str - sub_top->str_idx + 1);
2802 if (sub_top->path == NULL)
2803 return REG_ESPACE;
2804 }
2805 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2806 in the current context? */
2807 err = check_arrival (mctx, sub_top->path, sub_top->node,
2808 sub_top->str_idx, cls_node, sl_str,
2809 OP_CLOSE_SUBEXP);
2810 if (err == REG_NOMATCH)
2811 continue;
2812 if (BE (err != REG_NOERROR, 0))
2813 return err;
2814 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2815 if (BE (sub_last == NULL, 0))
2816 return REG_ESPACE;
2817 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2818 bkref_str_idx);
2819 if (err == REG_NOMATCH)
2820 continue;
2821 }
2822 }
2823 return REG_NOERROR;
2824}
2825
2826/* Helper functions for get_subexp(). */
2827
2828/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2829 If it can arrive, register the sub expression expressed with SUB_TOP
2830 and SUB_LAST. */
2831
2832static reg_errcode_t
2833internal_function
2834get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2835 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2836{
2837 reg_errcode_t err;
2838 int to_idx;
2839 /* Can the subexpression arrive the back reference? */
2840 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2841 sub_last->str_idx, bkref_node, bkref_str,
2842 OP_OPEN_SUBEXP);
2843 if (err != REG_NOERROR)
2844 return err;
2845 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2846 sub_last->str_idx);
2847 if (BE (err != REG_NOERROR, 0))
2848 return err;
2849 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2850 return clean_state_log_if_needed (mctx, to_idx);
2851}
2852
2853/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2854 Search '(' if FL_OPEN, or search ')' otherwise.
2855 TODO: This function isn't efficient...
2856 Because there might be more than one nodes whose types are
2857 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2858 nodes.
2859 E.g. RE: (a){2} */
2860
2861static int
2862internal_function
2863find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2864 int subexp_idx, int type)
2865{
2866 int cls_idx;
2867 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2868 {
2869 int cls_node = nodes->elems[cls_idx];
2870 const re_token_t *node = dfa->nodes + cls_node;
2871 if (node->type == type
2872 && node->opr.idx == subexp_idx)
2873 return cls_node;
2874 }
2875 return -1;
2876}
2877
2878/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2879 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2880 heavily reused.
2881 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2882
2883static reg_errcode_t
2884internal_function
2885check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2886 int top_str, int last_node, int last_str, int type)
2887{
2888 const re_dfa_t *const dfa = mctx->dfa;
2889 reg_errcode_t err = REG_NOERROR;
2890 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2891 re_dfastate_t *cur_state = NULL;
2892 re_node_set *cur_nodes, next_nodes;
2893 re_dfastate_t **backup_state_log;
2894 unsigned int context;
2895
2896 subexp_num = dfa->nodes[top_node].opr.idx;
2897 /* Extend the buffer if we need. */
2898 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2899 {
2900 re_dfastate_t **new_array;
2901 int old_alloc = path->alloc;
2902 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2903 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2904 if (BE (new_array == NULL, 0))
2905 {
2906 path->alloc = old_alloc;
2907 return REG_ESPACE;
2908 }
2909 path->array = new_array;
2910 memset (new_array + old_alloc, '\0',
2911 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2912 }
2913
2914 str_idx = path->next_idx ? path->next_idx : top_str;
2915
2916 /* Temporary modify MCTX. */
2917 backup_state_log = mctx->state_log;
2918 backup_cur_idx = mctx->input.cur_idx;
2919 mctx->state_log = path->array;
2920 mctx->input.cur_idx = str_idx;
2921
2922 /* Setup initial node set. */
2923 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2924 if (str_idx == top_str)
2925 {
2926 err = re_node_set_init_1 (&next_nodes, top_node);
2927 if (BE (err != REG_NOERROR, 0))
2928 return err;
2929 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2930 if (BE (err != REG_NOERROR, 0))
2931 {
2932 re_node_set_free (&next_nodes);
2933 return err;
2934 }
2935 }
2936 else
2937 {
2938 cur_state = mctx->state_log[str_idx];
2939 if (cur_state && cur_state->has_backref)
2940 {
2941 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2942 if (BE (err != REG_NOERROR, 0))
2943 return err;
2944 }
2945 else
2946 re_node_set_init_empty (&next_nodes);
2947 }
2948 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2949 {
2950 if (next_nodes.nelem)
2951 {
2952 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2953 subexp_num, type);
2954 if (BE (err != REG_NOERROR, 0))
2955 {
2956 re_node_set_free (&next_nodes);
2957 return err;
2958 }
2959 }
2960 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2961 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2962 {
2963 re_node_set_free (&next_nodes);
2964 return err;
2965 }
2966 mctx->state_log[str_idx] = cur_state;
2967 }
2968
2969 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2970 {
2971 re_node_set_empty (&next_nodes);
2972 if (mctx->state_log[str_idx + 1])
2973 {
2974 err = re_node_set_merge (&next_nodes,
2975 &mctx->state_log[str_idx + 1]->nodes);
2976 if (BE (err != REG_NOERROR, 0))
2977 {
2978 re_node_set_free (&next_nodes);
2979 return err;
2980 }
2981 }
2982 if (cur_state)
2983 {
2984 err = check_arrival_add_next_nodes (mctx, str_idx,
2985 &cur_state->non_eps_nodes,
2986 &next_nodes);
2987 if (BE (err != REG_NOERROR, 0))
2988 {
2989 re_node_set_free (&next_nodes);
2990 return err;
2991 }
2992 }
2993 ++str_idx;
2994 if (next_nodes.nelem)
2995 {
2996 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2997 if (BE (err != REG_NOERROR, 0))
2998 {
2999 re_node_set_free (&next_nodes);
3000 return err;
3001 }
3002 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3003 subexp_num, type);
3004 if (BE (err != REG_NOERROR, 0))
3005 {
3006 re_node_set_free (&next_nodes);
3007 return err;
3008 }
3009 }
3010 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3011 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3012 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3013 {
3014 re_node_set_free (&next_nodes);
3015 return err;
3016 }
3017 mctx->state_log[str_idx] = cur_state;
3018 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3019 }
3020 re_node_set_free (&next_nodes);
3021 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3022 : &mctx->state_log[last_str]->nodes);
3023 path->next_idx = str_idx;
3024
3025 /* Fix MCTX. */
3026 mctx->state_log = backup_state_log;
3027 mctx->input.cur_idx = backup_cur_idx;
3028
3029 /* Then check the current node set has the node LAST_NODE. */
3030 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3031 return REG_NOERROR;
3032
3033 return REG_NOMATCH;
3034}
3035
3036/* Helper functions for check_arrival. */
3037
3038/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3039 to NEXT_NODES.
3040 TODO: This function is similar to the functions transit_state*(),
3041 however this function has many additional works.
3042 Can't we unify them? */
3043
3044static reg_errcode_t
3045internal_function
3046check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3047 re_node_set *cur_nodes, re_node_set *next_nodes)
3048{
3049 const re_dfa_t *const dfa = mctx->dfa;
3050 int result;
3051 int cur_idx;
3052#ifdef RE_ENABLE_I18N
3053 reg_errcode_t err = REG_NOERROR;
3054#endif
3055 re_node_set union_set;
3056 re_node_set_init_empty (&union_set);
3057 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3058 {
3059 int naccepted = 0;
3060 int cur_node = cur_nodes->elems[cur_idx];
3061#ifdef DEBUG
3062 re_token_type_t type = dfa->nodes[cur_node].type;
3063 assert (!IS_EPSILON_NODE (type));
3064#endif
3065#ifdef RE_ENABLE_I18N
3066 /* If the node may accept `multi byte'. */
3067 if (dfa->nodes[cur_node].accept_mb)
3068 {
3069 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3070 str_idx);
3071 if (naccepted > 1)
3072 {
3073 re_dfastate_t *dest_state;
3074 int next_node = dfa->nexts[cur_node];
3075 int next_idx = str_idx + naccepted;
3076 dest_state = mctx->state_log[next_idx];
3077 re_node_set_empty (&union_set);
3078 if (dest_state)
3079 {
3080 err = re_node_set_merge (&union_set, &dest_state->nodes);
3081 if (BE (err != REG_NOERROR, 0))
3082 {
3083 re_node_set_free (&union_set);
3084 return err;
3085 }
3086 }
3087 result = re_node_set_insert (&union_set, next_node);
3088 if (BE (result < 0, 0))
3089 {
3090 re_node_set_free (&union_set);
3091 return REG_ESPACE;
3092 }
3093 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3094 &union_set);
3095 if (BE (mctx->state_log[next_idx] == NULL
3096 && err != REG_NOERROR, 0))
3097 {
3098 re_node_set_free (&union_set);
3099 return err;
3100 }
3101 }
3102 }
3103#endif /* RE_ENABLE_I18N */
3104 if (naccepted
3105 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3106 {
3107 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3108 if (BE (result < 0, 0))
3109 {
3110 re_node_set_free (&union_set);
3111 return REG_ESPACE;
3112 }
3113 }
3114 }
3115 re_node_set_free (&union_set);
3116 return REG_NOERROR;
3117}
3118
3119/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3120 CUR_NODES, however exclude the nodes which are:
3121 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3122 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3123*/
3124
3125static reg_errcode_t
3126internal_function
3127check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3128 int ex_subexp, int type)
3129{
3130 reg_errcode_t err;
3131 int idx, outside_node;
3132 re_node_set new_nodes;
3133#ifdef DEBUG
3134 assert (cur_nodes->nelem);
3135#endif
3136 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3137 if (BE (err != REG_NOERROR, 0))
3138 return err;
3139 /* Create a new node set NEW_NODES with the nodes which are epsilon
3140 closures of the node in CUR_NODES. */
3141
3142 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3143 {
3144 int cur_node = cur_nodes->elems[idx];
3145 const re_node_set *eclosure = dfa->eclosures + cur_node;
3146 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3147 if (outside_node == -1)
3148 {
3149 /* There are no problematic nodes, just merge them. */
3150 err = re_node_set_merge (&new_nodes, eclosure);
3151 if (BE (err != REG_NOERROR, 0))
3152 {
3153 re_node_set_free (&new_nodes);
3154 return err;
3155 }
3156 }
3157 else
3158 {
3159 /* There are problematic nodes, re-calculate incrementally. */
3160 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3161 ex_subexp, type);
3162 if (BE (err != REG_NOERROR, 0))
3163 {
3164 re_node_set_free (&new_nodes);
3165 return err;
3166 }
3167 }
3168 }
3169 re_node_set_free (cur_nodes);
3170 *cur_nodes = new_nodes;
3171 return REG_NOERROR;
3172}
3173
3174/* Helper function for check_arrival_expand_ecl.
3175 Check incrementally the epsilon closure of TARGET, and if it isn't
3176 problematic append it to DST_NODES. */
3177
3178static reg_errcode_t
3179internal_function
3180check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3181 int target, int ex_subexp, int type)
3182{
3183 int cur_node;
3184 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3185 {
3186 int err;
3187
3188 if (dfa->nodes[cur_node].type == type
3189 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3190 {
3191 if (type == OP_CLOSE_SUBEXP)
3192 {
3193 err = re_node_set_insert (dst_nodes, cur_node);
3194 if (BE (err == -1, 0))
3195 return REG_ESPACE;
3196 }
3197 break;
3198 }
3199 err = re_node_set_insert (dst_nodes, cur_node);
3200 if (BE (err == -1, 0))
3201 return REG_ESPACE;
3202 if (dfa->edests[cur_node].nelem == 0)
3203 break;
3204 if (dfa->edests[cur_node].nelem == 2)
3205 {
3206 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3207 dfa->edests[cur_node].elems[1],
3208 ex_subexp, type);
3209 if (BE (err != REG_NOERROR, 0))
3210 return err;
3211 }
3212 cur_node = dfa->edests[cur_node].elems[0];
3213 }
3214 return REG_NOERROR;
3215}
3216
3217
3218/* For all the back references in the current state, calculate the
3219 destination of the back references by the appropriate entry
3220 in MCTX->BKREF_ENTS. */
3221
3222static reg_errcode_t
3223internal_function
3224expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3225 int cur_str, int subexp_num, int type)
3226{
3227 const re_dfa_t *const dfa = mctx->dfa;
3228 reg_errcode_t err;
3229 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3230 struct re_backref_cache_entry *ent;
3231
3232 if (cache_idx_start == -1)
3233 return REG_NOERROR;
3234
3235 restart:
3236 ent = mctx->bkref_ents + cache_idx_start;
3237 do
3238 {
3239 int to_idx, next_node;
3240
3241 /* Is this entry ENT is appropriate? */
3242 if (!re_node_set_contains (cur_nodes, ent->node))
3243 continue; /* No. */
3244
3245 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3246 /* Calculate the destination of the back reference, and append it
3247 to MCTX->STATE_LOG. */
3248 if (to_idx == cur_str)
3249 {
3250 /* The backreference did epsilon transit, we must re-check all the
3251 node in the current state. */
3252 re_node_set new_dests;
3253 reg_errcode_t err2, err3;
3254 next_node = dfa->edests[ent->node].elems[0];
3255 if (re_node_set_contains (cur_nodes, next_node))
3256 continue;
3257 err = re_node_set_init_1 (&new_dests, next_node);
3258 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3259 err3 = re_node_set_merge (cur_nodes, &new_dests);
3260 re_node_set_free (&new_dests);
3261 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3262 || err3 != REG_NOERROR, 0))
3263 {
3264 err = (err != REG_NOERROR ? err
3265 : (err2 != REG_NOERROR ? err2 : err3));
3266 return err;
3267 }
3268 /* TODO: It is still inefficient... */
3269 goto restart;
3270 }
3271 else
3272 {
3273 re_node_set union_set;
3274 next_node = dfa->nexts[ent->node];
3275 if (mctx->state_log[to_idx])
3276 {
3277 int ret;
3278 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3279 next_node))
3280 continue;
3281 err = re_node_set_init_copy (&union_set,
3282 &mctx->state_log[to_idx]->nodes);
3283 ret = re_node_set_insert (&union_set, next_node);
3284 if (BE (err != REG_NOERROR || ret < 0, 0))
3285 {
3286 re_node_set_free (&union_set);
3287 err = err != REG_NOERROR ? err : REG_ESPACE;
3288 return err;
3289 }
3290 }
3291 else
3292 {
3293 err = re_node_set_init_1 (&union_set, next_node);
3294 if (BE (err != REG_NOERROR, 0))
3295 return err;
3296 }
3297 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3298 re_node_set_free (&union_set);
3299 if (BE (mctx->state_log[to_idx] == NULL
3300 && err != REG_NOERROR, 0))
3301 return err;
3302 }
3303 }
3304 while (ent++->more);
3305 return REG_NOERROR;
3306}
3307
3308/* Build transition table for the state.
3309 Return 1 if succeeded, otherwise return NULL. */
3310
3311static int
3312internal_function
3313build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3314{
3315 reg_errcode_t err;
3316 int i, j, ch, need_word_trtable = 0;
3317 bitset_word_t elem, mask;
3318 bool dests_node_malloced = false;
3319 bool dest_states_malloced = false;
3320 int ndests; /* Number of the destination states from `state'. */
3321 re_dfastate_t **trtable;
3322 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3323 re_node_set follows, *dests_node;
3324 bitset_t *dests_ch;
3325 bitset_t acceptable;
3326
3327 struct dests_alloc
3328 {
3329 re_node_set dests_node[SBC_MAX];
3330 bitset_t dests_ch[SBC_MAX];
3331 } *dests_alloc;
3332
3333 /* We build DFA states which corresponds to the destination nodes
3334 from `state'. `dests_node[i]' represents the nodes which i-th
3335 destination state contains, and `dests_ch[i]' represents the
3336 characters which i-th destination state accepts. */
3337#ifdef HAVE_ALLOCA
3338 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3339 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3340 else
3341#endif
3342 {
3343 dests_alloc = re_malloc (struct dests_alloc, 1);
3344 if (BE (dests_alloc == NULL, 0))
3345 return 0;
3346 dests_node_malloced = true;
3347 }
3348 dests_node = dests_alloc->dests_node;
3349 dests_ch = dests_alloc->dests_ch;
3350
3351 /* Initialize transiton table. */
3352 state->word_trtable = state->trtable = NULL;
3353
3354 /* At first, group all nodes belonging to `state' into several
3355 destinations. */
3356 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3357 if (BE (ndests <= 0, 0))
3358 {
3359 if (dests_node_malloced)
3360 free (dests_alloc);
3361 /* Return 0 in case of an error, 1 otherwise. */
3362 if (ndests == 0)
3363 {
3364 state->trtable = (re_dfastate_t **)
3365 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3366 return 1;
3367 }
3368 return 0;
3369 }
3370
3371 err = re_node_set_alloc (&follows, ndests + 1);
3372 if (BE (err != REG_NOERROR, 0))
3373 goto out_free;
3374
3375 /* Avoid arithmetic overflow in size calculation. */
3376 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3377 / (3 * sizeof (re_dfastate_t *)))
3378 < ndests),
3379 0))
3380 goto out_free;
3381
3382#ifdef HAVE_ALLOCA
3383 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3384 + ndests * 3 * sizeof (re_dfastate_t *)))
3385 dest_states = (re_dfastate_t **)
3386 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3387 else
3388#endif
3389 {
3390 dest_states = (re_dfastate_t **)
3391 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3392 if (BE (dest_states == NULL, 0))
3393 {
3394out_free:
3395 if (dest_states_malloced)
3396 free (dest_states);
3397 re_node_set_free (&follows);
3398 for (i = 0; i < ndests; ++i)
3399 re_node_set_free (dests_node + i);
3400 if (dests_node_malloced)
3401 free (dests_alloc);
3402 return 0;
3403 }
3404 dest_states_malloced = true;
3405 }
3406 dest_states_word = dest_states + ndests;
3407 dest_states_nl = dest_states_word + ndests;
3408 bitset_empty (acceptable);
3409
3410 /* Then build the states for all destinations. */
3411 for (i = 0; i < ndests; ++i)
3412 {
3413 int next_node;
3414 re_node_set_empty (&follows);
3415 /* Merge the follows of this destination states. */
3416 for (j = 0; j < dests_node[i].nelem; ++j)
3417 {
3418 next_node = dfa->nexts[dests_node[i].elems[j]];
3419 if (next_node != -1)
3420 {
3421 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3422 if (BE (err != REG_NOERROR, 0))
3423 goto out_free;
3424 }
3425 }
3426 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3427 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3428 goto out_free;
3429 /* If the new state has context constraint,
3430 build appropriate states for these contexts. */
3431 if (dest_states[i]->has_constraint)
3432 {
3433 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3434 CONTEXT_WORD);
3435 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3436 goto out_free;
3437
3438 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3439 need_word_trtable = 1;
3440
3441 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3442 CONTEXT_NEWLINE);
3443 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3444 goto out_free;
3445 }
3446 else
3447 {
3448 dest_states_word[i] = dest_states[i];
3449 dest_states_nl[i] = dest_states[i];
3450 }
3451 bitset_merge (acceptable, dests_ch[i]);
3452 }
3453
3454 if (!BE (need_word_trtable, 0))
3455 {
3456 /* We don't care about whether the following character is a word
3457 character, or we are in a single-byte character set so we can
3458 discern by looking at the character code: allocate a
3459 256-entry transition table. */
3460 trtable = state->trtable =
3461 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3462 if (BE (trtable == NULL, 0))
3463 goto out_free;
3464
3465 /* For all characters ch...: */
3466 for (i = 0; i < BITSET_WORDS; ++i)
3467 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3468 elem;
3469 mask <<= 1, elem >>= 1, ++ch)
3470 if (BE (elem & 1, 0))
3471 {
3472 /* There must be exactly one destination which accepts
3473 character ch. See group_nodes_into_DFAstates. */
3474 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3475 ;
3476
3477 /* j-th destination accepts the word character ch. */
3478 if (dfa->word_char[i] & mask)
3479 trtable[ch] = dest_states_word[j];
3480 else
3481 trtable[ch] = dest_states[j];
3482 }
3483 }
3484 else
3485 {
3486 /* We care about whether the following character is a word
3487 character, and we are in a multi-byte character set: discern
3488 by looking at the character code: build two 256-entry
3489 transition tables, one starting at trtable[0] and one
3490 starting at trtable[SBC_MAX]. */
3491 trtable = state->word_trtable =
3492 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3493 if (BE (trtable == NULL, 0))
3494 goto out_free;
3495
3496 /* For all characters ch...: */
3497 for (i = 0; i < BITSET_WORDS; ++i)
3498 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3499 elem;
3500 mask <<= 1, elem >>= 1, ++ch)
3501 if (BE (elem & 1, 0))
3502 {
3503 /* There must be exactly one destination which accepts
3504 character ch. See group_nodes_into_DFAstates. */
3505 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3506 ;
3507
3508 /* j-th destination accepts the word character ch. */
3509 trtable[ch] = dest_states[j];
3510 trtable[ch + SBC_MAX] = dest_states_word[j];
3511 }
3512 }
3513
3514 /* new line */
3515 if (bitset_contain (acceptable, NEWLINE_CHAR))
3516 {
3517 /* The current state accepts newline character. */
3518 for (j = 0; j < ndests; ++j)
3519 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3520 {
3521 /* k-th destination accepts newline character. */
3522 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3523 if (need_word_trtable)
3524 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3525 /* There must be only one destination which accepts
3526 newline. See group_nodes_into_DFAstates. */
3527 break;
3528 }
3529 }
3530
3531 if (dest_states_malloced)
3532 free (dest_states);
3533
3534 re_node_set_free (&follows);
3535 for (i = 0; i < ndests; ++i)
3536 re_node_set_free (dests_node + i);
3537
3538 if (dests_node_malloced)
3539 free (dests_alloc);
3540
3541 return 1;
3542}
3543
3544/* Group all nodes belonging to STATE into several destinations.
3545 Then for all destinations, set the nodes belonging to the destination
3546 to DESTS_NODE[i] and set the characters accepted by the destination
3547 to DEST_CH[i]. This function return the number of destinations. */
3548
3549static int
3550internal_function
3551group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3552 re_node_set *dests_node, bitset_t *dests_ch)
3553{
3554 reg_errcode_t err;
3555 int result;
3556 int i, j, k;
3557 int ndests; /* Number of the destinations from `state'. */
3558 bitset_t accepts; /* Characters a node can accept. */
3559 const re_node_set *cur_nodes = &state->nodes;
3560 bitset_empty (accepts);
3561 ndests = 0;
3562
3563 /* For all the nodes belonging to `state', */
3564 for (i = 0; i < cur_nodes->nelem; ++i)
3565 {
3566 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3567 re_token_type_t type = node->type;
3568 unsigned int constraint = node->constraint;
3569
3570 /* Enumerate all single byte character this node can accept. */
3571 if (type == CHARACTER)
3572 bitset_set (accepts, node->opr.c);
3573 else if (type == SIMPLE_BRACKET)
3574 {
3575 bitset_merge (accepts, node->opr.sbcset);
3576 }
3577 else if (type == OP_PERIOD)
3578 {
3579#ifdef RE_ENABLE_I18N
3580 if (dfa->mb_cur_max > 1)
3581 bitset_merge (accepts, dfa->sb_char);
3582 else
3583#endif
3584 bitset_set_all (accepts);
3585 if (!(dfa->syntax & RE_DOT_NEWLINE))
3586 bitset_clear (accepts, '\n');
3587 if (dfa->syntax & RE_DOT_NOT_NULL)
3588 bitset_clear (accepts, '\0');
3589 }
3590#ifdef RE_ENABLE_I18N
3591 else if (type == OP_UTF8_PERIOD)
3592 {
3593 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3594 if (!(dfa->syntax & RE_DOT_NEWLINE))
3595 bitset_clear (accepts, '\n');
3596 if (dfa->syntax & RE_DOT_NOT_NULL)
3597 bitset_clear (accepts, '\0');
3598 }
3599#endif
3600 else
3601 continue;
3602
3603 /* Check the `accepts' and sift the characters which are not
3604 match it the context. */
3605 if (constraint)
3606 {
3607 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3608 {
3609 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3610 bitset_empty (accepts);
3611 if (accepts_newline)
3612 bitset_set (accepts, NEWLINE_CHAR);
3613 else
3614 continue;
3615 }
3616 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3617 {
3618 bitset_empty (accepts);
3619 continue;
3620 }
3621
3622 if (constraint & NEXT_WORD_CONSTRAINT)
3623 {
3624 bitset_word_t any_set = 0;
3625 if (type == CHARACTER && !node->word_char)
3626 {
3627 bitset_empty (accepts);
3628 continue;
3629 }
3630#ifdef RE_ENABLE_I18N
3631 if (dfa->mb_cur_max > 1)
3632 for (j = 0; j < BITSET_WORDS; ++j)
3633 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3634 else
3635#endif
3636 for (j = 0; j < BITSET_WORDS; ++j)
3637 any_set |= (accepts[j] &= dfa->word_char[j]);
3638 if (!any_set)
3639 continue;
3640 }
3641 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3642 {
3643 bitset_word_t any_set = 0;
3644 if (type == CHARACTER && node->word_char)
3645 {
3646 bitset_empty (accepts);
3647 continue;
3648 }
3649#ifdef RE_ENABLE_I18N
3650 if (dfa->mb_cur_max > 1)
3651 for (j = 0; j < BITSET_WORDS; ++j)
3652 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3653 else
3654#endif
3655 for (j = 0; j < BITSET_WORDS; ++j)
3656 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3657 if (!any_set)
3658 continue;
3659 }
3660 }
3661
3662 /* Then divide `accepts' into DFA states, or create a new
3663 state. Above, we make sure that accepts is not empty. */
3664 for (j = 0; j < ndests; ++j)
3665 {
3666 bitset_t intersec; /* Intersection sets, see below. */
3667 bitset_t remains;
3668 /* Flags, see below. */
3669 bitset_word_t has_intersec, not_subset, not_consumed;
3670
3671 /* Optimization, skip if this state doesn't accept the character. */
3672 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3673 continue;
3674
3675 /* Enumerate the intersection set of this state and `accepts'. */
3676 has_intersec = 0;
3677 for (k = 0; k < BITSET_WORDS; ++k)
3678 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3679 /* And skip if the intersection set is empty. */
3680 if (!has_intersec)
3681 continue;
3682
3683 /* Then check if this state is a subset of `accepts'. */
3684 not_subset = not_consumed = 0;
3685 for (k = 0; k < BITSET_WORDS; ++k)
3686 {
3687 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3688 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3689 }
3690
3691 /* If this state isn't a subset of `accepts', create a
3692 new group state, which has the `remains'. */
3693 if (not_subset)
3694 {
3695 bitset_copy (dests_ch[ndests], remains);
3696 bitset_copy (dests_ch[j], intersec);
3697 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3698 if (BE (err != REG_NOERROR, 0))
3699 goto error_return;
3700 ++ndests;
3701 }
3702
3703 /* Put the position in the current group. */
3704 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3705 if (BE (result < 0, 0))
3706 goto error_return;
3707
3708 /* If all characters are consumed, go to next node. */
3709 if (!not_consumed)
3710 break;
3711 }
3712 /* Some characters remain, create a new group. */
3713 if (j == ndests)
3714 {
3715 bitset_copy (dests_ch[ndests], accepts);
3716 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3717 if (BE (err != REG_NOERROR, 0))
3718 goto error_return;
3719 ++ndests;
3720 bitset_empty (accepts);
3721 }
3722 }
3723 return ndests;
3724 error_return:
3725 for (j = 0; j < ndests; ++j)
3726 re_node_set_free (dests_node + j);
3727 return -1;
3728}
3729
3730#ifdef RE_ENABLE_I18N
3731/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3732 Return the number of the bytes the node accepts.
3733 STR_IDX is the current index of the input string.
3734
3735 This function handles the nodes which can accept one character, or
3736 one collating element like '.', '[a-z]', opposite to the other nodes
3737 can only accept one byte. */
3738
3739static int
3740internal_function
3741check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3742 const re_string_t *input, int str_idx)
3743{
3744 const re_token_t *node = dfa->nodes + node_idx;
3745 int char_len, elem_len;
3746 int i;
3747 wint_t wc;
3748
3749 if (BE (node->type == OP_UTF8_PERIOD, 0))
3750 {
3751 unsigned char c = re_string_byte_at (input, str_idx), d;
3752 if (BE (c < 0xc2, 1))
3753 return 0;
3754
3755 if (str_idx + 2 > input->len)
3756 return 0;
3757
3758 d = re_string_byte_at (input, str_idx + 1);
3759 if (c < 0xe0)
3760 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3761 else if (c < 0xf0)
3762 {
3763 char_len = 3;
3764 if (c == 0xe0 && d < 0xa0)
3765 return 0;
3766 }
3767 else if (c < 0xf8)
3768 {
3769 char_len = 4;
3770 if (c == 0xf0 && d < 0x90)
3771 return 0;
3772 }
3773 else if (c < 0xfc)
3774 {
3775 char_len = 5;
3776 if (c == 0xf8 && d < 0x88)
3777 return 0;
3778 }
3779 else if (c < 0xfe)
3780 {
3781 char_len = 6;
3782 if (c == 0xfc && d < 0x84)
3783 return 0;
3784 }
3785 else
3786 return 0;
3787
3788 if (str_idx + char_len > input->len)
3789 return 0;
3790
3791 for (i = 1; i < char_len; ++i)
3792 {
3793 d = re_string_byte_at (input, str_idx + i);
3794 if (d < 0x80 || d > 0xbf)
3795 return 0;
3796 }
3797 return char_len;
3798 }
3799
3800 char_len = re_string_char_size_at (input, str_idx);
3801 if (node->type == OP_PERIOD)
3802 {
3803 if (char_len <= 1)
3804 return 0;
3805 /* FIXME: I don't think this if is needed, as both '\n'
3806 and '\0' are char_len == 1. */
3807 /* '.' accepts any one character except the following two cases. */
3808 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3809 re_string_byte_at (input, str_idx) == '\n') ||
3810 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3811 re_string_byte_at (input, str_idx) == '\0'))
3812 return 0;
3813 return char_len;
3814 }
3815
3816 elem_len = re_string_elem_size_at (input, str_idx);
3817 wc = __btowc(*(input->mbs+str_idx));
3818 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3819 return 0;
3820
3821 if (node->type == COMPLEX_BRACKET)
3822 {
3823 const re_charset_t *cset = node->opr.mbcset;
3824# ifdef _LIBC
3825 const unsigned char *pin
3826 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3827 int j;
3828 uint32_t nrules;
3829# endif /* _LIBC */
3830 int match_len = 0;
3831 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3832 ? re_string_wchar_at (input, str_idx) : 0);
3833
3834 /* match with multibyte character? */
3835 for (i = 0; i < cset->nmbchars; ++i)
3836 if (wc == cset->mbchars[i])
3837 {
3838 match_len = char_len;
3839 goto check_node_accept_bytes_match;
3840 }
3841 /* match with character_class? */
3842 for (i = 0; i < cset->nchar_classes; ++i)
3843 {
3844 wctype_t wt = cset->char_classes[i];
3845 if (__iswctype (wc, wt))
3846 {
3847 match_len = char_len;
3848 goto check_node_accept_bytes_match;
3849 }
3850 }
3851
3852# ifdef _LIBC
3853 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3854 if (nrules != 0)
3855 {
3856 unsigned int in_collseq = 0;
3857 const int32_t *table, *indirect;
3858 const unsigned char *weights, *extra;
3859 const char *collseqwc;
3860 /* This #include defines a local function! */
3861# include <locale/weight.h>
3862
3863 /* match with collating_symbol? */
3864 if (cset->ncoll_syms)
3865 extra = (const unsigned char *)
3866 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3867 for (i = 0; i < cset->ncoll_syms; ++i)
3868 {
3869 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3870 /* Compare the length of input collating element and
3871 the length of current collating element. */
3872 if (*coll_sym != elem_len)
3873 continue;
3874 /* Compare each bytes. */
3875 for (j = 0; j < *coll_sym; j++)
3876 if (pin[j] != coll_sym[1 + j])
3877 break;
3878 if (j == *coll_sym)
3879 {
3880 /* Match if every bytes is equal. */
3881 match_len = j;
3882 goto check_node_accept_bytes_match;
3883 }
3884 }
3885
3886 if (cset->nranges)
3887 {
3888 if (elem_len <= char_len)
3889 {
3890 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3891 in_collseq = __collseq_table_lookup (collseqwc, wc);
3892 }
3893 else
3894 in_collseq = find_collation_sequence_value (pin, elem_len);
3895 }
3896 /* match with range expression? */
3897 for (i = 0; i < cset->nranges; ++i)
3898 if (cset->range_starts[i] <= in_collseq
3899 && in_collseq <= cset->range_ends[i])
3900 {
3901 match_len = elem_len;
3902 goto check_node_accept_bytes_match;
3903 }
3904
3905 /* match with equivalence_class? */
3906 if (cset->nequiv_classes)
3907 {
3908 const unsigned char *cp = pin;
3909 table = (const int32_t *)
3910 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3911 weights = (const unsigned char *)
3912 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3913 extra = (const unsigned char *)
3914 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3915 indirect = (const int32_t *)
3916 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3917 int32_t idx = findidx (&cp);
3918 if (idx > 0)
3919 for (i = 0; i < cset->nequiv_classes; ++i)
3920 {
3921 int32_t equiv_class_idx = cset->equiv_classes[i];
3922 size_t weight_len = weights[idx & 0xffffff];
3923 if (weight_len == weights[equiv_class_idx & 0xffffff]
3924 && (idx >> 24) == (equiv_class_idx >> 24))
3925 {
3926 int cnt = 0;
3927
3928 idx &= 0xffffff;
3929 equiv_class_idx &= 0xffffff;
3930
3931 while (cnt <= weight_len
3932 && (weights[equiv_class_idx + 1 + cnt]
3933 == weights[idx + 1 + cnt]))
3934 ++cnt;
3935 if (cnt > weight_len)
3936 {
3937 match_len = elem_len;
3938 goto check_node_accept_bytes_match;
3939 }
3940 }
3941 }
3942 }
3943 }
3944 else
3945# endif /* _LIBC */
3946 {
3947 /* match with range expression? */
3948#if __GNUC__ >= 2
3949 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3950#else
3951 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3952 cmp_buf[2] = wc;
3953#endif
3954 for (i = 0; i < cset->nranges; ++i)
3955 {
3956 cmp_buf[0] = cset->range_starts[i];
3957 cmp_buf[4] = cset->range_ends[i];
3958 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3959 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3960 {
3961 match_len = char_len;
3962 goto check_node_accept_bytes_match;
3963 }
3964 }
3965 }
3966 check_node_accept_bytes_match:
3967 if (!cset->non_match)
3968 return match_len;
3969 else
3970 {
3971 if (match_len > 0)
3972 return 0;
3973 else
3974 return (elem_len > char_len) ? elem_len : char_len;
3975 }
3976 }
3977 return 0;
3978}
3979
3980# ifdef _LIBC
3981static unsigned int
3982internal_function
3983find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3984{
3985 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3986 if (nrules == 0)
3987 {
3988 if (mbs_len == 1)
3989 {
3990 /* No valid character. Match it as a single byte character. */
3991 const unsigned char *collseq = (const unsigned char *)
3992 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3993 return collseq[mbs[0]];
3994 }
3995 return UINT_MAX;
3996 }
3997 else
3998 {
3999 int32_t idx;
4000 const unsigned char *extra = (const unsigned char *)
4001 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4002 int32_t extrasize = (const unsigned char *)
4003 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4004
4005 for (idx = 0; idx < extrasize;)
4006 {
4007 int mbs_cnt, found = 0;
4008 int32_t elem_mbs_len;
4009 /* Skip the name of collating element name. */
4010 idx = idx + extra[idx] + 1;
4011 elem_mbs_len = extra[idx++];
4012 if (mbs_len == elem_mbs_len)
4013 {
4014 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4015 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4016 break;
4017 if (mbs_cnt == elem_mbs_len)
4018 /* Found the entry. */
4019 found = 1;
4020 }
4021 /* Skip the byte sequence of the collating element. */
4022 idx += elem_mbs_len;
4023 /* Adjust for the alignment. */
4024 idx = (idx + 3) & ~3;
4025 /* Skip the collation sequence value. */
4026 idx += sizeof (uint32_t);
4027 /* Skip the wide char sequence of the collating element. */
4028 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4029 /* If we found the entry, return the sequence value. */
4030 if (found)
4031 return *(uint32_t *) (extra + idx);
4032 /* Skip the collation sequence value. */
4033 idx += sizeof (uint32_t);
4034 }
4035 return UINT_MAX;
4036 }
4037}
4038# endif /* _LIBC */
4039#endif /* RE_ENABLE_I18N */
4040
4041/* Check whether the node accepts the byte which is IDX-th
4042 byte of the INPUT. */
4043
4044static int
4045internal_function
4046check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4047 int idx)
4048{
4049 unsigned char ch;
4050 ch = re_string_byte_at (&mctx->input, idx);
4051 switch (node->type)
4052 {
4053 case CHARACTER:
4054 if (node->opr.c != ch)
4055 return 0;
4056 break;
4057
4058 case SIMPLE_BRACKET:
4059 if (!bitset_contain (node->opr.sbcset, ch))
4060 return 0;
4061 break;
4062
4063#ifdef RE_ENABLE_I18N
4064 case OP_UTF8_PERIOD:
4065 if (ch >= 0x80)
4066 return 0;
4067 /* FALLTHROUGH */
4068#endif
4069 case OP_PERIOD:
4070 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4071 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4072 return 0;
4073 break;
4074
4075 default:
4076 return 0;
4077 }
4078
4079 if (node->constraint)
4080 {
4081 /* The node has constraints. Check whether the current context
4082 satisfies the constraints. */
4083 unsigned int context = re_string_context_at (&mctx->input, idx,
4084 mctx->eflags);
4085 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4086 return 0;
4087 }
4088
4089 return 1;
4090}
4091
4092/* Extend the buffers, if the buffers have run out. */
4093
4094static reg_errcode_t
4095internal_function
4096extend_buffers (re_match_context_t *mctx)
4097{
4098 reg_errcode_t ret;
4099 re_string_t *pstr = &mctx->input;
4100
4101 /* Avoid overflow. */
4102 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4103 return REG_ESPACE;
4104
4105 /* Double the lengthes of the buffers. */
4106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4107 if (BE (ret != REG_NOERROR, 0))
4108 return ret;
4109
4110 if (mctx->state_log != NULL)
4111 {
4112 /* And double the length of state_log. */
4113 /* XXX We have no indication of the size of this buffer. If this
4114 allocation fail we have no indication that the state_log array
4115 does not have the right size. */
4116 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4117 pstr->bufs_len + 1);
4118 if (BE (new_array == NULL, 0))
4119 return REG_ESPACE;
4120 mctx->state_log = new_array;
4121 }
4122
4123 /* Then reconstruct the buffers. */
4124 if (pstr->icase)
4125 {
4126#ifdef RE_ENABLE_I18N
4127 if (pstr->mb_cur_max > 1)
4128 {
4129 ret = build_wcs_upper_buffer (pstr);
4130 if (BE (ret != REG_NOERROR, 0))
4131 return ret;
4132 }
4133 else
4134#endif /* RE_ENABLE_I18N */
4135 build_upper_buffer (pstr);
4136 }
4137 else
4138 {
4139#ifdef RE_ENABLE_I18N
4140 if (pstr->mb_cur_max > 1)
4141 build_wcs_buffer (pstr);
4142 else
4143#endif /* RE_ENABLE_I18N */
4144 {
4145 if (pstr->trans != NULL)
4146 re_string_translate_buffer (pstr);
4147 }
4148 }
4149 return REG_NOERROR;
4150}
4151
4152\f
4153/* Functions for matching context. */
4154
4155/* Initialize MCTX. */
4156
4157static reg_errcode_t
4158internal_function
4159match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4160{
4161 mctx->eflags = eflags;
4162 mctx->match_last = -1;
4163 if (n > 0)
4164 {
4165 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4166 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4167 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4168 return REG_ESPACE;
4169 }
4170 /* Already zero-ed by the caller.
4171 else
4172 mctx->bkref_ents = NULL;
4173 mctx->nbkref_ents = 0;
4174 mctx->nsub_tops = 0; */
4175 mctx->abkref_ents = n;
4176 mctx->max_mb_elem_len = 1;
4177 mctx->asub_tops = n;
4178 return REG_NOERROR;
4179}
4180
4181/* Clean the entries which depend on the current input in MCTX.
4182 This function must be invoked when the matcher changes the start index
4183 of the input, or changes the input string. */
4184
4185static void
4186internal_function
4187match_ctx_clean (re_match_context_t *mctx)
4188{
4189 int st_idx;
4190 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4191 {
4192 int sl_idx;
4193 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4194 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4195 {
4196 re_sub_match_last_t *last = top->lasts[sl_idx];
4197 re_free (last->path.array);
4198 re_free (last);
4199 }
4200 re_free (top->lasts);
4201 if (top->path)
4202 {
4203 re_free (top->path->array);
4204 re_free (top->path);
4205 }
4206 free (top);
4207 }
4208
4209 mctx->nsub_tops = 0;
4210 mctx->nbkref_ents = 0;
4211}
4212
4213/* Free all the memory associated with MCTX. */
4214
4215static void
4216internal_function
4217match_ctx_free (re_match_context_t *mctx)
4218{
4219 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4220 match_ctx_clean (mctx);
4221 re_free (mctx->sub_tops);
4222 re_free (mctx->bkref_ents);
4223}
4224
4225/* Add a new backreference entry to MCTX.
4226 Note that we assume that caller never call this function with duplicate
4227 entry, and call with STR_IDX which isn't smaller than any existing entry.
4228*/
4229
4230static reg_errcode_t
4231internal_function
4232match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4233 int to)
4234{
4235 if (mctx->nbkref_ents >= mctx->abkref_ents)
4236 {
4237 struct re_backref_cache_entry* new_entry;
4238 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4239 mctx->abkref_ents * 2);
4240 if (BE (new_entry == NULL, 0))
4241 {
4242 re_free (mctx->bkref_ents);
4243 return REG_ESPACE;
4244 }
4245 mctx->bkref_ents = new_entry;
4246 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4247 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4248 mctx->abkref_ents *= 2;
4249 }
4250 if (mctx->nbkref_ents > 0
4251 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4252 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4253
4254 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4255 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4256 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4257 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4258
4259 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4260 If bit N is clear, means that this entry won't epsilon-transition to
4261 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4262 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4263 such node.
4264
4265 A backreference does not epsilon-transition unless it is empty, so set
4266 to all zeros if FROM != TO. */
4267 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4268 = (from == to ? ~0 : 0);
4269
4270 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4271 if (mctx->max_mb_elem_len < to - from)
4272 mctx->max_mb_elem_len = to - from;
4273 return REG_NOERROR;
4274}
4275
4276/* Search for the first entry which has the same str_idx, or -1 if none is
4277 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4278
4279static int
4280internal_function
4281search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4282{
4283 int left, right, mid, last;
4284 last = right = mctx->nbkref_ents;
4285 for (left = 0; left < right;)
4286 {
4287 mid = (left + right) / 2;
4288 if (mctx->bkref_ents[mid].str_idx < str_idx)
4289 left = mid + 1;
4290 else
4291 right = mid;
4292 }
4293 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4294 return left;
4295 else
4296 return -1;
4297}
4298
4299/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4300 at STR_IDX. */
4301
4302static reg_errcode_t
4303internal_function
4304match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4305{
4306#ifdef DEBUG
4307 assert (mctx->sub_tops != NULL);
4308 assert (mctx->asub_tops > 0);
4309#endif
4310 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4311 {
4312 int new_asub_tops = mctx->asub_tops * 2;
4313 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4314 re_sub_match_top_t *,
4315 new_asub_tops);
4316 if (BE (new_array == NULL, 0))
4317 return REG_ESPACE;
4318 mctx->sub_tops = new_array;
4319 mctx->asub_tops = new_asub_tops;
4320 }
4321 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4322 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4323 return REG_ESPACE;
4324 mctx->sub_tops[mctx->nsub_tops]->node = node;
4325 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4326 return REG_NOERROR;
4327}
4328
4329/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4330 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4331
4332static re_sub_match_last_t *
4333internal_function
4334match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4335{
4336 re_sub_match_last_t *new_entry;
4337 if (BE (subtop->nlasts == subtop->alasts, 0))
4338 {
4339 int new_alasts = 2 * subtop->alasts + 1;
4340 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4341 re_sub_match_last_t *,
4342 new_alasts);
4343 if (BE (new_array == NULL, 0))
4344 return NULL;
4345 subtop->lasts = new_array;
4346 subtop->alasts = new_alasts;
4347 }
4348 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4349 if (BE (new_entry != NULL, 1))
4350 {
4351 subtop->lasts[subtop->nlasts] = new_entry;
4352 new_entry->node = node;
4353 new_entry->str_idx = str_idx;
4354 ++subtop->nlasts;
4355 }
4356 return new_entry;
4357}
4358
4359static void
4360internal_function
4361sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4362 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4363{
4364 sctx->sifted_states = sifted_sts;
4365 sctx->limited_states = limited_sts;
4366 sctx->last_node = last_node;
4367 sctx->last_str_idx = last_str_idx;
4368 re_node_set_init_empty (&sctx->limits);
4369}