1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
76
77#ifdef RE_ENABLE_I18N
78static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82#endif /* RE_ENABLE_I18N */
83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137#if 0
138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142#endif
143#ifdef RE_ENABLE_I18N
144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147#endif /* RE_ENABLE_I18N */
148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184#ifdef RE_ENABLE_I18N
185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188# ifdef _LIBC
189static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192# endif /* _LIBC */
193#endif /* RE_ENABLE_I18N */
194static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
203\f
204/* Entry point for POSIX code. */
205
206/* regexec searches for a given pattern, specified by PREG, in the
207 string STRING.
208
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
213
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
217
218 We return 0 if we find a match and REG_NOMATCH if not. */
219
220int
221regexec (
222 const regex_t *__restrict preg,
223 const char *__restrict string,
224 size_t nmatch,
225 regmatch_t pmatch[],
226 int eflags)
227{
228 reg_errcode_t err;
229 int start, length;
230
231 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
232 return REG_BADPAT;
233
234 if (eflags & REG_STARTEND)
235 {
236 start = pmatch[0].rm_so;
237 length = pmatch[0].rm_eo;
238 }
239 else
240 {
241 start = 0;
242 length = strlen (string);
243 }
244
245 __libc_lock_lock (dfa->lock);
246 if (preg->no_sub)
247 err = re_search_internal (preg, string, length, start, length - start,
248 length, 0, NULL, eflags);
249 else
250 err = re_search_internal (preg, string, length, start, length - start,
251 length, nmatch, pmatch, eflags);
252 __libc_lock_unlock (dfa->lock);
253 return err != REG_NOERROR;
254}
255
256#ifdef _LIBC
257# include <shlib-compat.h>
258versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
259
260# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
261__typeof__ (__regexec) __compat_regexec;
262
263int
264attribute_compat_text_section
265__compat_regexec (const regex_t *__restrict preg,
266 const char *__restrict string, size_t nmatch,
267 regmatch_t pmatch[], int eflags)
268{
269 return regexec (preg, string, nmatch, pmatch,
270 eflags & (REG_NOTBOL | REG_NOTEOL));
271}
272compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
273# endif
274#endif
275
276/* Entry points for GNU code. */
277
278/* re_match, re_search, re_match_2, re_search_2
279
280 The former two functions operate on STRING with length LENGTH,
281 while the later two operate on concatenation of STRING1 and STRING2
282 with lengths LENGTH1 and LENGTH2, respectively.
283
284 re_match() matches the compiled pattern in BUFP against the string,
285 starting at index START.
286
287 re_search() first tries matching at index START, then it tries to match
288 starting from index START + 1, and so on. The last start position tried
289 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
290 way as re_match().)
291
292 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
293 the first STOP characters of the concatenation of the strings should be
294 concerned.
295
296 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
297 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
298 computed relative to the concatenation, not relative to the individual
299 strings.)
300
301 On success, re_match* functions return the length of the match, re_search*
302 return the position of the start of the match. Return value -1 means no
303 match was found and -2 indicates an internal error. */
304
305int
306re_match (struct re_pattern_buffer *bufp,
307 const char *string,
308 int length,
309 int start,
310 struct re_registers *regs)
311{
312 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
313}
314#ifdef _LIBC
315weak_alias (__re_match, re_match)
316#endif
317
318int
319re_search (struct re_pattern_buffer *bufp,
320 const char *string,
321 int length, int start, int range,
322 struct re_registers *regs)
323{
324 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
325}
326#ifdef _LIBC
327weak_alias (__re_search, re_search)
328#endif
329
330int
331re_match_2 (struct re_pattern_buffer *bufp,
332 const char *string1, int length1,
333 const char *string2, int length2, int start,
334 struct re_registers *regs, int stop)
335{
336 return re_search_2_stub (bufp, string1, length1, string2, length2,
337 start, 0, regs, stop, 1);
338}
339#ifdef _LIBC
340weak_alias (__re_match_2, re_match_2)
341#endif
342
343int
344re_search_2 (struct re_pattern_buffer *bufp,
345 const char *string1, int length1,
346 const char *string2, int length2, int start,
347 int range, struct re_registers *regs, int stop)
348{
349 return re_search_2_stub (bufp, string1, length1, string2, length2,
350 start, range, regs, stop, 0);
351}
352#ifdef _LIBC
353weak_alias (__re_search_2, re_search_2)
354#endif
355
356static int
357re_search_2_stub (struct re_pattern_buffer *bufp,
358 const char *string1, int length1,
359 const char *string2, int length2, int start,
360 int range, struct re_registers *regs,
361 int stop, int ret_len)
362{
363 const char *str;
364 int rval;
365 int len = length1 + length2;
366 int free_str = 0;
367
368 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
369 return -2;
370
371 /* Concatenate the strings. */
372 if (length2 > 0)
373 if (length1 > 0)
374 {
375 char *s = re_malloc (char, len);
376
377 if (BE (s == NULL, 0))
378 return -2;
379 memcpy (s, string1, length1);
380 memcpy (s + length1, string2, length2);
381 str = s;
382 free_str = 1;
383 }
384 else
385 str = string2;
386 else
387 str = string1;
388
389 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
390 if (free_str)
391 re_free ((char *) str);
392 return rval;
393}
394
395/* The parameters have the same meaning as those of re_search.
396 Additional parameters:
397 If RET_LEN is nonzero the length of the match is returned (re_match style);
398 otherwise the position of the match is returned. */
399
400static int
401re_search_stub (struct re_pattern_buffer *bufp,
402 const char *string, int length, int start,
403 int range, int stop,
404 struct re_registers *regs, int ret_len)
405{
406 reg_errcode_t result;
407 regmatch_t *pmatch;
408 int nregs, rval;
409 int eflags = 0;
410
411 /* Check for out-of-range. */
412 if (BE (start < 0 || start > length, 0))
413 return -1;
414 if (BE (start + range > length, 0))
415 range = length - start;
416 else if (BE (start + range < 0, 0))
417 range = -start;
418
419 __libc_lock_lock (dfa->lock);
420
421 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
422 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
423
424 /* Compile fastmap if we haven't yet. */
425 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
426 re_compile_fastmap (bufp);
427
428 if (BE (bufp->no_sub, 0))
429 regs = NULL;
430
431 /* We need at least 1 register. */
432 if (regs == NULL)
433 nregs = 1;
434 else if (BE (bufp->regs_allocated == REGS_FIXED &&
435 regs->num_regs < bufp->re_nsub + 1, 0))
436 {
437 nregs = regs->num_regs;
438 if (BE (nregs < 1, 0))
439 {
440 /* Nothing can be copied to regs. */
441 regs = NULL;
442 nregs = 1;
443 }
444 }
445 else
446 nregs = bufp->re_nsub + 1;
447 pmatch = re_malloc (regmatch_t, nregs);
448 if (BE (pmatch == NULL, 0))
449 {
450 rval = -2;
451 goto out;
452 }
453
454 result = re_search_internal (bufp, string, length, start, range, stop,
455 nregs, pmatch, eflags);
456
457 rval = 0;
458
459 /* I hope we needn't fill ther regs with -1's when no match was found. */
460 if (result != REG_NOERROR)
461 rval = -1;
462 else if (regs != NULL)
463 {
464 /* If caller wants register contents data back, copy them. */
465 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
466 bufp->regs_allocated);
467 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
468 rval = -2;
469 }
470
471 if (BE (rval == 0, 1))
472 {
473 if (ret_len)
474 {
475 assert (pmatch[0].rm_so == start);
476 rval = pmatch[0].rm_eo - start;
477 }
478 else
479 rval = pmatch[0].rm_so;
480 }
481 re_free (pmatch);
482 out:
483 __libc_lock_unlock (dfa->lock);
484 return rval;
485}
486
487static unsigned
488re_copy_regs (struct re_registers *regs,
489 regmatch_t *pmatch,
490 int nregs, int regs_allocated)
491{
492 int rval = REGS_REALLOCATE;
493 int i;
494 int need_regs = nregs + 1;
495 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
496 uses. */
497
498 /* Have the register data arrays been allocated? */
499 if (regs_allocated == REGS_UNALLOCATED)
500 { /* No. So allocate them with malloc. */
501 regs->start = re_malloc (regoff_t, need_regs);
502 if (BE (regs->start == NULL, 0))
503 return REGS_UNALLOCATED;
504 regs->end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->end == NULL, 0))
506 {
507 re_free (regs->start);
508 return REGS_UNALLOCATED;
509 }
510 regs->num_regs = need_regs;
511 }
512 else if (regs_allocated == REGS_REALLOCATE)
513 { /* Yes. If we need more elements than were already
514 allocated, reallocate them. If we need fewer, just
515 leave it alone. */
516 if (BE (need_regs > regs->num_regs, 0))
517 {
518 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
519 regoff_t *new_end;
520 if (BE (new_start == NULL, 0))
521 return REGS_UNALLOCATED;
522 new_end = re_realloc (regs->end, regoff_t, need_regs);
523 if (BE (new_end == NULL, 0))
524 {
525 re_free (new_start);
526 return REGS_UNALLOCATED;
527 }
528 regs->start = new_start;
529 regs->end = new_end;
530 regs->num_regs = need_regs;
531 }
532 }
533 else
534 {
535 assert (regs_allocated == REGS_FIXED);
536 /* This function may not be called with REGS_FIXED and nregs too big. */
537 assert (regs->num_regs >= nregs);
538 rval = REGS_FIXED;
539 }
540
541 /* Copy the regs. */
542 for (i = 0; i < nregs; ++i)
543 {
544 regs->start[i] = pmatch[i].rm_so;
545 regs->end[i] = pmatch[i].rm_eo;
546 }
547 for ( ; i < regs->num_regs; ++i)
548 regs->start[i] = regs->end[i] = -1;
549
550 return rval;
551}
552
553/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
554 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
555 this memory for recording register information. STARTS and ENDS
556 must be allocated using the malloc library routine, and must each
557 be at least NUM_REGS * sizeof (regoff_t) bytes long.
558
559 If NUM_REGS == 0, then subsequent matches should allocate their own
560 register data.
561
562 Unless this function is called, the first search or match using
563 PATTERN_BUFFER will allocate its own register data, without
564 freeing the old data. */
565
566void
567re_set_registers (struct re_pattern_buffer *bufp,
568 struct re_registers *regs,
569 unsigned num_regs,
570 regoff_t *starts,
571 regoff_t *ends)
572{
573 if (num_regs)
574 {
575 bufp->regs_allocated = REGS_REALLOCATE;
576 regs->num_regs = num_regs;
577 regs->start = starts;
578 regs->end = ends;
579 }
580 else
581 {
582 bufp->regs_allocated = REGS_UNALLOCATED;
583 regs->num_regs = 0;
584 regs->start = regs->end = (regoff_t *) 0;
585 }
586}
587#ifdef _LIBC
588weak_alias (__re_set_registers, re_set_registers)
589#endif
590\f
591/* Entry points compatible with 4.2 BSD regex library. We don't define
592 them unless specifically requested. */
593
594#if defined _REGEX_RE_COMP || defined _LIBC
595int
596# ifdef _LIBC
597weak_function
598# endif
599re_exec (s)
600 const char *s;
601{
602 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
603}
604#endif /* _REGEX_RE_COMP */
605\f
606/* Internal entry point. */
607
608/* Searches for a compiled pattern PREG in the string STRING, whose
609 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
610 mingings with regexec. START, and RANGE have the same meanings
611 with re_search.
612 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
613 otherwise return the error code.
614 Note: We assume front end functions already check ranges.
615 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
616
617static reg_errcode_t
618re_search_internal (const regex_t *preg,
619 const char *string,
620 int length, int start, int range, int stop,
621 size_t nmatch, regmatch_t pmatch[],
622 int eflags)
623{
624 reg_errcode_t err;
625 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
626 int left_lim, right_lim, incr;
627 int fl_longest_match, match_first, match_kind, match_last = -1;
628 int extra_nmatch;
629 int sb, ch;
630#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
631 re_match_context_t mctx = { .dfa = dfa };
632#else
633 re_match_context_t mctx;
634#endif
635 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
636 && range && !preg->can_be_null) ? preg->fastmap : NULL;
637 RE_TRANSLATE_TYPE t = preg->translate;
638
639#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
640 memset (&mctx, '\0', sizeof (re_match_context_t));
641 mctx.dfa = dfa;
642#endif
643
644 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
645 nmatch -= extra_nmatch;
646
647 /* Check if the DFA haven't been compiled. */
648 if (BE (preg->used == 0 || dfa->init_state == NULL
649 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
650 || dfa->init_state_begbuf == NULL, 0))
651 return REG_NOMATCH;
652
653#ifdef DEBUG
654 /* We assume front-end functions already check them. */
655 assert (start + range >= 0 && start + range <= length);
656#endif
657
658 /* If initial states with non-begbuf contexts have no elements,
659 the regex must be anchored. If preg->newline_anchor is set,
660 we'll never use init_state_nl, so do not check it. */
661 if (dfa->init_state->nodes.nelem == 0
662 && dfa->init_state_word->nodes.nelem == 0
663 && (dfa->init_state_nl->nodes.nelem == 0
664 || !preg->newline_anchor))
665 {
666 if (start != 0 && start + range != 0)
667 return REG_NOMATCH;
668 start = range = 0;
669 }
670
671 /* We must check the longest matching, if nmatch > 0. */
672 fl_longest_match = (nmatch != 0 || dfa->nbackref);
673
674 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
675 preg->translate, preg->syntax & RE_ICASE, dfa);
676 if (BE (err != REG_NOERROR, 0))
677 goto free_return;
678 mctx.input.stop = stop;
679 mctx.input.raw_stop = stop;
680 mctx.input.newline_anchor = preg->newline_anchor;
681
682 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
683 if (BE (err != REG_NOERROR, 0))
684 goto free_return;
685
686 /* We will log all the DFA states through which the dfa pass,
687 if nmatch > 1, or this dfa has "multibyte node", which is a
688 back-reference or a node which can accept multibyte character or
689 multi character collating element. */
690 if (nmatch > 1 || dfa->has_mb_node)
691 {
692 /* Avoid overflow. */
693 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
694 {
695 err = REG_ESPACE;
696 goto free_return;
697 }
698
699 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
700 if (BE (mctx.state_log == NULL, 0))
701 {
702 err = REG_ESPACE;
703 goto free_return;
704 }
705 }
706 else
707 mctx.state_log = NULL;
708
709 match_first = start;
710 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
711 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
712
713 /* Check incrementally whether of not the input string match. */
714 incr = (range < 0) ? -1 : 1;
715 left_lim = (range < 0) ? start + range : start;
716 right_lim = (range < 0) ? start : start + range;
717 sb = dfa->mb_cur_max == 1;
718 match_kind =
719 (fastmap
720 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
721 | (range >= 0 ? 2 : 0)
722 | (t != NULL ? 1 : 0))
723 : 8);
724
725 for (;; match_first += incr)
726 {
727 err = REG_NOMATCH;
728 if (match_first < left_lim || right_lim < match_first)
729 goto free_return;
730
731 /* Advance as rapidly as possible through the string, until we
732 find a plausible place to start matching. This may be done
733 with varying efficiency, so there are various possibilities:
734 only the most common of them are specialized, in order to
735 save on code size. We use a switch statement for speed. */
736 switch (match_kind)
737 {
738 case 8:
739 /* No fastmap. */
740 break;
741
742 case 7:
743 /* Fastmap with single-byte translation, match forward. */
744 while (BE (match_first < right_lim, 1)
745 && !fastmap[t[(unsigned char) string[match_first]]])
746 ++match_first;
747 goto forward_match_found_start_or_reached_end;
748
749 case 6:
750 /* Fastmap without translation, match forward. */
751 while (BE (match_first < right_lim, 1)
752 && !fastmap[(unsigned char) string[match_first]])
753 ++match_first;
754
755 forward_match_found_start_or_reached_end:
756 if (BE (match_first == right_lim, 0))
757 {
758 ch = match_first >= length
759 ? 0 : (unsigned char) string[match_first];
760 if (!fastmap[t ? t[ch] : ch])
761 goto free_return;
762 }
763 break;
764
765 case 4:
766 case 5:
767 /* Fastmap without multi-byte translation, match backwards. */
768 while (match_first >= left_lim)
769 {
770 ch = match_first >= length
771 ? 0 : (unsigned char) string[match_first];
772 if (fastmap[t ? t[ch] : ch])
773 break;
774 --match_first;
775 }
776 if (match_first < left_lim)
777 goto free_return;
778 break;
779
780 default:
781 /* In this case, we can't determine easily the current byte,
782 since it might be a component byte of a multibyte
783 character. Then we use the constructed buffer instead. */
784 for (;;)
785 {
786 /* If MATCH_FIRST is out of the valid range, reconstruct the
787 buffers. */
788 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
789 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
790 {
791 err = re_string_reconstruct (&mctx.input, match_first,
792 eflags);
793 if (BE (err != REG_NOERROR, 0))
794 goto free_return;
795
796 offset = match_first - mctx.input.raw_mbs_idx;
797 }
798 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
799 Note that MATCH_FIRST must not be smaller than 0. */
800 ch = (match_first >= length
801 ? 0 : re_string_byte_at (&mctx.input, offset));
802 if (fastmap[ch])
803 break;
804 match_first += incr;
805 if (match_first < left_lim || match_first > right_lim)
806 {
807 err = REG_NOMATCH;
808 goto free_return;
809 }
810 }
811 break;
812 }
813
814 /* Reconstruct the buffers so that the matcher can assume that
815 the matching starts from the beginning of the buffer. */
816 err = re_string_reconstruct (&mctx.input, match_first, eflags);
817 if (BE (err != REG_NOERROR, 0))
818 goto free_return;
819
820#ifdef RE_ENABLE_I18N
821 /* Don't consider this char as a possible match start if it part,
822 yet isn't the head, of a multibyte character. */
823 if (!sb && !re_string_first_byte (&mctx.input, 0))
824 continue;
825#endif
826
827 /* It seems to be appropriate one, then use the matcher. */
828 /* We assume that the matching starts from 0. */
829 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
830 match_last = check_matching (&mctx, fl_longest_match,
831 range >= 0 ? &match_first : NULL);
832 if (match_last != -1)
833 {
834 if (BE (match_last == -2, 0))
835 {
836 err = REG_ESPACE;
837 goto free_return;
838 }
839 else
840 {
841 mctx.match_last = match_last;
842 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
843 {
844 re_dfastate_t *pstate = mctx.state_log[match_last];
845 mctx.last_node = check_halt_state_context (&mctx, pstate,
846 match_last);
847 }
848 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
849 || dfa->nbackref)
850 {
851 err = prune_impossible_nodes (&mctx);
852 if (err == REG_NOERROR)
853 break;
854 if (BE (err != REG_NOMATCH, 0))
855 goto free_return;
856 match_last = -1;
857 }
858 else
859 break; /* We found a match. */
860 }
861 }
862
863 match_ctx_clean (&mctx);
864 }
865
866#ifdef DEBUG
867 assert (match_last != -1);
868 assert (err == REG_NOERROR);
869#endif
870
871 /* Set pmatch[] if we need. */
872 if (nmatch > 0)
873 {
874 int reg_idx;
875
876 /* Initialize registers. */
877 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
878 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
879
880 /* Set the points where matching start/end. */
881 pmatch[0].rm_so = 0;
882 pmatch[0].rm_eo = mctx.match_last;
883
884 if (!preg->no_sub && nmatch > 1)
885 {
886 err = set_regs (preg, &mctx, nmatch, pmatch,
887 dfa->has_plural_match && dfa->nbackref > 0);
888 if (BE (err != REG_NOERROR, 0))
889 goto free_return;
890 }
891
892 /* At last, add the offset to the each registers, since we slided
893 the buffers so that we could assume that the matching starts
894 from 0. */
895 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
896 if (pmatch[reg_idx].rm_so != -1)
897 {
898#ifdef RE_ENABLE_I18N
899 if (BE (mctx.input.offsets_needed != 0, 0))
900 {
901 pmatch[reg_idx].rm_so =
902 (pmatch[reg_idx].rm_so == mctx.input.valid_len
903 ? mctx.input.valid_raw_len
904 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
905 pmatch[reg_idx].rm_eo =
906 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
907 ? mctx.input.valid_raw_len
908 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
909 }
910#else
911 assert (mctx.input.offsets_needed == 0);
912#endif
913 pmatch[reg_idx].rm_so += match_first;
914 pmatch[reg_idx].rm_eo += match_first;
915 }
916 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
917 {
918 pmatch[nmatch + reg_idx].rm_so = -1;
919 pmatch[nmatch + reg_idx].rm_eo = -1;
920 }
921
922 if (dfa->subexp_map)
923 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
924 if (dfa->subexp_map[reg_idx] != reg_idx)
925 {
926 pmatch[reg_idx + 1].rm_so
927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
928 pmatch[reg_idx + 1].rm_eo
929 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
930 }
931 }
932
933 free_return:
934 re_free (mctx.state_log);
935 if (dfa->nbackref)
936 match_ctx_free (&mctx);
937 re_string_destruct (&mctx.input);
938 return err;
939}
940
941static reg_errcode_t
942prune_impossible_nodes (re_match_context_t *mctx)
943{
944 const re_dfa_t *const dfa = mctx->dfa;
945 int halt_node, match_last;
946 reg_errcode_t ret;
947 re_dfastate_t **sifted_states;
948 re_dfastate_t **lim_states = NULL;
949 re_sift_context_t sctx;
950#ifdef DEBUG
951 assert (mctx->state_log != NULL);
952#endif
953 match_last = mctx->match_last;
954 halt_node = mctx->last_node;
955
956 /* Avoid overflow. */
957 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
958 return REG_ESPACE;
959
960 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
961 if (BE (sifted_states == NULL, 0))
962 {
963 ret = REG_ESPACE;
964 goto free_return;
965 }
966 if (dfa->nbackref)
967 {
968 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
969 if (BE (lim_states == NULL, 0))
970 {
971 ret = REG_ESPACE;
972 goto free_return;
973 }
974 while (1)
975 {
976 memset (lim_states, '\0',
977 sizeof (re_dfastate_t *) * (match_last + 1));
978 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
979 match_last);
980 ret = sift_states_backward (mctx, &sctx);
981 re_node_set_free (&sctx.limits);
982 if (BE (ret != REG_NOERROR, 0))
983 goto free_return;
984 if (sifted_states[0] != NULL || lim_states[0] != NULL)
985 break;
986 do
987 {
988 --match_last;
989 if (match_last < 0)
990 {
991 ret = REG_NOMATCH;
992 goto free_return;
993 }
994 } while (mctx->state_log[match_last] == NULL
995 || !mctx->state_log[match_last]->halt);
996 halt_node = check_halt_state_context (mctx,
997 mctx->state_log[match_last],
998 match_last);
999 }
1000 ret = merge_state_array (dfa, sifted_states, lim_states,
1001 match_last + 1);
1002 re_free (lim_states);
1003 lim_states = NULL;
1004 if (BE (ret != REG_NOERROR, 0))
1005 goto free_return;
1006 }
1007 else
1008 {
1009 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1010 ret = sift_states_backward (mctx, &sctx);
1011 re_node_set_free (&sctx.limits);
1012 if (BE (ret != REG_NOERROR, 0))
1013 goto free_return;
1014 if (sifted_states[0] == NULL)
1015 {
1016 ret = REG_NOMATCH;
1017 goto free_return;
1018 }
1019 }
1020 re_free (mctx->state_log);
1021 mctx->state_log = sifted_states;
1022 sifted_states = NULL;
1023 mctx->last_node = halt_node;
1024 mctx->match_last = match_last;
1025 ret = REG_NOERROR;
1026 free_return:
1027 re_free (sifted_states);
1028 re_free (lim_states);
1029 return ret;
1030}
1031
1032/* Acquire an initial state and return it.
1033 We must select appropriate initial state depending on the context,
1034 since initial states may have constraints like "\<", "^", etc.. */
1035
1036static inline re_dfastate_t *
1037__attribute ((always_inline)) internal_function
1038acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1039 int idx)
1040{
1041 const re_dfa_t *const dfa = mctx->dfa;
1042 if (dfa->init_state->has_constraint)
1043 {
1044 unsigned int context;
1045 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1046 if (IS_WORD_CONTEXT (context))
1047 return dfa->init_state_word;
1048 else if (IS_ORDINARY_CONTEXT (context))
1049 return dfa->init_state;
1050 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1051 return dfa->init_state_begbuf;
1052 else if (IS_NEWLINE_CONTEXT (context))
1053 return dfa->init_state_nl;
1054 else if (IS_BEGBUF_CONTEXT (context))
1055 {
1056 /* It is relatively rare case, then calculate on demand. */
1057 return re_acquire_state_context (err, dfa,
1058 dfa->init_state->entrance_nodes,
1059 context);
1060 }
1061 else
1062 /* Must not happen? */
1063 return dfa->init_state;
1064 }
1065 else
1066 return dfa->init_state;
1067}
1068
1069/* Check whether the regular expression match input string INPUT or not,
1070 and return the index where the matching end, return -1 if not match,
1071 or return -2 in case of an error.
1072 FL_LONGEST_MATCH means we want the POSIX longest matching.
1073 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1074 next place where we may want to try matching.
1075 Note that the matcher assume that the maching starts from the current
1076 index of the buffer. */
1077
1078static int
1079internal_function
1080check_matching (re_match_context_t *mctx, int fl_longest_match,
1081 int *p_match_first)
1082{
1083 const re_dfa_t *const dfa = mctx->dfa;
1084 reg_errcode_t err;
1085 int match = 0;
1086 int match_last = -1;
1087 int cur_str_idx = re_string_cur_idx (&mctx->input);
1088 re_dfastate_t *cur_state;
1089 int at_init_state = p_match_first != NULL;
1090 int next_start_idx = cur_str_idx;
1091
1092 err = REG_NOERROR;
1093 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1094 /* An initial state must not be NULL (invalid). */
1095 if (BE (cur_state == NULL, 0))
1096 {
1097 assert (err == REG_ESPACE);
1098 return -2;
1099 }
1100
1101 if (mctx->state_log != NULL)
1102 {
1103 mctx->state_log[cur_str_idx] = cur_state;
1104
1105 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1106 later. E.g. Processing back references. */
1107 if (BE (dfa->nbackref, 0))
1108 {
1109 at_init_state = 0;
1110 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1111 if (BE (err != REG_NOERROR, 0))
1112 return err;
1113
1114 if (cur_state->has_backref)
1115 {
1116 err = transit_state_bkref (mctx, &cur_state->nodes);
1117 if (BE (err != REG_NOERROR, 0))
1118 return err;
1119 }
1120 }
1121 }
1122
1123 /* If the RE accepts NULL string. */
1124 if (BE (cur_state->halt, 0))
1125 {
1126 if (!cur_state->has_constraint
1127 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1128 {
1129 if (!fl_longest_match)
1130 return cur_str_idx;
1131 else
1132 {
1133 match_last = cur_str_idx;
1134 match = 1;
1135 }
1136 }
1137 }
1138
1139 while (!re_string_eoi (&mctx->input))
1140 {
1141 re_dfastate_t *old_state = cur_state;
1142 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1143
1144 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1145 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1146 && mctx->input.valid_len < mctx->input.len))
1147 {
1148 err = extend_buffers (mctx);
1149 if (BE (err != REG_NOERROR, 0))
1150 {
1151 assert (err == REG_ESPACE);
1152 return -2;
1153 }
1154 }
1155
1156 cur_state = transit_state (&err, mctx, cur_state);
1157 if (mctx->state_log != NULL)
1158 cur_state = merge_state_with_log (&err, mctx, cur_state);
1159
1160 if (cur_state == NULL)
1161 {
1162 /* Reached the invalid state or an error. Try to recover a valid
1163 state using the state log, if available and if we have not
1164 already found a valid (even if not the longest) match. */
1165 if (BE (err != REG_NOERROR, 0))
1166 return -2;
1167
1168 if (mctx->state_log == NULL
1169 || (match && !fl_longest_match)
1170 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1171 break;
1172 }
1173
1174 if (BE (at_init_state, 0))
1175 {
1176 if (old_state == cur_state)
1177 next_start_idx = next_char_idx;
1178 else
1179 at_init_state = 0;
1180 }
1181
1182 if (cur_state->halt)
1183 {
1184 /* Reached a halt state.
1185 Check the halt state can satisfy the current context. */
1186 if (!cur_state->has_constraint
1187 || check_halt_state_context (mctx, cur_state,
1188 re_string_cur_idx (&mctx->input)))
1189 {
1190 /* We found an appropriate halt state. */
1191 match_last = re_string_cur_idx (&mctx->input);
1192 match = 1;
1193
1194 /* We found a match, do not modify match_first below. */
1195 p_match_first = NULL;
1196 if (!fl_longest_match)
1197 break;
1198 }
1199 }
1200 }
1201
1202 if (p_match_first)
1203 *p_match_first += next_start_idx;
1204
1205 return match_last;
1206}
1207
1208/* Check NODE match the current context. */
1209
1210static int
1211internal_function
1212check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1213{
1214 re_token_type_t type = dfa->nodes[node].type;
1215 unsigned int constraint = dfa->nodes[node].constraint;
1216 if (type != END_OF_RE)
1217 return 0;
1218 if (!constraint)
1219 return 1;
1220 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1221 return 0;
1222 return 1;
1223}
1224
1225/* Check the halt state STATE match the current context.
1226 Return 0 if not match, if the node, STATE has, is a halt node and
1227 match the context, return the node. */
1228
1229static int
1230internal_function
1231check_halt_state_context (const re_match_context_t *mctx,
1232 const re_dfastate_t *state, int idx)
1233{
1234 int i;
1235 unsigned int context;
1236#ifdef DEBUG
1237 assert (state->halt);
1238#endif
1239 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1240 for (i = 0; i < state->nodes.nelem; ++i)
1241 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1242 return state->nodes.elems[i];
1243 return 0;
1244}
1245
1246/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1247 corresponding to the DFA).
1248 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1249 of errors. */
1250
1251static int
1252internal_function
1253proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1254 int *pidx, int node, re_node_set *eps_via_nodes,
1255 struct re_fail_stack_t *fs)
1256{
1257 const re_dfa_t *const dfa = mctx->dfa;
1258 int i, err;
1259 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1260 {
1261 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1262 re_node_set *edests = &dfa->edests[node];
1263 int dest_node;
1264 err = re_node_set_insert (eps_via_nodes, node);
1265 if (BE (err < 0, 0))
1266 return -2;
1267 /* Pick up a valid destination, or return -1 if none is found. */
1268 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1269 {
1270 int candidate = edests->elems[i];
1271 if (!re_node_set_contains (cur_nodes, candidate))
1272 continue;
1273 if (dest_node == -1)
1274 dest_node = candidate;
1275
1276 else
1277 {
1278 /* In order to avoid infinite loop like "(a*)*", return the second
1279 epsilon-transition if the first was already considered. */
1280 if (re_node_set_contains (eps_via_nodes, dest_node))
1281 return candidate;
1282
1283 /* Otherwise, push the second epsilon-transition on the fail stack. */
1284 else if (fs != NULL
1285 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1286 eps_via_nodes))
1287 return -2;
1288
1289 /* We know we are going to exit. */
1290 break;
1291 }
1292 }
1293 return dest_node;
1294 }
1295 else
1296 {
1297 int naccepted = 0;
1298 re_token_type_t type = dfa->nodes[node].type;
1299
1300#ifdef RE_ENABLE_I18N
1301 if (dfa->nodes[node].accept_mb)
1302 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1303 else
1304#endif /* RE_ENABLE_I18N */
1305 if (type == OP_BACK_REF)
1306 {
1307 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1308 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1309 if (fs != NULL)
1310 {
1311 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1312 return -1;
1313 else if (naccepted)
1314 {
1315 char *buf = (char *) re_string_get_buffer (&mctx->input);
1316 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1317 naccepted) != 0)
1318 return -1;
1319 }
1320 }
1321
1322 if (naccepted == 0)
1323 {
1324 int dest_node;
1325 err = re_node_set_insert (eps_via_nodes, node);
1326 if (BE (err < 0, 0))
1327 return -2;
1328 dest_node = dfa->edests[node].elems[0];
1329 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1330 dest_node))
1331 return dest_node;
1332 }
1333 }
1334
1335 if (naccepted != 0
1336 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1337 {
1338 int dest_node = dfa->nexts[node];
1339 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1340 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1341 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1342 dest_node)))
1343 return -1;
1344 re_node_set_empty (eps_via_nodes);
1345 return dest_node;
1346 }
1347 }
1348 return -1;
1349}
1350
1351static reg_errcode_t
1352internal_function
1353push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1354 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1355{
1356 reg_errcode_t err;
1357 int num = fs->num++;
1358 if (fs->num == fs->alloc)
1359 {
1360 struct re_fail_stack_ent_t *new_array;
1361 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1362 * fs->alloc * 2));
1363 if (new_array == NULL)
1364 return REG_ESPACE;
1365 fs->alloc *= 2;
1366 fs->stack = new_array;
1367 }
1368 fs->stack[num].idx = str_idx;
1369 fs->stack[num].node = dest_node;
1370 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1371 if (fs->stack[num].regs == NULL)
1372 return REG_ESPACE;
1373 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1374 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1375 return err;
1376}
1377
1378static int
1379internal_function
1380pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1381 regmatch_t *regs, re_node_set *eps_via_nodes)
1382{
1383 int num = --fs->num;
1384 assert (num >= 0);
1385 *pidx = fs->stack[num].idx;
1386 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1387 re_node_set_free (eps_via_nodes);
1388 re_free (fs->stack[num].regs);
1389 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1390 return fs->stack[num].node;
1391}
1392
1393/* Set the positions where the subexpressions are starts/ends to registers
1394 PMATCH.
1395 Note: We assume that pmatch[0] is already set, and
1396 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1397
1398static reg_errcode_t
1399internal_function
1400set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1401 regmatch_t *pmatch, int fl_backtrack)
1402{
1403 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1404 int idx, cur_node;
1405 re_node_set eps_via_nodes;
1406 struct re_fail_stack_t *fs;
1407 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1408 regmatch_t *prev_idx_match;
1409 int prev_idx_match_malloced = 0;
1410
1411#ifdef DEBUG
1412 assert (nmatch > 1);
1413 assert (mctx->state_log != NULL);
1414#endif
1415 if (fl_backtrack)
1416 {
1417 fs = &fs_body;
1418 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1419 if (fs->stack == NULL)
1420 return REG_ESPACE;
1421 }
1422 else
1423 fs = NULL;
1424
1425 cur_node = dfa->init_node;
1426 re_node_set_init_empty (&eps_via_nodes);
1427
1428#ifdef HAVE_ALLOCA
1429 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1430 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1431 else
1432#endif
1433 {
1434 prev_idx_match = re_malloc (regmatch_t, nmatch);
1435 if (prev_idx_match == NULL)
1436 {
1437 free_fail_stack_return (fs);
1438 return REG_ESPACE;
1439 }
1440 prev_idx_match_malloced = 1;
1441 }
1442 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1443
1444 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1445 {
1446 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1447
1448 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1449 {
1450 int reg_idx;
1451 if (fs)
1452 {
1453 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1454 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1455 break;
1456 if (reg_idx == nmatch)
1457 {
1458 re_node_set_free (&eps_via_nodes);
1459 if (prev_idx_match_malloced)
1460 re_free (prev_idx_match);
1461 return free_fail_stack_return (fs);
1462 }
1463 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464 &eps_via_nodes);
1465 }
1466 else
1467 {
1468 re_node_set_free (&eps_via_nodes);
1469 if (prev_idx_match_malloced)
1470 re_free (prev_idx_match);
1471 return REG_NOERROR;
1472 }
1473 }
1474
1475 /* Proceed to next node. */
1476 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1477 &eps_via_nodes, fs);
1478
1479 if (BE (cur_node < 0, 0))
1480 {
1481 if (BE (cur_node == -2, 0))
1482 {
1483 re_node_set_free (&eps_via_nodes);
1484 if (prev_idx_match_malloced)
1485 re_free (prev_idx_match);
1486 free_fail_stack_return (fs);
1487 return REG_ESPACE;
1488 }
1489 if (fs)
1490 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1491 &eps_via_nodes);
1492 else
1493 {
1494 re_node_set_free (&eps_via_nodes);
1495 if (prev_idx_match_malloced)
1496 re_free (prev_idx_match);
1497 return REG_NOMATCH;
1498 }
1499 }
1500 }
1501 re_node_set_free (&eps_via_nodes);
1502 if (prev_idx_match_malloced)
1503 re_free (prev_idx_match);
1504 return free_fail_stack_return (fs);
1505}
1506
1507static reg_errcode_t
1508internal_function
1509free_fail_stack_return (struct re_fail_stack_t *fs)
1510{
1511 if (fs)
1512 {
1513 int fs_idx;
1514 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1515 {
1516 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1517 re_free (fs->stack[fs_idx].regs);
1518 }
1519 re_free (fs->stack);
1520 }
1521 return REG_NOERROR;
1522}
1523
1524static void
1525internal_function
1526update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1527 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1528{
1529 int type = dfa->nodes[cur_node].type;
1530 if (type == OP_OPEN_SUBEXP)
1531 {
1532 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1533
1534 /* We are at the first node of this sub expression. */
1535 if (reg_num < nmatch)
1536 {
1537 pmatch[reg_num].rm_so = cur_idx;
1538 pmatch[reg_num].rm_eo = -1;
1539 }
1540 }
1541 else if (type == OP_CLOSE_SUBEXP)
1542 {
1543 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1544 if (reg_num < nmatch)
1545 {
1546 /* We are at the last node of this sub expression. */
1547 if (pmatch[reg_num].rm_so < cur_idx)
1548 {
1549 pmatch[reg_num].rm_eo = cur_idx;
1550 /* This is a non-empty match or we are not inside an optional
1551 subexpression. Accept this right away. */
1552 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1553 }
1554 else
1555 {
1556 if (dfa->nodes[cur_node].opt_subexp
1557 && prev_idx_match[reg_num].rm_so != -1)
1558 /* We transited through an empty match for an optional
1559 subexpression, like (a?)*, and this is not the subexp's
1560 first match. Copy back the old content of the registers
1561 so that matches of an inner subexpression are undone as
1562 well, like in ((a?))*. */
1563 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1564 else
1565 /* We completed a subexpression, but it may be part of
1566 an optional one, so do not update PREV_IDX_MATCH. */
1567 pmatch[reg_num].rm_eo = cur_idx;
1568 }
1569 }
1570 }
1571}
1572
1573/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1574 and sift the nodes in each states according to the following rules.
1575 Updated state_log will be wrote to STATE_LOG.
1576
1577 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1578 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1579 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1580 the LAST_NODE, we throw away the node `a'.
1581 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1582 string `s' and transit to `b':
1583 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1584 away the node `a'.
1585 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1586 thrown away, we throw away the node `a'.
1587 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1588 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1589 node `a'.
1590 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1591 we throw away the node `a'. */
1592
1593#define STATE_NODE_CONTAINS(state,node) \
1594 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1595
1596static reg_errcode_t
1597internal_function
1598sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1599{
1600 reg_errcode_t err;
1601 int null_cnt = 0;
1602 int str_idx = sctx->last_str_idx;
1603 re_node_set cur_dest;
1604
1605#ifdef DEBUG
1606 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1607#endif
1608
1609 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1610 transit to the last_node and the last_node itself. */
1611 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1612 if (BE (err != REG_NOERROR, 0))
1613 return err;
1614 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615 if (BE (err != REG_NOERROR, 0))
1616 goto free_return;
1617
1618 /* Then check each states in the state_log. */
1619 while (str_idx > 0)
1620 {
1621 /* Update counters. */
1622 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1623 if (null_cnt > mctx->max_mb_elem_len)
1624 {
1625 memset (sctx->sifted_states, '\0',
1626 sizeof (re_dfastate_t *) * str_idx);
1627 re_node_set_free (&cur_dest);
1628 return REG_NOERROR;
1629 }
1630 re_node_set_empty (&cur_dest);
1631 --str_idx;
1632
1633 if (mctx->state_log[str_idx])
1634 {
1635 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1636 if (BE (err != REG_NOERROR, 0))
1637 goto free_return;
1638 }
1639
1640 /* Add all the nodes which satisfy the following conditions:
1641 - It can epsilon transit to a node in CUR_DEST.
1642 - It is in CUR_SRC.
1643 And update state_log. */
1644 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1645 if (BE (err != REG_NOERROR, 0))
1646 goto free_return;
1647 }
1648 err = REG_NOERROR;
1649 free_return:
1650 re_node_set_free (&cur_dest);
1651 return err;
1652}
1653
1654static reg_errcode_t
1655internal_function
1656build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1657 int str_idx, re_node_set *cur_dest)
1658{
1659 const re_dfa_t *const dfa = mctx->dfa;
1660 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1661 int i;
1662
1663 /* Then build the next sifted state.
1664 We build the next sifted state on `cur_dest', and update
1665 `sifted_states[str_idx]' with `cur_dest'.
1666 Note:
1667 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1668 `cur_src' points the node_set of the old `state_log[str_idx]'
1669 (with the epsilon nodes pre-filtered out). */
1670 for (i = 0; i < cur_src->nelem; i++)
1671 {
1672 int prev_node = cur_src->elems[i];
1673 int naccepted = 0;
1674 int ret;
1675
1676#ifdef DEBUG
1677 re_token_type_t type = dfa->nodes[prev_node].type;
1678 assert (!IS_EPSILON_NODE (type));
1679#endif
1680#ifdef RE_ENABLE_I18N
1681 /* If the node may accept `multi byte'. */
1682 if (dfa->nodes[prev_node].accept_mb)
1683 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1684 str_idx, sctx->last_str_idx);
1685#endif /* RE_ENABLE_I18N */
1686
1687 /* We don't check backreferences here.
1688 See update_cur_sifted_state(). */
1689 if (!naccepted
1690 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1691 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1692 dfa->nexts[prev_node]))
1693 naccepted = 1;
1694
1695 if (naccepted == 0)
1696 continue;
1697
1698 if (sctx->limits.nelem)
1699 {
1700 int to_idx = str_idx + naccepted;
1701 if (check_dst_limits (mctx, &sctx->limits,
1702 dfa->nexts[prev_node], to_idx,
1703 prev_node, str_idx))
1704 continue;
1705 }
1706 ret = re_node_set_insert (cur_dest, prev_node);
1707 if (BE (ret == -1, 0))
1708 return REG_ESPACE;
1709 }
1710
1711 return REG_NOERROR;
1712}
1713
1714/* Helper functions. */
1715
1716static reg_errcode_t
1717internal_function
1718clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1719{
1720 int top = mctx->state_log_top;
1721
1722 if (next_state_log_idx >= mctx->input.bufs_len
1723 || (next_state_log_idx >= mctx->input.valid_len
1724 && mctx->input.valid_len < mctx->input.len))
1725 {
1726 reg_errcode_t err;
1727 err = extend_buffers (mctx);
1728 if (BE (err != REG_NOERROR, 0))
1729 return err;
1730 }
1731
1732 if (top < next_state_log_idx)
1733 {
1734 memset (mctx->state_log + top + 1, '\0',
1735 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1736 mctx->state_log_top = next_state_log_idx;
1737 }
1738 return REG_NOERROR;
1739}
1740
1741static reg_errcode_t
1742internal_function
1743merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1744 re_dfastate_t **src, int num)
1745{
1746 int st_idx;
1747 reg_errcode_t err;
1748 for (st_idx = 0; st_idx < num; ++st_idx)
1749 {
1750 if (dst[st_idx] == NULL)
1751 dst[st_idx] = src[st_idx];
1752 else if (src[st_idx] != NULL)
1753 {
1754 re_node_set merged_set;
1755 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1756 &src[st_idx]->nodes);
1757 if (BE (err != REG_NOERROR, 0))
1758 return err;
1759 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1760 re_node_set_free (&merged_set);
1761 if (BE (err != REG_NOERROR, 0))
1762 return err;
1763 }
1764 }
1765 return REG_NOERROR;
1766}
1767
1768static reg_errcode_t
1769internal_function
1770update_cur_sifted_state (const re_match_context_t *mctx,
1771 re_sift_context_t *sctx, int str_idx,
1772 re_node_set *dest_nodes)
1773{
1774 const re_dfa_t *const dfa = mctx->dfa;
1775 reg_errcode_t err = REG_NOERROR;
1776 const re_node_set *candidates;
1777 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1778 : &mctx->state_log[str_idx]->nodes);
1779
1780 if (dest_nodes->nelem == 0)
1781 sctx->sifted_states[str_idx] = NULL;
1782 else
1783 {
1784 if (candidates)
1785 {
1786 /* At first, add the nodes which can epsilon transit to a node in
1787 DEST_NODE. */
1788 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1789 if (BE (err != REG_NOERROR, 0))
1790 return err;
1791
1792 /* Then, check the limitations in the current sift_context. */
1793 if (sctx->limits.nelem)
1794 {
1795 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1796 mctx->bkref_ents, str_idx);
1797 if (BE (err != REG_NOERROR, 0))
1798 return err;
1799 }
1800 }
1801
1802 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1803 if (BE (err != REG_NOERROR, 0))
1804 return err;
1805 }
1806
1807 if (candidates && mctx->state_log[str_idx]->has_backref)
1808 {
1809 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1810 if (BE (err != REG_NOERROR, 0))
1811 return err;
1812 }
1813 return REG_NOERROR;
1814}
1815
1816static reg_errcode_t
1817internal_function
1818add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1819 const re_node_set *candidates)
1820{
1821 reg_errcode_t err = REG_NOERROR;
1822 int i;
1823
1824 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1825 if (BE (err != REG_NOERROR, 0))
1826 return err;
1827
1828 if (!state->inveclosure.alloc)
1829 {
1830 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1831 if (BE (err != REG_NOERROR, 0))
1832 return REG_ESPACE;
1833 for (i = 0; i < dest_nodes->nelem; i++)
1834 {
1835 err = re_node_set_merge (&state->inveclosure,
1836 dfa->inveclosures + dest_nodes->elems[i]);
1837 if (BE (err != REG_NOERROR, 0))
1838 return REG_ESPACE;
1839 }
1840 }
1841 return re_node_set_add_intersect (dest_nodes, candidates,
1842 &state->inveclosure);
1843}
1844
1845static reg_errcode_t
1846internal_function
1847sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1848 const re_node_set *candidates)
1849{
1850 int ecl_idx;
1851 reg_errcode_t err;
1852 re_node_set *inv_eclosure = dfa->inveclosures + node;
1853 re_node_set except_nodes;
1854 re_node_set_init_empty (&except_nodes);
1855 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1856 {
1857 int cur_node = inv_eclosure->elems[ecl_idx];
1858 if (cur_node == node)
1859 continue;
1860 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1861 {
1862 int edst1 = dfa->edests[cur_node].elems[0];
1863 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1864 ? dfa->edests[cur_node].elems[1] : -1);
1865 if ((!re_node_set_contains (inv_eclosure, edst1)
1866 && re_node_set_contains (dest_nodes, edst1))
1867 || (edst2 > 0
1868 && !re_node_set_contains (inv_eclosure, edst2)
1869 && re_node_set_contains (dest_nodes, edst2)))
1870 {
1871 err = re_node_set_add_intersect (&except_nodes, candidates,
1872 dfa->inveclosures + cur_node);
1873 if (BE (err != REG_NOERROR, 0))
1874 {
1875 re_node_set_free (&except_nodes);
1876 return err;
1877 }
1878 }
1879 }
1880 }
1881 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1882 {
1883 int cur_node = inv_eclosure->elems[ecl_idx];
1884 if (!re_node_set_contains (&except_nodes, cur_node))
1885 {
1886 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1887 re_node_set_remove_at (dest_nodes, idx);
1888 }
1889 }
1890 re_node_set_free (&except_nodes);
1891 return REG_NOERROR;
1892}
1893
1894static int
1895internal_function
1896check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1897 int dst_node, int dst_idx, int src_node, int src_idx)
1898{
1899 const re_dfa_t *const dfa = mctx->dfa;
1900 int lim_idx, src_pos, dst_pos;
1901
1902 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1903 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1904 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1905 {
1906 int subexp_idx;
1907 struct re_backref_cache_entry *ent;
1908 ent = mctx->bkref_ents + limits->elems[lim_idx];
1909 subexp_idx = dfa->nodes[ent->node].opr.idx;
1910
1911 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1912 subexp_idx, dst_node, dst_idx,
1913 dst_bkref_idx);
1914 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1915 subexp_idx, src_node, src_idx,
1916 src_bkref_idx);
1917
1918 /* In case of:
1919 <src> <dst> ( <subexp> )
1920 ( <subexp> ) <src> <dst>
1921 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1922 if (src_pos == dst_pos)
1923 continue; /* This is unrelated limitation. */
1924 else
1925 return 1;
1926 }
1927 return 0;
1928}
1929
1930static int
1931internal_function
1932check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1933 int subexp_idx, int from_node, int bkref_idx)
1934{
1935 const re_dfa_t *const dfa = mctx->dfa;
1936 const re_node_set *eclosures = dfa->eclosures + from_node;
1937 int node_idx;
1938
1939 /* Else, we are on the boundary: examine the nodes on the epsilon
1940 closure. */
1941 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1942 {
1943 int node = eclosures->elems[node_idx];
1944 switch (dfa->nodes[node].type)
1945 {
1946 case OP_BACK_REF:
1947 if (bkref_idx != -1)
1948 {
1949 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1950 do
1951 {
1952 int dst, cpos;
1953
1954 if (ent->node != node)
1955 continue;
1956
1957 if (subexp_idx < BITSET_WORD_BITS
1958 && !(ent->eps_reachable_subexps_map
1959 & ((bitset_word_t) 1 << subexp_idx)))
1960 continue;
1961
1962 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1963 OP_CLOSE_SUBEXP cases below. But, if the
1964 destination node is the same node as the source
1965 node, don't recurse because it would cause an
1966 infinite loop: a regex that exhibits this behavior
1967 is ()\1*\1* */
1968 dst = dfa->edests[node].elems[0];
1969 if (dst == from_node)
1970 {
1971 if (boundaries & 1)
1972 return -1;
1973 else /* if (boundaries & 2) */
1974 return 0;
1975 }
1976
1977 cpos =
1978 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1979 dst, bkref_idx);
1980 if (cpos == -1 /* && (boundaries & 1) */)
1981 return -1;
1982 if (cpos == 0 && (boundaries & 2))
1983 return 0;
1984
1985 if (subexp_idx < BITSET_WORD_BITS)
1986 ent->eps_reachable_subexps_map
1987 &= ~((bitset_word_t) 1 << subexp_idx);
1988 }
1989 while (ent++->more);
1990 }
1991 break;
1992
1993 case OP_OPEN_SUBEXP:
1994 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1995 return -1;
1996 break;
1997
1998 case OP_CLOSE_SUBEXP:
1999 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2000 return 0;
2001 break;
2002
2003 default:
2004 break;
2005 }
2006 }
2007
2008 return (boundaries & 2) ? 1 : 0;
2009}
2010
2011static int
2012internal_function
2013check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2014 int subexp_idx, int from_node, int str_idx,
2015 int bkref_idx)
2016{
2017 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2018 int boundaries;
2019
2020 /* If we are outside the range of the subexpression, return -1 or 1. */
2021 if (str_idx < lim->subexp_from)
2022 return -1;
2023
2024 if (lim->subexp_to < str_idx)
2025 return 1;
2026
2027 /* If we are within the subexpression, return 0. */
2028 boundaries = (str_idx == lim->subexp_from);
2029 boundaries |= (str_idx == lim->subexp_to) << 1;
2030 if (boundaries == 0)
2031 return 0;
2032
2033 /* Else, examine epsilon closure. */
2034 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2035 from_node, bkref_idx);
2036}
2037
2038/* Check the limitations of sub expressions LIMITS, and remove the nodes
2039 which are against limitations from DEST_NODES. */
2040
2041static reg_errcode_t
2042internal_function
2043check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2044 const re_node_set *candidates, re_node_set *limits,
2045 struct re_backref_cache_entry *bkref_ents, int str_idx)
2046{
2047 reg_errcode_t err;
2048 int node_idx, lim_idx;
2049
2050 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2051 {
2052 int subexp_idx;
2053 struct re_backref_cache_entry *ent;
2054 ent = bkref_ents + limits->elems[lim_idx];
2055
2056 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2057 continue; /* This is unrelated limitation. */
2058
2059 subexp_idx = dfa->nodes[ent->node].opr.idx;
2060 if (ent->subexp_to == str_idx)
2061 {
2062 int ops_node = -1;
2063 int cls_node = -1;
2064 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2065 {
2066 int node = dest_nodes->elems[node_idx];
2067 re_token_type_t type = dfa->nodes[node].type;
2068 if (type == OP_OPEN_SUBEXP
2069 && subexp_idx == dfa->nodes[node].opr.idx)
2070 ops_node = node;
2071 else if (type == OP_CLOSE_SUBEXP
2072 && subexp_idx == dfa->nodes[node].opr.idx)
2073 cls_node = node;
2074 }
2075
2076 /* Check the limitation of the open subexpression. */
2077 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2078 if (ops_node >= 0)
2079 {
2080 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2081 candidates);
2082 if (BE (err != REG_NOERROR, 0))
2083 return err;
2084 }
2085
2086 /* Check the limitation of the close subexpression. */
2087 if (cls_node >= 0)
2088 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2089 {
2090 int node = dest_nodes->elems[node_idx];
2091 if (!re_node_set_contains (dfa->inveclosures + node,
2092 cls_node)
2093 && !re_node_set_contains (dfa->eclosures + node,
2094 cls_node))
2095 {
2096 /* It is against this limitation.
2097 Remove it form the current sifted state. */
2098 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2099 candidates);
2100 if (BE (err != REG_NOERROR, 0))
2101 return err;
2102 --node_idx;
2103 }
2104 }
2105 }
2106 else /* (ent->subexp_to != str_idx) */
2107 {
2108 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2109 {
2110 int node = dest_nodes->elems[node_idx];
2111 re_token_type_t type = dfa->nodes[node].type;
2112 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2113 {
2114 if (subexp_idx != dfa->nodes[node].opr.idx)
2115 continue;
2116 /* It is against this limitation.
2117 Remove it form the current sifted state. */
2118 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2119 candidates);
2120 if (BE (err != REG_NOERROR, 0))
2121 return err;
2122 }
2123 }
2124 }
2125 }
2126 return REG_NOERROR;
2127}
2128
2129static reg_errcode_t
2130internal_function
2131sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2132 int str_idx, const re_node_set *candidates)
2133{
2134 const re_dfa_t *const dfa = mctx->dfa;
2135 reg_errcode_t err;
2136 int node_idx, node;
2137 re_sift_context_t local_sctx;
2138 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2139
2140 if (first_idx == -1)
2141 return REG_NOERROR;
2142
2143 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2144
2145 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2146 {
2147 int enabled_idx;
2148 re_token_type_t type;
2149 struct re_backref_cache_entry *entry;
2150 node = candidates->elems[node_idx];
2151 type = dfa->nodes[node].type;
2152 /* Avoid infinite loop for the REs like "()\1+". */
2153 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2154 continue;
2155 if (type != OP_BACK_REF)
2156 continue;
2157
2158 entry = mctx->bkref_ents + first_idx;
2159 enabled_idx = first_idx;
2160 do
2161 {
2162 int subexp_len;
2163 int to_idx;
2164 int dst_node;
2165 int ret;
2166 re_dfastate_t *cur_state;
2167
2168 if (entry->node != node)
2169 continue;
2170 subexp_len = entry->subexp_to - entry->subexp_from;
2171 to_idx = str_idx + subexp_len;
2172 dst_node = (subexp_len ? dfa->nexts[node]
2173 : dfa->edests[node].elems[0]);
2174
2175 if (to_idx > sctx->last_str_idx
2176 || sctx->sifted_states[to_idx] == NULL
2177 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2178 || check_dst_limits (mctx, &sctx->limits, node,
2179 str_idx, dst_node, to_idx))
2180 continue;
2181
2182 if (local_sctx.sifted_states == NULL)
2183 {
2184 local_sctx = *sctx;
2185 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2186 if (BE (err != REG_NOERROR, 0))
2187 goto free_return;
2188 }
2189 local_sctx.last_node = node;
2190 local_sctx.last_str_idx = str_idx;
2191 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2192 if (BE (ret < 0, 0))
2193 {
2194 err = REG_ESPACE;
2195 goto free_return;
2196 }
2197 cur_state = local_sctx.sifted_states[str_idx];
2198 err = sift_states_backward (mctx, &local_sctx);
2199 if (BE (err != REG_NOERROR, 0))
2200 goto free_return;
2201 if (sctx->limited_states != NULL)
2202 {
2203 err = merge_state_array (dfa, sctx->limited_states,
2204 local_sctx.sifted_states,
2205 str_idx + 1);
2206 if (BE (err != REG_NOERROR, 0))
2207 goto free_return;
2208 }
2209 local_sctx.sifted_states[str_idx] = cur_state;
2210 re_node_set_remove (&local_sctx.limits, enabled_idx);
2211
2212 /* mctx->bkref_ents may have changed, reload the pointer. */
2213 entry = mctx->bkref_ents + enabled_idx;
2214 }
2215 while (enabled_idx++, entry++->more);
2216 }
2217 err = REG_NOERROR;
2218 free_return:
2219 if (local_sctx.sifted_states != NULL)
2220 {
2221 re_node_set_free (&local_sctx.limits);
2222 }
2223
2224 return err;
2225}
2226
2227
2228#ifdef RE_ENABLE_I18N
2229static int
2230internal_function
2231sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2232 int node_idx, int str_idx, int max_str_idx)
2233{
2234 const re_dfa_t *const dfa = mctx->dfa;
2235 int naccepted;
2236 /* Check the node can accept `multi byte'. */
2237 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2238 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2239 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2240 dfa->nexts[node_idx]))
2241 /* The node can't accept the `multi byte', or the
2242 destination was already thrown away, then the node
2243 could't accept the current input `multi byte'. */
2244 naccepted = 0;
2245 /* Otherwise, it is sure that the node could accept
2246 `naccepted' bytes input. */
2247 return naccepted;
2248}
2249#endif /* RE_ENABLE_I18N */
2250
2251\f
2252/* Functions for state transition. */
2253
2254/* Return the next state to which the current state STATE will transit by
2255 accepting the current input byte, and update STATE_LOG if necessary.
2256 If STATE can accept a multibyte char/collating element/back reference
2257 update the destination of STATE_LOG. */
2258
2259static re_dfastate_t *
2260internal_function
2261transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2262 re_dfastate_t *state)
2263{
2264 re_dfastate_t **trtable;
2265 unsigned char ch;
2266
2267#ifdef RE_ENABLE_I18N
2268 /* If the current state can accept multibyte. */
2269 if (BE (state->accept_mb, 0))
2270 {
2271 *err = transit_state_mb (mctx, state);
2272 if (BE (*err != REG_NOERROR, 0))
2273 return NULL;
2274 }
2275#endif /* RE_ENABLE_I18N */
2276
2277 /* Then decide the next state with the single byte. */
2278#if 0
2279 if (0)
2280 /* don't use transition table */
2281 return transit_state_sb (err, mctx, state);
2282#endif
2283
2284 /* Use transition table */
2285 ch = re_string_fetch_byte (&mctx->input);
2286 for (;;)
2287 {
2288 trtable = state->trtable;
2289 if (BE (trtable != NULL, 1))
2290 return trtable[ch];
2291
2292 trtable = state->word_trtable;
2293 if (BE (trtable != NULL, 1))
2294 {
2295 unsigned int context;
2296 context
2297 = re_string_context_at (&mctx->input,
2298 re_string_cur_idx (&mctx->input) - 1,
2299 mctx->eflags);
2300 if (IS_WORD_CONTEXT (context))
2301 return trtable[ch + SBC_MAX];
2302 else
2303 return trtable[ch];
2304 }
2305
2306 if (!build_trtable (mctx->dfa, state))
2307 {
2308 *err = REG_ESPACE;
2309 return NULL;
2310 }
2311
2312 /* Retry, we now have a transition table. */
2313 }
2314}
2315
2316/* Update the state_log if we need */
2317re_dfastate_t *
2318internal_function
2319merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2320 re_dfastate_t *next_state)
2321{
2322 const re_dfa_t *const dfa = mctx->dfa;
2323 int cur_idx = re_string_cur_idx (&mctx->input);
2324
2325 if (cur_idx > mctx->state_log_top)
2326 {
2327 mctx->state_log[cur_idx] = next_state;
2328 mctx->state_log_top = cur_idx;
2329 }
2330 else if (mctx->state_log[cur_idx] == 0)
2331 {
2332 mctx->state_log[cur_idx] = next_state;
2333 }
2334 else
2335 {
2336 re_dfastate_t *pstate;
2337 unsigned int context;
2338 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2339 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2340 the destination of a multibyte char/collating element/
2341 back reference. Then the next state is the union set of
2342 these destinations and the results of the transition table. */
2343 pstate = mctx->state_log[cur_idx];
2344 log_nodes = pstate->entrance_nodes;
2345 if (next_state != NULL)
2346 {
2347 table_nodes = next_state->entrance_nodes;
2348 *err = re_node_set_init_union (&next_nodes, table_nodes,
2349 log_nodes);
2350 if (BE (*err != REG_NOERROR, 0))
2351 return NULL;
2352 }
2353 else
2354 next_nodes = *log_nodes;
2355 /* Note: We already add the nodes of the initial state,
2356 then we don't need to add them here. */
2357
2358 context = re_string_context_at (&mctx->input,
2359 re_string_cur_idx (&mctx->input) - 1,
2360 mctx->eflags);
2361 next_state = mctx->state_log[cur_idx]
2362 = re_acquire_state_context (err, dfa, &next_nodes, context);
2363 /* We don't need to check errors here, since the return value of
2364 this function is next_state and ERR is already set. */
2365
2366 if (table_nodes != NULL)
2367 re_node_set_free (&next_nodes);
2368 }
2369
2370 if (BE (dfa->nbackref, 0) && next_state != NULL)
2371 {
2372 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2373 later. We must check them here, since the back references in the
2374 next state might use them. */
2375 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2376 cur_idx);
2377 if (BE (*err != REG_NOERROR, 0))
2378 return NULL;
2379
2380 /* If the next state has back references. */
2381 if (next_state->has_backref)
2382 {
2383 *err = transit_state_bkref (mctx, &next_state->nodes);
2384 if (BE (*err != REG_NOERROR, 0))
2385 return NULL;
2386 next_state = mctx->state_log[cur_idx];
2387 }
2388 }
2389
2390 return next_state;
2391}
2392
2393/* Skip bytes in the input that correspond to part of a
2394 multi-byte match, then look in the log for a state
2395 from which to restart matching. */
2396re_dfastate_t *
2397internal_function
2398find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2399{
2400 re_dfastate_t *cur_state;
2401 do
2402 {
2403 int max = mctx->state_log_top;
2404 int cur_str_idx = re_string_cur_idx (&mctx->input);
2405
2406 do
2407 {
2408 if (++cur_str_idx > max)
2409 return NULL;
2410 re_string_skip_bytes (&mctx->input, 1);
2411 }
2412 while (mctx->state_log[cur_str_idx] == NULL);
2413
2414 cur_state = merge_state_with_log (err, mctx, NULL);
2415 }
2416 while (*err == REG_NOERROR && cur_state == NULL);
2417 return cur_state;
2418}
2419
2420/* Helper functions for transit_state. */
2421
2422/* From the node set CUR_NODES, pick up the nodes whose types are
2423 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2424 expression. And register them to use them later for evaluating the
2425 correspoding back references. */
2426
2427static reg_errcode_t
2428internal_function
2429check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2430 int str_idx)
2431{
2432 const re_dfa_t *const dfa = mctx->dfa;
2433 int node_idx;
2434 reg_errcode_t err;
2435
2436 /* TODO: This isn't efficient.
2437 Because there might be more than one nodes whose types are
2438 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2439 nodes.
2440 E.g. RE: (a){2} */
2441 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2442 {
2443 int node = cur_nodes->elems[node_idx];
2444 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2445 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2446 && (dfa->used_bkref_map
2447 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2448 {
2449 err = match_ctx_add_subtop (mctx, node, str_idx);
2450 if (BE (err != REG_NOERROR, 0))
2451 return err;
2452 }
2453 }
2454 return REG_NOERROR;
2455}
2456
2457#if 0
2458/* Return the next state to which the current state STATE will transit by
2459 accepting the current input byte. */
2460
2461static re_dfastate_t *
2462transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2463 re_dfastate_t *state)
2464{
2465 const re_dfa_t *const dfa = mctx->dfa;
2466 re_node_set next_nodes;
2467 re_dfastate_t *next_state;
2468 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2469 unsigned int context;
2470
2471 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2472 if (BE (*err != REG_NOERROR, 0))
2473 return NULL;
2474 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2475 {
2476 int cur_node = state->nodes.elems[node_cnt];
2477 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2478 {
2479 *err = re_node_set_merge (&next_nodes,
2480 dfa->eclosures + dfa->nexts[cur_node]);
2481 if (BE (*err != REG_NOERROR, 0))
2482 {
2483 re_node_set_free (&next_nodes);
2484 return NULL;
2485 }
2486 }
2487 }
2488 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2489 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2490 /* We don't need to check errors here, since the return value of
2491 this function is next_state and ERR is already set. */
2492
2493 re_node_set_free (&next_nodes);
2494 re_string_skip_bytes (&mctx->input, 1);
2495 return next_state;
2496}
2497#endif
2498
2499#ifdef RE_ENABLE_I18N
2500static reg_errcode_t
2501internal_function
2502transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2503{
2504 const re_dfa_t *const dfa = mctx->dfa;
2505 reg_errcode_t err;
2506 int i;
2507
2508 for (i = 0; i < pstate->nodes.nelem; ++i)
2509 {
2510 re_node_set dest_nodes, *new_nodes;
2511 int cur_node_idx = pstate->nodes.elems[i];
2512 int naccepted, dest_idx;
2513 unsigned int context;
2514 re_dfastate_t *dest_state;
2515
2516 if (!dfa->nodes[cur_node_idx].accept_mb)
2517 continue;
2518
2519 if (dfa->nodes[cur_node_idx].constraint)
2520 {
2521 context = re_string_context_at (&mctx->input,
2522 re_string_cur_idx (&mctx->input),
2523 mctx->eflags);
2524 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2525 context))
2526 continue;
2527 }
2528
2529 /* How many bytes the node can accept? */
2530 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2531 re_string_cur_idx (&mctx->input));
2532 if (naccepted == 0)
2533 continue;
2534
2535 /* The node can accepts `naccepted' bytes. */
2536 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2537 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2538 : mctx->max_mb_elem_len);
2539 err = clean_state_log_if_needed (mctx, dest_idx);
2540 if (BE (err != REG_NOERROR, 0))
2541 return err;
2542#ifdef DEBUG
2543 assert (dfa->nexts[cur_node_idx] != -1);
2544#endif
2545 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2546
2547 dest_state = mctx->state_log[dest_idx];
2548 if (dest_state == NULL)
2549 dest_nodes = *new_nodes;
2550 else
2551 {
2552 err = re_node_set_init_union (&dest_nodes,
2553 dest_state->entrance_nodes, new_nodes);
2554 if (BE (err != REG_NOERROR, 0))
2555 return err;
2556 }
2557 context = re_string_context_at (&mctx->input, dest_idx - 1,
2558 mctx->eflags);
2559 mctx->state_log[dest_idx]
2560 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2561 if (dest_state != NULL)
2562 re_node_set_free (&dest_nodes);
2563 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2564 return err;
2565 }
2566 return REG_NOERROR;
2567}
2568#endif /* RE_ENABLE_I18N */
2569
2570static reg_errcode_t
2571internal_function
2572transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2573{
2574 const re_dfa_t *const dfa = mctx->dfa;
2575 reg_errcode_t err;
2576 int i;
2577 int cur_str_idx = re_string_cur_idx (&mctx->input);
2578
2579 for (i = 0; i < nodes->nelem; ++i)
2580 {
2581 int dest_str_idx, prev_nelem, bkc_idx;
2582 int node_idx = nodes->elems[i];
2583 unsigned int context;
2584 const re_token_t *node = dfa->nodes + node_idx;
2585 re_node_set *new_dest_nodes;
2586
2587 /* Check whether `node' is a backreference or not. */
2588 if (node->type != OP_BACK_REF)
2589 continue;
2590
2591 if (node->constraint)
2592 {
2593 context = re_string_context_at (&mctx->input, cur_str_idx,
2594 mctx->eflags);
2595 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2596 continue;
2597 }
2598
2599 /* `node' is a backreference.
2600 Check the substring which the substring matched. */
2601 bkc_idx = mctx->nbkref_ents;
2602 err = get_subexp (mctx, node_idx, cur_str_idx);
2603 if (BE (err != REG_NOERROR, 0))
2604 goto free_return;
2605
2606 /* And add the epsilon closures (which is `new_dest_nodes') of
2607 the backreference to appropriate state_log. */
2608#ifdef DEBUG
2609 assert (dfa->nexts[node_idx] != -1);
2610#endif
2611 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2612 {
2613 int subexp_len;
2614 re_dfastate_t *dest_state;
2615 struct re_backref_cache_entry *bkref_ent;
2616 bkref_ent = mctx->bkref_ents + bkc_idx;
2617 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2618 continue;
2619 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2620 new_dest_nodes = (subexp_len == 0
2621 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2622 : dfa->eclosures + dfa->nexts[node_idx]);
2623 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2624 - bkref_ent->subexp_from);
2625 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2626 mctx->eflags);
2627 dest_state = mctx->state_log[dest_str_idx];
2628 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2629 : mctx->state_log[cur_str_idx]->nodes.nelem);
2630 /* Add `new_dest_node' to state_log. */
2631 if (dest_state == NULL)
2632 {
2633 mctx->state_log[dest_str_idx]
2634 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2635 context);
2636 if (BE (mctx->state_log[dest_str_idx] == NULL
2637 && err != REG_NOERROR, 0))
2638 goto free_return;
2639 }
2640 else
2641 {
2642 re_node_set dest_nodes;
2643 err = re_node_set_init_union (&dest_nodes,
2644 dest_state->entrance_nodes,
2645 new_dest_nodes);
2646 if (BE (err != REG_NOERROR, 0))
2647 {
2648 re_node_set_free (&dest_nodes);
2649 goto free_return;
2650 }
2651 mctx->state_log[dest_str_idx]
2652 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2653 re_node_set_free (&dest_nodes);
2654 if (BE (mctx->state_log[dest_str_idx] == NULL
2655 && err != REG_NOERROR, 0))
2656 goto free_return;
2657 }
2658 /* We need to check recursively if the backreference can epsilon
2659 transit. */
2660 if (subexp_len == 0
2661 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2662 {
2663 err = check_subexp_matching_top (mctx, new_dest_nodes,
2664 cur_str_idx);
2665 if (BE (err != REG_NOERROR, 0))
2666 goto free_return;
2667 err = transit_state_bkref (mctx, new_dest_nodes);
2668 if (BE (err != REG_NOERROR, 0))
2669 goto free_return;
2670 }
2671 }
2672 }
2673 err = REG_NOERROR;
2674 free_return:
2675 return err;
2676}
2677
2678/* Enumerate all the candidates which the backreference BKREF_NODE can match
2679 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2680 Note that we might collect inappropriate candidates here.
2681 However, the cost of checking them strictly here is too high, then we
2682 delay these checking for prune_impossible_nodes(). */
2683
2684static reg_errcode_t
2685internal_function
2686get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2687{
2688 const re_dfa_t *const dfa = mctx->dfa;
2689 int subexp_num, sub_top_idx;
2690 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2691 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2692 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2693 if (cache_idx != -1)
2694 {
2695 const struct re_backref_cache_entry *entry
2696 = mctx->bkref_ents + cache_idx;
2697 do
2698 if (entry->node == bkref_node)
2699 return REG_NOERROR; /* We already checked it. */
2700 while (entry++->more);
2701 }
2702
2703 subexp_num = dfa->nodes[bkref_node].opr.idx;
2704
2705 /* For each sub expression */
2706 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2707 {
2708 reg_errcode_t err;
2709 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2710 re_sub_match_last_t *sub_last;
2711 int sub_last_idx, sl_str, bkref_str_off;
2712
2713 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2714 continue; /* It isn't related. */
2715
2716 sl_str = sub_top->str_idx;
2717 bkref_str_off = bkref_str_idx;
2718 /* At first, check the last node of sub expressions we already
2719 evaluated. */
2720 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2721 {
2722 int sl_str_diff;
2723 sub_last = sub_top->lasts[sub_last_idx];
2724 sl_str_diff = sub_last->str_idx - sl_str;
2725 /* The matched string by the sub expression match with the substring
2726 at the back reference? */
2727 if (sl_str_diff > 0)
2728 {
2729 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2730 {
2731 /* Not enough chars for a successful match. */
2732 if (bkref_str_off + sl_str_diff > mctx->input.len)
2733 break;
2734
2735 err = clean_state_log_if_needed (mctx,
2736 bkref_str_off
2737 + sl_str_diff);
2738 if (BE (err != REG_NOERROR, 0))
2739 return err;
2740 buf = (const char *) re_string_get_buffer (&mctx->input);
2741 }
2742 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2743 /* We don't need to search this sub expression any more. */
2744 break;
2745 }
2746 bkref_str_off += sl_str_diff;
2747 sl_str += sl_str_diff;
2748 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2749 bkref_str_idx);
2750
2751 /* Reload buf, since the preceding call might have reallocated
2752 the buffer. */
2753 buf = (const char *) re_string_get_buffer (&mctx->input);
2754
2755 if (err == REG_NOMATCH)
2756 continue;
2757 if (BE (err != REG_NOERROR, 0))
2758 return err;
2759 }
2760
2761 if (sub_last_idx < sub_top->nlasts)
2762 continue;
2763 if (sub_last_idx > 0)
2764 ++sl_str;
2765 /* Then, search for the other last nodes of the sub expression. */
2766 for (; sl_str <= bkref_str_idx; ++sl_str)
2767 {
2768 int cls_node, sl_str_off;
2769 const re_node_set *nodes;
2770 sl_str_off = sl_str - sub_top->str_idx;
2771 /* The matched string by the sub expression match with the substring
2772 at the back reference? */
2773 if (sl_str_off > 0)
2774 {
2775 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2776 {
2777 /* If we are at the end of the input, we cannot match. */
2778 if (bkref_str_off >= mctx->input.len)
2779 break;
2780
2781 err = extend_buffers (mctx);
2782 if (BE (err != REG_NOERROR, 0))
2783 return err;
2784
2785 buf = (const char *) re_string_get_buffer (&mctx->input);
2786 }
2787 if (buf [bkref_str_off++] != buf[sl_str - 1])
2788 break; /* We don't need to search this sub expression
2789 any more. */
2790 }
2791 if (mctx->state_log[sl_str] == NULL)
2792 continue;
2793 /* Does this state have a ')' of the sub expression? */
2794 nodes = &mctx->state_log[sl_str]->nodes;
2795 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2796 OP_CLOSE_SUBEXP);
2797 if (cls_node == -1)
2798 continue; /* No. */
2799 if (sub_top->path == NULL)
2800 {
2801 sub_top->path = calloc (sizeof (state_array_t),
2802 sl_str - sub_top->str_idx + 1);
2803 if (sub_top->path == NULL)
2804 return REG_ESPACE;
2805 }
2806 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2807 in the current context? */
2808 err = check_arrival (mctx, sub_top->path, sub_top->node,
2809 sub_top->str_idx, cls_node, sl_str,
2810 OP_CLOSE_SUBEXP);
2811 if (err == REG_NOMATCH)
2812 continue;
2813 if (BE (err != REG_NOERROR, 0))
2814 return err;
2815 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2816 if (BE (sub_last == NULL, 0))
2817 return REG_ESPACE;
2818 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2819 bkref_str_idx);
2820 if (err == REG_NOMATCH)
2821 continue;
2822 }
2823 }
2824 return REG_NOERROR;
2825}
2826
2827/* Helper functions for get_subexp(). */
2828
2829/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2830 If it can arrive, register the sub expression expressed with SUB_TOP
2831 and SUB_LAST. */
2832
2833static reg_errcode_t
2834internal_function
2835get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2836 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2837{
2838 reg_errcode_t err;
2839 int to_idx;
2840 /* Can the subexpression arrive the back reference? */
2841 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2842 sub_last->str_idx, bkref_node, bkref_str,
2843 OP_OPEN_SUBEXP);
2844 if (err != REG_NOERROR)
2845 return err;
2846 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2847 sub_last->str_idx);
2848 if (BE (err != REG_NOERROR, 0))
2849 return err;
2850 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2851 return clean_state_log_if_needed (mctx, to_idx);
2852}
2853
2854/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2855 Search '(' if FL_OPEN, or search ')' otherwise.
2856 TODO: This function isn't efficient...
2857 Because there might be more than one nodes whose types are
2858 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2859 nodes.
2860 E.g. RE: (a){2} */
2861
2862static int
2863internal_function
2864find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2865 int subexp_idx, int type)
2866{
2867 int cls_idx;
2868 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2869 {
2870 int cls_node = nodes->elems[cls_idx];
2871 const re_token_t *node = dfa->nodes + cls_node;
2872 if (node->type == type
2873 && node->opr.idx == subexp_idx)
2874 return cls_node;
2875 }
2876 return -1;
2877}
2878
2879/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2880 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2881 heavily reused.
2882 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2883
2884static reg_errcode_t
2885internal_function
2886check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2887 int top_str, int last_node, int last_str, int type)
2888{
2889 const re_dfa_t *const dfa = mctx->dfa;
2890 reg_errcode_t err = REG_NOERROR;
2891 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2892 re_dfastate_t *cur_state = NULL;
2893 re_node_set *cur_nodes, next_nodes;
2894 re_dfastate_t **backup_state_log;
2895 unsigned int context;
2896
2897 subexp_num = dfa->nodes[top_node].opr.idx;
2898 /* Extend the buffer if we need. */
2899 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2900 {
2901 re_dfastate_t **new_array;
2902 int old_alloc = path->alloc;
2903 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2904 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2905 if (BE (new_array == NULL, 0))
2906 {
2907 path->alloc = old_alloc;
2908 return REG_ESPACE;
2909 }
2910 path->array = new_array;
2911 memset (new_array + old_alloc, '\0',
2912 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2913 }
2914
2915 str_idx = path->next_idx ? path->next_idx : top_str;
2916
2917 /* Temporary modify MCTX. */
2918 backup_state_log = mctx->state_log;
2919 backup_cur_idx = mctx->input.cur_idx;
2920 mctx->state_log = path->array;
2921 mctx->input.cur_idx = str_idx;
2922
2923 /* Setup initial node set. */
2924 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2925 if (str_idx == top_str)
2926 {
2927 err = re_node_set_init_1 (&next_nodes, top_node);
2928 if (BE (err != REG_NOERROR, 0))
2929 return err;
2930 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2931 if (BE (err != REG_NOERROR, 0))
2932 {
2933 re_node_set_free (&next_nodes);
2934 return err;
2935 }
2936 }
2937 else
2938 {
2939 cur_state = mctx->state_log[str_idx];
2940 if (cur_state && cur_state->has_backref)
2941 {
2942 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2943 if (BE (err != REG_NOERROR, 0))
2944 return err;
2945 }
2946 else
2947 re_node_set_init_empty (&next_nodes);
2948 }
2949 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2950 {
2951 if (next_nodes.nelem)
2952 {
2953 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2954 subexp_num, type);
2955 if (BE (err != REG_NOERROR, 0))
2956 {
2957 re_node_set_free (&next_nodes);
2958 return err;
2959 }
2960 }
2961 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2962 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2963 {
2964 re_node_set_free (&next_nodes);
2965 return err;
2966 }
2967 mctx->state_log[str_idx] = cur_state;
2968 }
2969
2970 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2971 {
2972 re_node_set_empty (&next_nodes);
2973 if (mctx->state_log[str_idx + 1])
2974 {
2975 err = re_node_set_merge (&next_nodes,
2976 &mctx->state_log[str_idx + 1]->nodes);
2977 if (BE (err != REG_NOERROR, 0))
2978 {
2979 re_node_set_free (&next_nodes);
2980 return err;
2981 }
2982 }
2983 if (cur_state)
2984 {
2985 err = check_arrival_add_next_nodes (mctx, str_idx,
2986 &cur_state->non_eps_nodes,
2987 &next_nodes);
2988 if (BE (err != REG_NOERROR, 0))
2989 {
2990 re_node_set_free (&next_nodes);
2991 return err;
2992 }
2993 }
2994 ++str_idx;
2995 if (next_nodes.nelem)
2996 {
2997 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2998 if (BE (err != REG_NOERROR, 0))
2999 {
3000 re_node_set_free (&next_nodes);
3001 return err;
3002 }
3003 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3004 subexp_num, type);
3005 if (BE (err != REG_NOERROR, 0))
3006 {
3007 re_node_set_free (&next_nodes);
3008 return err;
3009 }
3010 }
3011 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3012 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3013 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3014 {
3015 re_node_set_free (&next_nodes);
3016 return err;
3017 }
3018 mctx->state_log[str_idx] = cur_state;
3019 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3020 }
3021 re_node_set_free (&next_nodes);
3022 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3023 : &mctx->state_log[last_str]->nodes);
3024 path->next_idx = str_idx;
3025
3026 /* Fix MCTX. */
3027 mctx->state_log = backup_state_log;
3028 mctx->input.cur_idx = backup_cur_idx;
3029
3030 /* Then check the current node set has the node LAST_NODE. */
3031 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3032 return REG_NOERROR;
3033
3034 return REG_NOMATCH;
3035}
3036
3037/* Helper functions for check_arrival. */
3038
3039/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3040 to NEXT_NODES.
3041 TODO: This function is similar to the functions transit_state*(),
3042 however this function has many additional works.
3043 Can't we unify them? */
3044
3045static reg_errcode_t
3046internal_function
3047check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3048 re_node_set *cur_nodes, re_node_set *next_nodes)
3049{
3050 const re_dfa_t *const dfa = mctx->dfa;
3051 int result;
3052 int cur_idx;
3053#ifdef RE_ENABLE_I18N
3054 reg_errcode_t err = REG_NOERROR;
3055#endif
3056 re_node_set union_set;
3057 re_node_set_init_empty (&union_set);
3058 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3059 {
3060 int naccepted = 0;
3061 int cur_node = cur_nodes->elems[cur_idx];
3062#ifdef DEBUG
3063 re_token_type_t type = dfa->nodes[cur_node].type;
3064 assert (!IS_EPSILON_NODE (type));
3065#endif
3066#ifdef RE_ENABLE_I18N
3067 /* If the node may accept `multi byte'. */
3068 if (dfa->nodes[cur_node].accept_mb)
3069 {
3070 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3071 str_idx);
3072 if (naccepted > 1)
3073 {
3074 re_dfastate_t *dest_state;
3075 int next_node = dfa->nexts[cur_node];
3076 int next_idx = str_idx + naccepted;
3077 dest_state = mctx->state_log[next_idx];
3078 re_node_set_empty (&union_set);
3079 if (dest_state)
3080 {
3081 err = re_node_set_merge (&union_set, &dest_state->nodes);
3082 if (BE (err != REG_NOERROR, 0))
3083 {
3084 re_node_set_free (&union_set);
3085 return err;
3086 }
3087 }
3088 result = re_node_set_insert (&union_set, next_node);
3089 if (BE (result < 0, 0))
3090 {
3091 re_node_set_free (&union_set);
3092 return REG_ESPACE;
3093 }
3094 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3095 &union_set);
3096 if (BE (mctx->state_log[next_idx] == NULL
3097 && err != REG_NOERROR, 0))
3098 {
3099 re_node_set_free (&union_set);
3100 return err;
3101 }
3102 }
3103 }
3104#endif /* RE_ENABLE_I18N */
3105 if (naccepted
3106 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3107 {
3108 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3109 if (BE (result < 0, 0))
3110 {
3111 re_node_set_free (&union_set);
3112 return REG_ESPACE;
3113 }
3114 }
3115 }
3116 re_node_set_free (&union_set);
3117 return REG_NOERROR;
3118}
3119
3120/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3121 CUR_NODES, however exclude the nodes which are:
3122 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3123 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3124*/
3125
3126static reg_errcode_t
3127internal_function
3128check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3129 int ex_subexp, int type)
3130{
3131 reg_errcode_t err;
3132 int idx, outside_node;
3133 re_node_set new_nodes;
3134#ifdef DEBUG
3135 assert (cur_nodes->nelem);
3136#endif
3137 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3138 if (BE (err != REG_NOERROR, 0))
3139 return err;
3140 /* Create a new node set NEW_NODES with the nodes which are epsilon
3141 closures of the node in CUR_NODES. */
3142
3143 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3144 {
3145 int cur_node = cur_nodes->elems[idx];
3146 const re_node_set *eclosure = dfa->eclosures + cur_node;
3147 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3148 if (outside_node == -1)
3149 {
3150 /* There are no problematic nodes, just merge them. */
3151 err = re_node_set_merge (&new_nodes, eclosure);
3152 if (BE (err != REG_NOERROR, 0))
3153 {
3154 re_node_set_free (&new_nodes);
3155 return err;
3156 }
3157 }
3158 else
3159 {
3160 /* There are problematic nodes, re-calculate incrementally. */
3161 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3162 ex_subexp, type);
3163 if (BE (err != REG_NOERROR, 0))
3164 {
3165 re_node_set_free (&new_nodes);
3166 return err;
3167 }
3168 }
3169 }
3170 re_node_set_free (cur_nodes);
3171 *cur_nodes = new_nodes;
3172 return REG_NOERROR;
3173}
3174
3175/* Helper function for check_arrival_expand_ecl.
3176 Check incrementally the epsilon closure of TARGET, and if it isn't
3177 problematic append it to DST_NODES. */
3178
3179static reg_errcode_t
3180internal_function
3181check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3182 int target, int ex_subexp, int type)
3183{
3184 int cur_node;
3185 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3186 {
3187 int err;
3188
3189 if (dfa->nodes[cur_node].type == type
3190 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3191 {
3192 if (type == OP_CLOSE_SUBEXP)
3193 {
3194 err = re_node_set_insert (dst_nodes, cur_node);
3195 if (BE (err == -1, 0))
3196 return REG_ESPACE;
3197 }
3198 break;
3199 }
3200 err = re_node_set_insert (dst_nodes, cur_node);
3201 if (BE (err == -1, 0))
3202 return REG_ESPACE;
3203 if (dfa->edests[cur_node].nelem == 0)
3204 break;
3205 if (dfa->edests[cur_node].nelem == 2)
3206 {
3207 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3208 dfa->edests[cur_node].elems[1],
3209 ex_subexp, type);
3210 if (BE (err != REG_NOERROR, 0))
3211 return err;
3212 }
3213 cur_node = dfa->edests[cur_node].elems[0];
3214 }
3215 return REG_NOERROR;
3216}
3217
3218
3219/* For all the back references in the current state, calculate the
3220 destination of the back references by the appropriate entry
3221 in MCTX->BKREF_ENTS. */
3222
3223static reg_errcode_t
3224internal_function
3225expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3226 int cur_str, int subexp_num, int type)
3227{
3228 const re_dfa_t *const dfa = mctx->dfa;
3229 reg_errcode_t err;
3230 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3231 struct re_backref_cache_entry *ent;
3232
3233 if (cache_idx_start == -1)
3234 return REG_NOERROR;
3235
3236 restart:
3237 ent = mctx->bkref_ents + cache_idx_start;
3238 do
3239 {
3240 int to_idx, next_node;
3241
3242 /* Is this entry ENT is appropriate? */
3243 if (!re_node_set_contains (cur_nodes, ent->node))
3244 continue; /* No. */
3245
3246 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3247 /* Calculate the destination of the back reference, and append it
3248 to MCTX->STATE_LOG. */
3249 if (to_idx == cur_str)
3250 {
3251 /* The backreference did epsilon transit, we must re-check all the
3252 node in the current state. */
3253 re_node_set new_dests;
3254 reg_errcode_t err2, err3;
3255 next_node = dfa->edests[ent->node].elems[0];
3256 if (re_node_set_contains (cur_nodes, next_node))
3257 continue;
3258 err = re_node_set_init_1 (&new_dests, next_node);
3259 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3260 err3 = re_node_set_merge (cur_nodes, &new_dests);
3261 re_node_set_free (&new_dests);
3262 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3263 || err3 != REG_NOERROR, 0))
3264 {
3265 err = (err != REG_NOERROR ? err
3266 : (err2 != REG_NOERROR ? err2 : err3));
3267 return err;
3268 }
3269 /* TODO: It is still inefficient... */
3270 goto restart;
3271 }
3272 else
3273 {
3274 re_node_set union_set;
3275 next_node = dfa->nexts[ent->node];
3276 if (mctx->state_log[to_idx])
3277 {
3278 int ret;
3279 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3280 next_node))
3281 continue;
3282 err = re_node_set_init_copy (&union_set,
3283 &mctx->state_log[to_idx]->nodes);
3284 ret = re_node_set_insert (&union_set, next_node);
3285 if (BE (err != REG_NOERROR || ret < 0, 0))
3286 {
3287 re_node_set_free (&union_set);
3288 err = err != REG_NOERROR ? err : REG_ESPACE;
3289 return err;
3290 }
3291 }
3292 else
3293 {
3294 err = re_node_set_init_1 (&union_set, next_node);
3295 if (BE (err != REG_NOERROR, 0))
3296 return err;
3297 }
3298 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3299 re_node_set_free (&union_set);
3300 if (BE (mctx->state_log[to_idx] == NULL
3301 && err != REG_NOERROR, 0))
3302 return err;
3303 }
3304 }
3305 while (ent++->more);
3306 return REG_NOERROR;
3307}
3308
3309/* Build transition table for the state.
3310 Return 1 if succeeded, otherwise return NULL. */
3311
3312static int
3313internal_function
3314build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3315{
3316 reg_errcode_t err;
3317 int i, j, ch, need_word_trtable = 0;
3318 bitset_word_t elem, mask;
3319 bool dests_node_malloced = false;
3320 bool dest_states_malloced = false;
3321 int ndests; /* Number of the destination states from `state'. */
3322 re_dfastate_t **trtable;
3323 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3324 re_node_set follows, *dests_node;
3325 bitset_t *dests_ch;
3326 bitset_t acceptable;
3327
3328 struct dests_alloc
3329 {
3330 re_node_set dests_node[SBC_MAX];
3331 bitset_t dests_ch[SBC_MAX];
3332 } *dests_alloc;
3333
3334 /* We build DFA states which corresponds to the destination nodes
3335 from `state'. `dests_node[i]' represents the nodes which i-th
3336 destination state contains, and `dests_ch[i]' represents the
3337 characters which i-th destination state accepts. */
3338#ifdef HAVE_ALLOCA
3339 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3340 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3341 else
3342#endif
3343 {
3344 dests_alloc = re_malloc (struct dests_alloc, 1);
3345 if (BE (dests_alloc == NULL, 0))
3346 return 0;
3347 dests_node_malloced = true;
3348 }
3349 dests_node = dests_alloc->dests_node;
3350 dests_ch = dests_alloc->dests_ch;
3351
3352 /* Initialize transiton table. */
3353 state->word_trtable = state->trtable = NULL;
3354
3355 /* At first, group all nodes belonging to `state' into several
3356 destinations. */
3357 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3358 if (BE (ndests <= 0, 0))
3359 {
3360 if (dests_node_malloced)
3361 free (dests_alloc);
3362 /* Return 0 in case of an error, 1 otherwise. */
3363 if (ndests == 0)
3364 {
3365 state->trtable = (re_dfastate_t **)
3366 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3367 return 1;
3368 }
3369 return 0;
3370 }
3371
3372 err = re_node_set_alloc (&follows, ndests + 1);
3373 if (BE (err != REG_NOERROR, 0))
3374 goto out_free;
3375
3376 /* Avoid arithmetic overflow in size calculation. */
3377 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3378 / (3 * sizeof (re_dfastate_t *)))
3379 < ndests),
3380 0))
3381 goto out_free;
3382
3383#ifdef HAVE_ALLOCA
3384 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3385 + ndests * 3 * sizeof (re_dfastate_t *)))
3386 dest_states = (re_dfastate_t **)
3387 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3388 else
3389#endif
3390 {
3391 dest_states = (re_dfastate_t **)
3392 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3393 if (BE (dest_states == NULL, 0))
3394 {
3395out_free:
3396 if (dest_states_malloced)
3397 free (dest_states);
3398 re_node_set_free (&follows);
3399 for (i = 0; i < ndests; ++i)
3400 re_node_set_free (dests_node + i);
3401 if (dests_node_malloced)
3402 free (dests_alloc);
3403 return 0;
3404 }
3405 dest_states_malloced = true;
3406 }
3407 dest_states_word = dest_states + ndests;
3408 dest_states_nl = dest_states_word + ndests;
3409 bitset_empty (acceptable);
3410
3411 /* Then build the states for all destinations. */
3412 for (i = 0; i < ndests; ++i)
3413 {
3414 int next_node;
3415 re_node_set_empty (&follows);
3416 /* Merge the follows of this destination states. */
3417 for (j = 0; j < dests_node[i].nelem; ++j)
3418 {
3419 next_node = dfa->nexts[dests_node[i].elems[j]];
3420 if (next_node != -1)
3421 {
3422 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3423 if (BE (err != REG_NOERROR, 0))
3424 goto out_free;
3425 }
3426 }
3427 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3428 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3429 goto out_free;
3430 /* If the new state has context constraint,
3431 build appropriate states for these contexts. */
3432 if (dest_states[i]->has_constraint)
3433 {
3434 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3435 CONTEXT_WORD);
3436 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3437 goto out_free;
3438
3439 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3440 need_word_trtable = 1;
3441
3442 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3443 CONTEXT_NEWLINE);
3444 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3445 goto out_free;
3446 }
3447 else
3448 {
3449 dest_states_word[i] = dest_states[i];
3450 dest_states_nl[i] = dest_states[i];
3451 }
3452 bitset_merge (acceptable, dests_ch[i]);
3453 }
3454
3455 if (!BE (need_word_trtable, 0))
3456 {
3457 /* We don't care about whether the following character is a word
3458 character, or we are in a single-byte character set so we can
3459 discern by looking at the character code: allocate a
3460 256-entry transition table. */
3461 trtable = state->trtable =
3462 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3463 if (BE (trtable == NULL, 0))
3464 goto out_free;
3465
3466 /* For all characters ch...: */
3467 for (i = 0; i < BITSET_WORDS; ++i)
3468 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3469 elem;
3470 mask <<= 1, elem >>= 1, ++ch)
3471 if (BE (elem & 1, 0))
3472 {
3473 /* There must be exactly one destination which accepts
3474 character ch. See group_nodes_into_DFAstates. */
3475 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3476 ;
3477
3478 /* j-th destination accepts the word character ch. */
3479 if (dfa->word_char[i] & mask)
3480 trtable[ch] = dest_states_word[j];
3481 else
3482 trtable[ch] = dest_states[j];
3483 }
3484 }
3485 else
3486 {
3487 /* We care about whether the following character is a word
3488 character, and we are in a multi-byte character set: discern
3489 by looking at the character code: build two 256-entry
3490 transition tables, one starting at trtable[0] and one
3491 starting at trtable[SBC_MAX]. */
3492 trtable = state->word_trtable =
3493 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3494 if (BE (trtable == NULL, 0))
3495 goto out_free;
3496
3497 /* For all characters ch...: */
3498 for (i = 0; i < BITSET_WORDS; ++i)
3499 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3500 elem;
3501 mask <<= 1, elem >>= 1, ++ch)
3502 if (BE (elem & 1, 0))
3503 {
3504 /* There must be exactly one destination which accepts
3505 character ch. See group_nodes_into_DFAstates. */
3506 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3507 ;
3508
3509 /* j-th destination accepts the word character ch. */
3510 trtable[ch] = dest_states[j];
3511 trtable[ch + SBC_MAX] = dest_states_word[j];
3512 }
3513 }
3514
3515 /* new line */
3516 if (bitset_contain (acceptable, NEWLINE_CHAR))
3517 {
3518 /* The current state accepts newline character. */
3519 for (j = 0; j < ndests; ++j)
3520 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3521 {
3522 /* k-th destination accepts newline character. */
3523 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3524 if (need_word_trtable)
3525 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3526 /* There must be only one destination which accepts
3527 newline. See group_nodes_into_DFAstates. */
3528 break;
3529 }
3530 }
3531
3532 if (dest_states_malloced)
3533 free (dest_states);
3534
3535 re_node_set_free (&follows);
3536 for (i = 0; i < ndests; ++i)
3537 re_node_set_free (dests_node + i);
3538
3539 if (dests_node_malloced)
3540 free (dests_alloc);
3541
3542 return 1;
3543}
3544
3545/* Group all nodes belonging to STATE into several destinations.
3546 Then for all destinations, set the nodes belonging to the destination
3547 to DESTS_NODE[i] and set the characters accepted by the destination
3548 to DEST_CH[i]. This function return the number of destinations. */
3549
3550static int
3551internal_function
3552group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3553 re_node_set *dests_node, bitset_t *dests_ch)
3554{
3555 reg_errcode_t err;
3556 int result;
3557 int i, j, k;
3558 int ndests; /* Number of the destinations from `state'. */
3559 bitset_t accepts; /* Characters a node can accept. */
3560 const re_node_set *cur_nodes = &state->nodes;
3561 bitset_empty (accepts);
3562 ndests = 0;
3563
3564 /* For all the nodes belonging to `state', */
3565 for (i = 0; i < cur_nodes->nelem; ++i)
3566 {
3567 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3568 re_token_type_t type = node->type;
3569 unsigned int constraint = node->constraint;
3570
3571 /* Enumerate all single byte character this node can accept. */
3572 if (type == CHARACTER)
3573 bitset_set (accepts, node->opr.c);
3574 else if (type == SIMPLE_BRACKET)
3575 {
3576 bitset_merge (accepts, node->opr.sbcset);
3577 }
3578 else if (type == OP_PERIOD)
3579 {
3580#ifdef RE_ENABLE_I18N
3581 if (dfa->mb_cur_max > 1)
3582 bitset_merge (accepts, dfa->sb_char);
3583 else
3584#endif
3585 bitset_set_all (accepts);
3586 if (!(dfa->syntax & RE_DOT_NEWLINE))
3587 bitset_clear (accepts, '\n');
3588 if (dfa->syntax & RE_DOT_NOT_NULL)
3589 bitset_clear (accepts, '\0');
3590 }
3591#ifdef RE_ENABLE_I18N
3592 else if (type == OP_UTF8_PERIOD)
3593 {
3594 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3595 if (!(dfa->syntax & RE_DOT_NEWLINE))
3596 bitset_clear (accepts, '\n');
3597 if (dfa->syntax & RE_DOT_NOT_NULL)
3598 bitset_clear (accepts, '\0');
3599 }
3600#endif
3601 else
3602 continue;
3603
3604 /* Check the `accepts' and sift the characters which are not
3605 match it the context. */
3606 if (constraint)
3607 {
3608 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3609 {
3610 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3611 bitset_empty (accepts);
3612 if (accepts_newline)
3613 bitset_set (accepts, NEWLINE_CHAR);
3614 else
3615 continue;
3616 }
3617 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3618 {
3619 bitset_empty (accepts);
3620 continue;
3621 }
3622
3623 if (constraint & NEXT_WORD_CONSTRAINT)
3624 {
3625 bitset_word_t any_set = 0;
3626 if (type == CHARACTER && !node->word_char)
3627 {
3628 bitset_empty (accepts);
3629 continue;
3630 }
3631#ifdef RE_ENABLE_I18N
3632 if (dfa->mb_cur_max > 1)
3633 for (j = 0; j < BITSET_WORDS; ++j)
3634 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3635 else
3636#endif
3637 for (j = 0; j < BITSET_WORDS; ++j)
3638 any_set |= (accepts[j] &= dfa->word_char[j]);
3639 if (!any_set)
3640 continue;
3641 }
3642 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3643 {
3644 bitset_word_t any_set = 0;
3645 if (type == CHARACTER && node->word_char)
3646 {
3647 bitset_empty (accepts);
3648 continue;
3649 }
3650#ifdef RE_ENABLE_I18N
3651 if (dfa->mb_cur_max > 1)
3652 for (j = 0; j < BITSET_WORDS; ++j)
3653 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3654 else
3655#endif
3656 for (j = 0; j < BITSET_WORDS; ++j)
3657 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3658 if (!any_set)
3659 continue;
3660 }
3661 }
3662
3663 /* Then divide `accepts' into DFA states, or create a new
3664 state. Above, we make sure that accepts is not empty. */
3665 for (j = 0; j < ndests; ++j)
3666 {
3667 bitset_t intersec; /* Intersection sets, see below. */
3668 bitset_t remains;
3669 /* Flags, see below. */
3670 bitset_word_t has_intersec, not_subset, not_consumed;
3671
3672 /* Optimization, skip if this state doesn't accept the character. */
3673 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3674 continue;
3675
3676 /* Enumerate the intersection set of this state and `accepts'. */
3677 has_intersec = 0;
3678 for (k = 0; k < BITSET_WORDS; ++k)
3679 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3680 /* And skip if the intersection set is empty. */
3681 if (!has_intersec)
3682 continue;
3683
3684 /* Then check if this state is a subset of `accepts'. */
3685 not_subset = not_consumed = 0;
3686 for (k = 0; k < BITSET_WORDS; ++k)
3687 {
3688 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3689 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3690 }
3691
3692 /* If this state isn't a subset of `accepts', create a
3693 new group state, which has the `remains'. */
3694 if (not_subset)
3695 {
3696 bitset_copy (dests_ch[ndests], remains);
3697 bitset_copy (dests_ch[j], intersec);
3698 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3699 if (BE (err != REG_NOERROR, 0))
3700 goto error_return;
3701 ++ndests;
3702 }
3703
3704 /* Put the position in the current group. */
3705 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3706 if (BE (result < 0, 0))
3707 goto error_return;
3708
3709 /* If all characters are consumed, go to next node. */
3710 if (!not_consumed)
3711 break;
3712 }
3713 /* Some characters remain, create a new group. */
3714 if (j == ndests)
3715 {
3716 bitset_copy (dests_ch[ndests], accepts);
3717 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3718 if (BE (err != REG_NOERROR, 0))
3719 goto error_return;
3720 ++ndests;
3721 bitset_empty (accepts);
3722 }
3723 }
3724 return ndests;
3725 error_return:
3726 for (j = 0; j < ndests; ++j)
3727 re_node_set_free (dests_node + j);
3728 return -1;
3729}
3730
3731#ifdef RE_ENABLE_I18N
3732/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3733 Return the number of the bytes the node accepts.
3734 STR_IDX is the current index of the input string.
3735
3736 This function handles the nodes which can accept one character, or
3737 one collating element like '.', '[a-z]', opposite to the other nodes
3738 can only accept one byte. */
3739
3740static int
3741internal_function
3742check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3743 const re_string_t *input, int str_idx)
3744{
3745 const re_token_t *node = dfa->nodes + node_idx;
3746 int char_len, elem_len;
3747 int i;
3748 wint_t wc;
3749
3750 if (BE (node->type == OP_UTF8_PERIOD, 0))
3751 {
3752 unsigned char c = re_string_byte_at (input, str_idx), d;
3753 if (BE (c < 0xc2, 1))
3754 return 0;
3755
3756 if (str_idx + 2 > input->len)
3757 return 0;
3758
3759 d = re_string_byte_at (input, str_idx + 1);
3760 if (c < 0xe0)
3761 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3762 else if (c < 0xf0)
3763 {
3764 char_len = 3;
3765 if (c == 0xe0 && d < 0xa0)
3766 return 0;
3767 }
3768 else if (c < 0xf8)
3769 {
3770 char_len = 4;
3771 if (c == 0xf0 && d < 0x90)
3772 return 0;
3773 }
3774 else if (c < 0xfc)
3775 {
3776 char_len = 5;
3777 if (c == 0xf8 && d < 0x88)
3778 return 0;
3779 }
3780 else if (c < 0xfe)
3781 {
3782 char_len = 6;
3783 if (c == 0xfc && d < 0x84)
3784 return 0;
3785 }
3786 else
3787 return 0;
3788
3789 if (str_idx + char_len > input->len)
3790 return 0;
3791
3792 for (i = 1; i < char_len; ++i)
3793 {
3794 d = re_string_byte_at (input, str_idx + i);
3795 if (d < 0x80 || d > 0xbf)
3796 return 0;
3797 }
3798 return char_len;
3799 }
3800
3801 char_len = re_string_char_size_at (input, str_idx);
3802 if (node->type == OP_PERIOD)
3803 {
3804 if (char_len <= 1)
3805 return 0;
3806 /* FIXME: I don't think this if is needed, as both '\n'
3807 and '\0' are char_len == 1. */
3808 /* '.' accepts any one character except the following two cases. */
3809 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3810 re_string_byte_at (input, str_idx) == '\n') ||
3811 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3812 re_string_byte_at (input, str_idx) == '\0'))
3813 return 0;
3814 return char_len;
3815 }
3816
3817 elem_len = re_string_elem_size_at (input, str_idx);
3818 wc = __btowc(*(input->mbs+str_idx));
3819 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3820 return 0;
3821
3822 if (node->type == COMPLEX_BRACKET)
3823 {
3824 const re_charset_t *cset = node->opr.mbcset;
3825# ifdef _LIBC
3826 const unsigned char *pin
3827 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3828 int j;
3829 uint32_t nrules;
3830# endif /* _LIBC */
3831 int match_len = 0;
3832 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3833 ? re_string_wchar_at (input, str_idx) : 0);
3834
3835 /* match with multibyte character? */
3836 for (i = 0; i < cset->nmbchars; ++i)
3837 if (wc == cset->mbchars[i])
3838 {
3839 match_len = char_len;
3840 goto check_node_accept_bytes_match;
3841 }
3842 /* match with character_class? */
3843 for (i = 0; i < cset->nchar_classes; ++i)
3844 {
3845 wctype_t wt = cset->char_classes[i];
3846 if (__iswctype (wc, wt))
3847 {
3848 match_len = char_len;
3849 goto check_node_accept_bytes_match;
3850 }
3851 }
3852
3853# ifdef _LIBC
3854 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3855 if (nrules != 0)
3856 {
3857 unsigned int in_collseq = 0;
3858 const int32_t *table, *indirect;
3859 const unsigned char *weights, *extra;
3860 const char *collseqwc;
3861 /* This #include defines a local function! */
3862# include <locale/weight.h>
3863
3864 /* match with collating_symbol? */
3865 if (cset->ncoll_syms)
3866 extra = (const unsigned char *)
3867 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3868 for (i = 0; i < cset->ncoll_syms; ++i)
3869 {
3870 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3871 /* Compare the length of input collating element and
3872 the length of current collating element. */
3873 if (*coll_sym != elem_len)
3874 continue;
3875 /* Compare each bytes. */
3876 for (j = 0; j < *coll_sym; j++)
3877 if (pin[j] != coll_sym[1 + j])
3878 break;
3879 if (j == *coll_sym)
3880 {
3881 /* Match if every bytes is equal. */
3882 match_len = j;
3883 goto check_node_accept_bytes_match;
3884 }
3885 }
3886
3887 if (cset->nranges)
3888 {
3889 if (elem_len <= char_len)
3890 {
3891 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3892 in_collseq = __collseq_table_lookup (collseqwc, wc);
3893 }
3894 else
3895 in_collseq = find_collation_sequence_value (pin, elem_len);
3896 }
3897 /* match with range expression? */
3898 for (i = 0; i < cset->nranges; ++i)
3899 if (cset->range_starts[i] <= in_collseq
3900 && in_collseq <= cset->range_ends[i])
3901 {
3902 match_len = elem_len;
3903 goto check_node_accept_bytes_match;
3904 }
3905
3906 /* match with equivalence_class? */
3907 if (cset->nequiv_classes)
3908 {
3909 const unsigned char *cp = pin;
3910 table = (const int32_t *)
3911 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3912 weights = (const unsigned char *)
3913 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3914 extra = (const unsigned char *)
3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3916 indirect = (const int32_t *)
3917 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3918 int32_t idx = findidx (&cp);
3919 if (idx > 0)
3920 for (i = 0; i < cset->nequiv_classes; ++i)
3921 {
3922 int32_t equiv_class_idx = cset->equiv_classes[i];
3923 size_t weight_len = weights[idx & 0xffffff];
3924 if (weight_len == weights[equiv_class_idx & 0xffffff]
3925 && (idx >> 24) == (equiv_class_idx >> 24))
3926 {
3927 int cnt = 0;
3928
3929 idx &= 0xffffff;
3930 equiv_class_idx &= 0xffffff;
3931
3932 while (cnt <= weight_len
3933 && (weights[equiv_class_idx + 1 + cnt]
3934 == weights[idx + 1 + cnt]))
3935 ++cnt;
3936 if (cnt > weight_len)
3937 {
3938 match_len = elem_len;
3939 goto check_node_accept_bytes_match;
3940 }
3941 }
3942 }
3943 }
3944 }
3945 else
3946# endif /* _LIBC */
3947 {
3948 /* match with range expression? */
3949#if __GNUC__ >= 2
3950 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3951#else
3952 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3953 cmp_buf[2] = wc;
3954#endif
3955 for (i = 0; i < cset->nranges; ++i)
3956 {
3957 cmp_buf[0] = cset->range_starts[i];
3958 cmp_buf[4] = cset->range_ends[i];
3959 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3960 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3961 {
3962 match_len = char_len;
3963 goto check_node_accept_bytes_match;
3964 }
3965 }
3966 }
3967 check_node_accept_bytes_match:
3968 if (!cset->non_match)
3969 return match_len;
3970 else
3971 {
3972 if (match_len > 0)
3973 return 0;
3974 else
3975 return (elem_len > char_len) ? elem_len : char_len;
3976 }
3977 }
3978 return 0;
3979}
3980
3981# ifdef _LIBC
3982static unsigned int
3983internal_function
3984find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3985{
3986 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3987 if (nrules == 0)
3988 {
3989 if (mbs_len == 1)
3990 {
3991 /* No valid character. Match it as a single byte character. */
3992 const unsigned char *collseq = (const unsigned char *)
3993 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3994 return collseq[mbs[0]];
3995 }
3996 return UINT_MAX;
3997 }
3998 else
3999 {
4000 int32_t idx;
4001 const unsigned char *extra = (const unsigned char *)
4002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4003 int32_t extrasize = (const unsigned char *)
4004 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4005
4006 for (idx = 0; idx < extrasize;)
4007 {
4008 int mbs_cnt, found = 0;
4009 int32_t elem_mbs_len;
4010 /* Skip the name of collating element name. */
4011 idx = idx + extra[idx] + 1;
4012 elem_mbs_len = extra[idx++];
4013 if (mbs_len == elem_mbs_len)
4014 {
4015 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4016 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4017 break;
4018 if (mbs_cnt == elem_mbs_len)
4019 /* Found the entry. */
4020 found = 1;
4021 }
4022 /* Skip the byte sequence of the collating element. */
4023 idx += elem_mbs_len;
4024 /* Adjust for the alignment. */
4025 idx = (idx + 3) & ~3;
4026 /* Skip the collation sequence value. */
4027 idx += sizeof (uint32_t);
4028 /* Skip the wide char sequence of the collating element. */
4029 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4030 /* If we found the entry, return the sequence value. */
4031 if (found)
4032 return *(uint32_t *) (extra + idx);
4033 /* Skip the collation sequence value. */
4034 idx += sizeof (uint32_t);
4035 }
4036 return UINT_MAX;
4037 }
4038}
4039# endif /* _LIBC */
4040#endif /* RE_ENABLE_I18N */
4041
4042/* Check whether the node accepts the byte which is IDX-th
4043 byte of the INPUT. */
4044
4045static int
4046internal_function
4047check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4048 int idx)
4049{
4050 unsigned char ch;
4051 ch = re_string_byte_at (&mctx->input, idx);
4052 switch (node->type)
4053 {
4054 case CHARACTER:
4055 if (node->opr.c != ch)
4056 return 0;
4057 break;
4058
4059 case SIMPLE_BRACKET:
4060 if (!bitset_contain (node->opr.sbcset, ch))
4061 return 0;
4062 break;
4063
4064#ifdef RE_ENABLE_I18N
4065 case OP_UTF8_PERIOD:
4066 if (ch >= 0x80)
4067 return 0;
4068 /* FALLTHROUGH */
4069#endif
4070 case OP_PERIOD:
4071 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4072 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4073 return 0;
4074 break;
4075
4076 default:
4077 return 0;
4078 }
4079
4080 if (node->constraint)
4081 {
4082 /* The node has constraints. Check whether the current context
4083 satisfies the constraints. */
4084 unsigned int context = re_string_context_at (&mctx->input, idx,
4085 mctx->eflags);
4086 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4087 return 0;
4088 }
4089
4090 return 1;
4091}
4092
4093/* Extend the buffers, if the buffers have run out. */
4094
4095static reg_errcode_t
4096internal_function
4097extend_buffers (re_match_context_t *mctx)
4098{
4099 reg_errcode_t ret;
4100 re_string_t *pstr = &mctx->input;
4101
4102 /* Avoid overflow. */
4103 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4104 return REG_ESPACE;
4105
4106 /* Double the lengthes of the buffers. */
4107 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4108 if (BE (ret != REG_NOERROR, 0))
4109 return ret;
4110
4111 if (mctx->state_log != NULL)
4112 {
4113 /* And double the length of state_log. */
4114 /* XXX We have no indication of the size of this buffer. If this
4115 allocation fail we have no indication that the state_log array
4116 does not have the right size. */
4117 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4118 pstr->bufs_len + 1);
4119 if (BE (new_array == NULL, 0))
4120 return REG_ESPACE;
4121 mctx->state_log = new_array;
4122 }
4123
4124 /* Then reconstruct the buffers. */
4125 if (pstr->icase)
4126 {
4127#ifdef RE_ENABLE_I18N
4128 if (pstr->mb_cur_max > 1)
4129 {
4130 ret = build_wcs_upper_buffer (pstr);
4131 if (BE (ret != REG_NOERROR, 0))
4132 return ret;
4133 }
4134 else
4135#endif /* RE_ENABLE_I18N */
4136 build_upper_buffer (pstr);
4137 }
4138 else
4139 {
4140#ifdef RE_ENABLE_I18N
4141 if (pstr->mb_cur_max > 1)
4142 build_wcs_buffer (pstr);
4143 else
4144#endif /* RE_ENABLE_I18N */
4145 {
4146 if (pstr->trans != NULL)
4147 re_string_translate_buffer (pstr);
4148 }
4149 }
4150 return REG_NOERROR;
4151}
4152
4153\f
4154/* Functions for matching context. */
4155
4156/* Initialize MCTX. */
4157
4158static reg_errcode_t
4159internal_function
4160match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4161{
4162 mctx->eflags = eflags;
4163 mctx->match_last = -1;
4164 if (n > 0)
4165 {
4166 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4167 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4168 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4169 return REG_ESPACE;
4170 }
4171 /* Already zero-ed by the caller.
4172 else
4173 mctx->bkref_ents = NULL;
4174 mctx->nbkref_ents = 0;
4175 mctx->nsub_tops = 0; */
4176 mctx->abkref_ents = n;
4177 mctx->max_mb_elem_len = 1;
4178 mctx->asub_tops = n;
4179 return REG_NOERROR;
4180}
4181
4182/* Clean the entries which depend on the current input in MCTX.
4183 This function must be invoked when the matcher changes the start index
4184 of the input, or changes the input string. */
4185
4186static void
4187internal_function
4188match_ctx_clean (re_match_context_t *mctx)
4189{
4190 int st_idx;
4191 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4192 {
4193 int sl_idx;
4194 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4195 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4196 {
4197 re_sub_match_last_t *last = top->lasts[sl_idx];
4198 re_free (last->path.array);
4199 re_free (last);
4200 }
4201 re_free (top->lasts);
4202 if (top->path)
4203 {
4204 re_free (top->path->array);
4205 re_free (top->path);
4206 }
4207 free (top);
4208 }
4209
4210 mctx->nsub_tops = 0;
4211 mctx->nbkref_ents = 0;
4212}
4213
4214/* Free all the memory associated with MCTX. */
4215
4216static void
4217internal_function
4218match_ctx_free (re_match_context_t *mctx)
4219{
4220 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4221 match_ctx_clean (mctx);
4222 re_free (mctx->sub_tops);
4223 re_free (mctx->bkref_ents);
4224}
4225
4226/* Add a new backreference entry to MCTX.
4227 Note that we assume that caller never call this function with duplicate
4228 entry, and call with STR_IDX which isn't smaller than any existing entry.
4229*/
4230
4231static reg_errcode_t
4232internal_function
4233match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4234 int to)
4235{
4236 if (mctx->nbkref_ents >= mctx->abkref_ents)
4237 {
4238 struct re_backref_cache_entry* new_entry;
4239 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4240 mctx->abkref_ents * 2);
4241 if (BE (new_entry == NULL, 0))
4242 {
4243 re_free (mctx->bkref_ents);
4244 return REG_ESPACE;
4245 }
4246 mctx->bkref_ents = new_entry;
4247 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4248 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4249 mctx->abkref_ents *= 2;
4250 }
4251 if (mctx->nbkref_ents > 0
4252 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4253 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4254
4255 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4256 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4257 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4258 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4259
4260 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4261 If bit N is clear, means that this entry won't epsilon-transition to
4262 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4263 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4264 such node.
4265
4266 A backreference does not epsilon-transition unless it is empty, so set
4267 to all zeros if FROM != TO. */
4268 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4269 = (from == to ? ~0 : 0);
4270
4271 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4272 if (mctx->max_mb_elem_len < to - from)
4273 mctx->max_mb_elem_len = to - from;
4274 return REG_NOERROR;
4275}
4276
4277/* Search for the first entry which has the same str_idx, or -1 if none is
4278 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4279
4280static int
4281internal_function
4282search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4283{
4284 int left, right, mid, last;
4285 last = right = mctx->nbkref_ents;
4286 for (left = 0; left < right;)
4287 {
4288 mid = (left + right) / 2;
4289 if (mctx->bkref_ents[mid].str_idx < str_idx)
4290 left = mid + 1;
4291 else
4292 right = mid;
4293 }
4294 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4295 return left;
4296 else
4297 return -1;
4298}
4299
4300/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4301 at STR_IDX. */
4302
4303static reg_errcode_t
4304internal_function
4305match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4306{
4307#ifdef DEBUG
4308 assert (mctx->sub_tops != NULL);
4309 assert (mctx->asub_tops > 0);
4310#endif
4311 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4312 {
4313 int new_asub_tops = mctx->asub_tops * 2;
4314 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4315 re_sub_match_top_t *,
4316 new_asub_tops);
4317 if (BE (new_array == NULL, 0))
4318 return REG_ESPACE;
4319 mctx->sub_tops = new_array;
4320 mctx->asub_tops = new_asub_tops;
4321 }
4322 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4323 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4324 return REG_ESPACE;
4325 mctx->sub_tops[mctx->nsub_tops]->node = node;
4326 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4327 return REG_NOERROR;
4328}
4329
4330/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4331 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4332
4333static re_sub_match_last_t *
4334internal_function
4335match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4336{
4337 re_sub_match_last_t *new_entry;
4338 if (BE (subtop->nlasts == subtop->alasts, 0))
4339 {
4340 int new_alasts = 2 * subtop->alasts + 1;
4341 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4342 re_sub_match_last_t *,
4343 new_alasts);
4344 if (BE (new_array == NULL, 0))
4345 return NULL;
4346 subtop->lasts = new_array;
4347 subtop->alasts = new_alasts;
4348 }
4349 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4350 if (BE (new_entry != NULL, 1))
4351 {
4352 subtop->lasts[subtop->nlasts] = new_entry;
4353 new_entry->node = node;
4354 new_entry->str_idx = str_idx;
4355 ++subtop->nlasts;
4356 }
4357 return new_entry;
4358}
4359
4360static void
4361internal_function
4362sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4363 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4364{
4365 sctx->sifted_states = sifted_sts;
4366 sctx->limited_states = limited_sts;
4367 sctx->last_node = last_node;
4368 sctx->last_str_idx = last_str_idx;
4369 re_node_set_init_empty (&sctx->limits);
4370}