From: Junio C Hamano Date: Tue, 19 Jul 2016 20:22:17 +0000 (-0700) Subject: Merge branch 'nd/icase' X-Git-Tag: v2.10.0-rc0~109 X-Git-Url: https://git.lorimer.id.au/gitweb.git/diff_plain/a883c31af66195556a775f75851f46573c98e43d?hp=-c Merge branch 'nd/icase' "git grep -i" has been taught to fold case in non-ascii locales correctly. * nd/icase: grep.c: reuse "icase" variable diffcore-pickaxe: support case insensitive match on non-ascii diffcore-pickaxe: Add regcomp_or_die() grep/pcre: support utf-8 gettext: add is_utf8_locale() grep/pcre: prepare locale-dependent tables for icase matching grep: rewrite an if/else condition to avoid duplicate expression grep/icase: avoid kwsset when -F is specified grep/icase: avoid kwsset on literal non-ascii strings test-regex: expose full regcomp() to the command line test-regex: isolate the bug test code grep: break down an "if" stmt in preparation for next changes --- a883c31af66195556a775f75851f46573c98e43d diff --combined grep.c index 1e15b6292d,906406a182..394c8569db --- a/grep.c +++ b/grep.c @@@ -4,6 -4,8 +4,8 @@@ #include "xdiff-interface.h" #include "diff.h" #include "diffcore.h" + #include "commit.h" + #include "quote.h" static int grep_source_load(struct grep_source *gs); static int grep_source_is_binary(struct grep_source *gs); @@@ -322,11 -324,16 +324,16 @@@ static void compile_pcre_regexp(struct int erroffset; int options = PCRE_MULTILINE; - if (opt->ignore_case) + if (opt->ignore_case) { + if (has_non_ascii(p->pattern)) + p->pcre_tables = pcre_maketables(); options |= PCRE_CASELESS; + } + if (is_utf8_locale() && has_non_ascii(p->pattern)) + options |= PCRE_UTF8; p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset, - NULL); + p->pcre_tables); if (!p->pcre_regexp) compile_regexp_failed(p, error); @@@ -360,6 -367,7 +367,7 @@@ static void free_pcre_regexp(struct gre { pcre_free(p->pcre_regexp); pcre_free(p->pcre_extra_info); + pcre_free((void *)p->pcre_tables); } #else /* !USE_LIBPCRE */ static void 
compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) @@@ -396,26 -404,68 +404,68 @@@ static int is_fixed(const char *s, size return 1; } + static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt) + { + struct strbuf sb = STRBUF_INIT; + int err; + int regflags; + + basic_regex_quote_buf(&sb, p->pattern); + regflags = opt->regflags & ~REG_EXTENDED; + if (opt->ignore_case) + regflags |= REG_ICASE; + err = regcomp(&p->regexp, sb.buf, regflags); + if (opt->debug) + fprintf(stderr, "fixed %s\n", sb.buf); + strbuf_release(&sb); + if (err) { + char errbuf[1024]; + regerror(err, &p->regexp, errbuf, sizeof(errbuf)); + regfree(&p->regexp); + compile_regexp_failed(p, errbuf); + } + } + static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { + int icase, ascii_only; int err; p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; - + icase = opt->regflags & REG_ICASE || p->ignore_case; + ascii_only = !has_non_ascii(p->pattern); + + /* + * Even when -F (fixed) asks us to do a non-regexp search, we + * may not be able to correctly case-fold when -i + * (ignore-case) is asked (in which case, we'll synthesize a + * regexp to match the pattern that matches regexp special + * characters literally, while ignoring case differences). On + * the other hand, even without -F, if the pattern does not + * have any regexp special characters and there is no need for + * case-folding search, we can internally turn it into a + * simple string match using kws. p->fixed tells us if we + * want to use kws. + */ if (opt->fixed || is_fixed(p->pattern, p->patternlen)) - p->fixed = 1; + p->fixed = !icase || ascii_only; else p->fixed = 0; if (p->fixed) { - if (opt->regflags & REG_ICASE || p->ignore_case) - p->kws = kwsalloc(tolower_trans_tbl); - else - p->kws = kwsalloc(NULL); + p->kws = kwsalloc(icase ? 
tolower_trans_tbl : NULL); kwsincr(p->kws, p->pattern, p->patternlen); kwsprep(p->kws); return; + } else if (opt->fixed) { + /* + * We come here when the pattern has the non-ascii + * characters we cannot case-fold, and asked to + * ignore-case. + */ + compile_fixed_regexp(p, opt); + return; } if (opt->pcre) { @@@ -1396,17 -1446,9 +1446,17 @@@ static int fill_textconv_grep(struct us return 0; } +static int is_empty_line(const char *bol, const char *eol) +{ + while (bol < eol && isspace(*bol)) + bol++; + return bol == eol; +} + static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits) { char *bol; + char *peek_bol = NULL; unsigned long left; unsigned lno = 1; unsigned last_hit = 0; @@@ -1551,24 -1593,8 +1601,24 @@@ show_function = 1; goto next_line; } - if (show_function && match_funcname(opt, gs, bol, eol)) - show_function = 0; + if (show_function && (!peek_bol || peek_bol < bol)) { + unsigned long peek_left = left; + char *peek_eol = eol; + + /* + * Trailing empty lines are not interesting. + * Peek past them to see if they belong to the + * body of the current function. 
+ */ + peek_bol = bol; + while (is_empty_line(peek_bol, peek_eol)) { + peek_bol = peek_eol + 1; + peek_eol = end_of_line(peek_bol, &peek_left); + } + + if (match_funcname(opt, gs, peek_bol, peek_eol)) + show_function = 0; + } if (show_function || (last_hit && lno <= last_hit + opt->post_context)) { /* If the last hit is within the post context, @@@ -1756,7 -1782,7 +1806,7 @@@ static int grep_source_load_file(struc if (lstat(filename, &st) < 0) { err_ret: if (errno != ENOENT) - error(_("'%s': %s"), filename, strerror(errno)); + error_errno(_("failed to stat '%s'"), filename); return -1; } if (!S_ISREG(st.st_mode)) @@@ -1765,14 -1791,15 +1815,14 @@@ i = open(filename, O_RDONLY); if (i < 0) goto err_ret; - data = xmalloc(size + 1); + data = xmallocz(size); if (st.st_size != read_in_full(i, data, size)) { - error(_("'%s': short read %s"), filename, strerror(errno)); + error_errno(_("'%s': short read"), filename); close(i); free(data); return -1; } close(i); - data[size] = 0; gs->buf = data; gs->size = size; diff --combined quote.c index b281a8fe45,c67adb718c..53b98a5b84 --- a/quote.c +++ b/quote.c @@@ -43,19 -43,6 +43,19 @@@ void sq_quote_buf(struct strbuf *dst, c free(to_free); } +void sq_quotef(struct strbuf *dst, const char *fmt, ...) 
+{ + struct strbuf src = STRBUF_INIT; + + va_list ap; + va_start(ap, fmt); + strbuf_vaddf(&src, fmt, ap); + va_end(ap); + + sq_quote_buf(dst, src.buf); + strbuf_release(&src); +} + void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen) { int i; @@@ -453,3 -440,40 +453,40 @@@ void tcl_quote_buf(struct strbuf *sb, c } strbuf_addch(sb, '"'); } + + void basic_regex_quote_buf(struct strbuf *sb, const char *src) + { + char c; + + if (*src == '^') { + /* only beginning '^' is special and needs quoting */ + strbuf_addch(sb, '\\'); + strbuf_addch(sb, *src++); + } + if (*src == '*') + /* beginning '*' is not special, no quoting */ + strbuf_addch(sb, *src++); + + while ((c = *src++)) { + switch (c) { + case '[': + case '.': + case '\\': + case '*': + strbuf_addch(sb, '\\'); + strbuf_addch(sb, c); + break; + + case '$': + /* only the end '$' is special and needs quoting */ + if (*src == '\0') + strbuf_addch(sb, '\\'); + strbuf_addch(sb, c); + break; + + default: + strbuf_addch(sb, c); + break; + } + } + } diff --combined quote.h index 6c53a2cc66,362d315bec..66f5644aa2 --- a/quote.h +++ b/quote.h @@@ -25,13 -25,10 +25,13 @@@ struct strbuf * sq_quote_buf() writes to an existing buffer of specified size; it * will return the number of characters that would have been written * excluding the final null regardless of the buffer size. + * + * sq_quotef() quotes the entire formatted string as a single result. 
*/ extern void sq_quote_buf(struct strbuf *, const char *src); extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen); +extern void sq_quotef(struct strbuf *, const char *fmt, ...); /* This unwraps what sq_quote() produces in place, but returns * NULL if the input does not look like what sq_quote would have @@@ -70,5 -67,6 +70,6 @@@ extern char *quote_path_relative(const extern void perl_quote_buf(struct strbuf *sb, const char *src); extern void python_quote_buf(struct strbuf *sb, const char *src); extern void tcl_quote_buf(struct strbuf *sb, const char *src); + extern void basic_regex_quote_buf(struct strbuf *sb, const char *src); #endif diff --combined t/helper/test-regex.c index 0dc598ecdc,0000000000..eff26f534f mode 100644,000000..100644 --- a/t/helper/test-regex.c +++ b/t/helper/test-regex.c @@@ -1,20 -1,0 +1,75 @@@ +#include "git-compat-util.h" ++#include "gettext.h" + - int main(int argc, char **argv) ++struct reg_flag { ++ const char *name; ++ int flag; ++}; ++ ++static struct reg_flag reg_flags[] = { ++ { "EXTENDED", REG_EXTENDED }, ++ { "NEWLINE", REG_NEWLINE }, ++ { "ICASE", REG_ICASE }, ++ { "NOTBOL", REG_NOTBOL }, ++#ifdef REG_STARTEND ++ { "STARTEND", REG_STARTEND }, ++#endif ++ { NULL, 0 } ++}; ++ ++static int test_regex_bug(void) +{ + char *pat = "[^={} \t]+"; + char *str = "={}\nfred"; + regex_t r; + regmatch_t m[1]; + + if (regcomp(&r, pat, REG_EXTENDED | REG_NEWLINE)) + die("failed regcomp() for pattern '%s'", pat); + if (regexec(&r, str, 1, m, 0)) + die("no match of pattern '%s' to string '%s'", pat, str); + + /* http://sourceware.org/bugzilla/show_bug.cgi?id=3957 */ + if (m[0].rm_so == 3) /* matches '\n' when it should not */ + die("regex bug confirmed: re-build git with NO_REGEX=1"); + - exit(0); ++ return 0; ++} ++ ++int main(int argc, char **argv) ++{ ++ const char *pat; ++ const char *str; ++ int flags = 0; ++ regex_t r; ++ regmatch_t m[1]; ++ ++ if (argc == 2 && !strcmp(argv[1], "--bug")) ++ return test_regex_bug(); 
++ else if (argc < 3) ++ usage("test-regex --bug\n" ++ "test-regex <pattern> <string> [<options>]"); ++ ++ argv++; ++ pat = *argv++; ++ str = *argv++; ++ while (*argv) { ++ struct reg_flag *rf; ++ for (rf = reg_flags; rf->name; rf++) ++ if (!strcmp(*argv, rf->name)) { ++ flags |= rf->flag; ++ break; ++ } ++ if (!rf->name) ++ die("do not recognize %s", *argv); ++ argv++; ++ } ++ git_setup_gettext(); ++ ++ if (regcomp(&r, pat, flags)) ++ die("failed regcomp() for pattern '%s'", pat); ++ if (regexec(&r, str, 1, m, 0)) ++ return 1; ++ ++ return 0; +}