grep/icase: avoid kwsset when -F is specified
authorNguyễn Thái Ngọc Duy <pclouds@gmail.com>
Sat, 25 Jun 2016 05:22:31 +0000 (07:22 +0200)
committerJunio C Hamano <gitster@pobox.com>
Fri, 1 Jul 2016 19:44:30 +0000 (12:44 -0700)
Similar to the previous commit, we can't use kws on icase search
outside ascii range. But we can't simply pass the pattern to
regcomp/pcre like the previous commit because it may contain regex
special characters, so we need to quote the regex first.

To avoid misquote traps that could lead to undefined behavior, we
always stick to basic regex engine in this case. We don't need fancy
features for grepping a literal string anyway.

basic_regex_quote_buf() assumes that if the pattern is in a multibyte
encoding, ascii chars must be unambiguously encoded as single
bytes. This is true at least for UTF-8. For others, let's wait until
people yell up. Chances are nobody uses multibyte, non utf-8 charsets
anymore.

Noticed-by: Plamen Totev <plamen.totev@abv.bg>
Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Eric Sunshine <sunshine@sunshineco.com>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
grep.c
quote.c
quote.h
t/t7812-grep-icase-non-ascii.sh
diff --git a/grep.c b/grep.c
index 451275d2980428a8891e0fcf9f8f15aa543f16b1..627ae3e3e816998e30ff771d4c12abd0a1627122 100644 (file)
--- a/grep.c
+++ b/grep.c
@@ -5,6 +5,7 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "commit.h"
+#include "quote.h"
 
 static int grep_source_load(struct grep_source *gs);
 static int grep_source_is_binary(struct grep_source *gs);
@@ -397,6 +398,28 @@ static int is_fixed(const char *s, size_t len)
        return 1;
 }
 
+static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
+{
+       struct strbuf sb = STRBUF_INIT;
+       int err;
+       int regflags;
+
+       basic_regex_quote_buf(&sb, p->pattern);
+       regflags = opt->regflags & ~REG_EXTENDED;
+       if (opt->ignore_case)
+               regflags |= REG_ICASE;
+       err = regcomp(&p->regexp, sb.buf, regflags);
+       if (opt->debug)
+               fprintf(stderr, "fixed %s\n", sb.buf);
+       strbuf_release(&sb);
+       if (err) {
+               char errbuf[1024];
+               regerror(err, &p->regexp, errbuf, sizeof(errbuf));
+               regfree(&p->regexp);
+               compile_regexp_failed(p, errbuf);
+       }
+}
+
 static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
 {
        int icase, ascii_only;
@@ -407,8 +430,20 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
        icase          = opt->regflags & REG_ICASE || p->ignore_case;
        ascii_only     = !has_non_ascii(p->pattern);
 
+       /*
+        * Even when -F (fixed) asks us to do a non-regexp search, we
+        * may not be able to correctly case-fold when -i
+        * (ignore-case) is asked (in which case, we'll synthesize a
+        * regexp to match the pattern that matches regexp special
+        * characters literally, while ignoring case differences).  On
+        * the other hand, even without -F, if the pattern does not
+        * have any regexp special characters and there is no need for
+        * case-folding search, we can internally turn it into a
+        * simple string match using kws.  p->fixed tells us if we
+        * want to use kws.
+        */
        if (opt->fixed)
-               p->fixed = 1;
+               p->fixed = !icase || ascii_only;
        else if ((!icase || ascii_only) &&
                 is_fixed(p->pattern, p->patternlen))
                p->fixed = 1;
@@ -423,6 +458,14 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
                kwsincr(p->kws, p->pattern, p->patternlen);
                kwsprep(p->kws);
                return;
+       } else if (opt->fixed) {
+               /*
+                * We come here when the pattern has the non-ascii
+                * characters we cannot case-fold, and asked to
+                * ignore-case.
+                */
+               compile_fixed_regexp(p, opt);
+               return;
        }
 
        if (opt->pcre) {
diff --git a/quote.c b/quote.c
index fe884d24521f91a0f1f30b3cf5f08734bab7bd31..c67adb718c915fa58f568968118d2bfea3d9cc28 100644 (file)
--- a/quote.c
+++ b/quote.c
@@ -440,3 +440,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src)
        }
        strbuf_addch(sb, '"');
 }
+
+void basic_regex_quote_buf(struct strbuf *sb, const char *src)
+{
+       char c;
+
+       if (*src == '^') {
+               /* only beginning '^' is special and needs quoting */
+               strbuf_addch(sb, '\\');
+               strbuf_addch(sb, *src++);
+       }
+       if (*src == '*')
+               /* beginning '*' is not special, no quoting */
+               strbuf_addch(sb, *src++);
+
+       while ((c = *src++)) {
+               switch (c) {
+               case '[':
+               case '.':
+               case '\\':
+               case '*':
+                       strbuf_addch(sb, '\\');
+                       strbuf_addch(sb, c);
+                       break;
+
+               case '$':
+                       /* only the end '$' is special and needs quoting */
+                       if (*src == '\0')
+                               strbuf_addch(sb, '\\');
+                       strbuf_addch(sb, c);
+                       break;
+
+               default:
+                       strbuf_addch(sb, c);
+                       break;
+               }
+       }
+}
diff --git a/quote.h b/quote.h
index 99e04d34bf223a0147e65623322ef2c2fb01e01d..362d315bec6933de95a506920405a55af9015e88 100644 (file)
--- a/quote.h
+++ b/quote.h
@@ -67,5 +67,6 @@ extern char *quote_path_relative(const char *in, const char *prefix,
 extern void perl_quote_buf(struct strbuf *sb, const char *src);
 extern void python_quote_buf(struct strbuf *sb, const char *src);
 extern void tcl_quote_buf(struct strbuf *sb, const char *src);
+extern void basic_regex_quote_buf(struct strbuf *sb, const char *src);
 
 #endif
index b78a774dab79cf54fcb47daa059bc5f78e76a4af..1929809d4a71f13d6cdf024a5ac63eed18c884fb 100755 (executable)
@@ -20,4 +20,30 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' '
        git grep -i "TILRAUN: HALLÓ HEIMUR!"
 '
 
+test_expect_success REGEX_LOCALE 'grep literal string, with -F' '
+       git grep --debug -i -F "TILRAUN: Halló Heimur!"  2>&1 >/dev/null |
+                grep fixed >debug1 &&
+       test_write_lines "fixed TILRAUN: Halló Heimur!" >expect1 &&
+       test_cmp expect1 debug1 &&
+
+       git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!"  2>&1 >/dev/null |
+                grep fixed >debug2 &&
+       test_write_lines "fixed TILRAUN: HALLÓ HEIMUR!" >expect2 &&
+       test_cmp expect2 debug2
+'
+
+test_expect_success REGEX_LOCALE 'grep string with regex, with -F' '
+       test_write_lines "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file &&
+
+       git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null |
+                grep fixed >debug1 &&
+       test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 &&
+       test_cmp expect1 debug1 &&
+
+       git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$"  2>&1 >/dev/null |
+                grep fixed >debug2 &&
+       test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 &&
+       test_cmp expect2 debug2
+'
+
 test_done