commit/commit-tree: correct latin1 to utf-8
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Jun 2012 18:24:14 +0000 (11:24 -0700)
committerJunio C Hamano <gitster@pobox.com>
Tue, 21 Aug 2012 23:10:53 +0000 (16:10 -0700)
When a line in the message is not a valid utf-8, "git mailinfo"
attempts to convert it to utf-8 assuming the input is latin1 (and
punt if it does not convert cleanly). Using the same heuristics in
"git commit" and "git commit-tree" lets the editor output be in
latin1 to make the overall system more consistent.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
builtin/mailinfo.c
commit.c
index eaf9e157a3897c2442756911b906ff9d3ee2be90..dd4f925475fd2012822575a86e78dcd24c1ef8ef 100644 (file)
@@ -481,36 +481,12 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
        return out;
 }
 
        return out;
 }
 
-/*
- * When there is no known charset, guess.
- *
- * Right now we assume that if the target is UTF-8 (the default),
- * and it already looks like UTF-8 (which includes US-ASCII as its
- * subset, of course) then that is what it is and there is nothing
- * to do.
- *
- * Otherwise, we default to assuming it is Latin1 for historical
- * reasons.
- */
-static const char *guess_charset(const struct strbuf *line, const char *target_charset)
-{
-       if (is_encoding_utf8(target_charset)) {
-               if (is_utf8(line->buf))
-                       return NULL;
-       }
-       return "ISO8859-1";
-}
-
 static void convert_to_utf8(struct strbuf *line, const char *charset)
 {
        char *out;
 
 static void convert_to_utf8(struct strbuf *line, const char *charset)
 {
        char *out;
 
-       if (!charset || !*charset) {
-               charset = guess_charset(line, metainfo_charset);
-               if (!charset)
-                       return;
-       }
-
+       if (!charset || !*charset)
+               return;
        if (!strcasecmp(metainfo_charset, charset))
                return;
        out = reencode_string(line->buf, metainfo_charset, charset);
        if (!strcasecmp(metainfo_charset, charset))
                return;
        out = reencode_string(line->buf, metainfo_charset, charset);
index 8248a994a50ab91874600567e544aa3735e4aa98..1360bbd2cb7d9c83d128fd301bcee4763ac8f054 100644 (file)
--- a/commit.c
+++ b/commit.c
@@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
        return result;
 }
 
        return result;
 }
 
+static int find_invalid_utf8(const char *buf, int len)
+{
+       int offset = 0;
+
+       while (len) {
+               unsigned char c = *buf++;
+               int bytes, bad_offset;
+
+               len--;
+               offset++;
+
+               /* Simple US-ASCII? No worries. */
+               if (c < 0x80)
+                       continue;
+
+               bad_offset = offset-1;
+
+               /*
+                * Count how many more high bits set: that's how
+                * many more bytes this sequence should have.
+                */
+               bytes = 0;
+               while (c & 0x40) {
+                       c <<= 1;
+                       bytes++;
+               }
+
+               /* Must be between 1 and 5 more bytes */
+               if (bytes < 1 || bytes > 5)
+                       return bad_offset;
+
+               /* Do we *have* that many bytes? */
+               if (len < bytes)
+                       return bad_offset;
+
+               offset += bytes;
+               len -= bytes;
+
+               /* And verify that they are good continuation bytes */
+               do {
+                       if ((*buf++ & 0xc0) != 0x80)
+                               return bad_offset;
+               } while (--bytes);
+
+               /* We could/should check the value and length here too */
+       }
+       return -1;
+}
+
+/*
+ * This verifies that the buffer is in proper utf8 format.
+ *
+ * If it isn't, it assumes any non-utf8 characters are Latin1,
+ * and does the conversion.
+ *
+ * Fixme: we should probably also disallow overlong forms and
+ * invalid characters. But we don't do that currently.
+ */
+static int verify_utf8(struct strbuf *buf)
+{
+       int ok = 1;
+       long pos = 0;
+
+       for (;;) {
+               int bad;
+               unsigned char c;
+               unsigned char replace[2];
+
+               bad = find_invalid_utf8(buf->buf + pos, buf->len - pos);
+               if (bad < 0)
+                       return ok;
+               pos += bad;
+               ok = 0;
+               c = buf->buf[pos];
+               strbuf_remove(buf, pos, 1);
+
+               /* We know 'c' must be in the range 128-255 */
+               replace[0] = 0xc0 + (c >> 6);
+               replace[1] = 0x80 + (c & 0x3f);
+               strbuf_insert(buf, pos, replace, 2);
+               pos += 2;
+       }
+}
+
 static const char commit_utf8_warn[] =
 static const char commit_utf8_warn[] =
-"Warning: commit message does not conform to UTF-8.\n"
+"Warning: commit message did not conform to UTF-8.\n"
 "You may want to amend it after fixing the message, or set the config\n"
 "variable i18n.commitencoding to the encoding your project uses.\n";
 
 "You may want to amend it after fixing the message, or set the config\n"
 "variable i18n.commitencoding to the encoding your project uses.\n";
 
@@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
        strbuf_addbuf(&buffer, msg);
 
        /* And check the encoding */
        strbuf_addbuf(&buffer, msg);
 
        /* And check the encoding */
-       if (encoding_is_utf8 && !is_utf8(buffer.buf))
+       if (encoding_is_utf8 && !verify_utf8(&buffer))
                fprintf(stderr, commit_utf8_warn);
 
        if (sign_commit && do_sign_commit(&buffer, sign_commit))
                fprintf(stderr, commit_utf8_warn);
 
        if (sign_commit && do_sign_commit(&buffer, sign_commit))