Merge branch 'lt/commit-tree-guess-utf-8'
author Junio C Hamano <gitster@pobox.com>
Fri, 7 Sep 2012 18:08:38 +0000 (11:08 -0700)
committer Junio C Hamano <gitster@pobox.com>
Fri, 7 Sep 2012 18:08:38 +0000 (11:08 -0700)
Teach "git commit" and "git commit-tree" the "we are told to use
utf-8 in log message, but this does not look like utf-8---attempt to
pass it through convert-from-latin1-to-utf8 and see if it makes
sense" heuristics "git mailinfo" already uses.

* lt/commit-tree-guess-utf-8:
commit/commit-tree: correct latin1 to utf-8

1  2 
commit.c
diff --combined commit.c
index 42af4c1f238b96eeb9e88b227b6f6c10ef317058,1360bbd2cb7d9c83d128fd301bcee4763ac8f054..87268682f98f0cd0f18221b3647ad30dbcbc560e
+++ b/commit.c
@@@ -68,7 -68,7 +68,7 @@@ struct commit *lookup_commit_reference_
        unsigned char sha1[20];
        struct commit *commit;
  
 -      if (get_sha1(name, sha1))
 +      if (get_sha1_committish(name, sha1))
                return NULL;
        commit = lookup_commit_reference(sha1);
        if (!commit || parse_commit(commit))
@@@ -1112,8 -1112,92 +1112,92 @@@ int commit_tree(const struct strbuf *ms
        return result;
  }
  
+ static int find_invalid_utf8(const char *buf, int len)
+ {     /*
+        * Scan the first 'len' bytes of 'buf' for a malformed UTF-8
+        * sequence.  Returns the offset (relative to 'buf') of the lead
+        * byte of the first such sequence, or -1 if none was found.
+        * Only the structure is checked: the lead byte's bit pattern,
+        * whether enough bytes remain, and the 10xxxxxx form of each
+        * continuation byte.
+        */
+       int offset = 0;
+       while (len) {
+               unsigned char c = *buf++;
+               int bytes, bad_offset;
+               len--;
+               offset++;
+               /* Simple US-ASCII? No worries. */
+               if (c < 0x80)
+                       continue;
+               bad_offset = offset-1; /* offset of this lead byte; 'offset' was already advanced past it */
+               /*
+                * Count how many more high bits set: that's how
+                * many more bytes this sequence should have.
+                * 'c' is an unsigned char, so each shift drops the top
+                * bit and the loop ends at the first clear bit below
+                * the leading 1.
+                */
+               bytes = 0;
+               while (c & 0x40) {
+                       c <<= 1;
+                       bytes++;
+               }
+               /*
+                * Must be between 1 and 5 more bytes.  Zero means the
+                * byte is itself a stray continuation byte (10xxxxxx);
+                * more than five means it was 0xfe or 0xff.
+                */
+               if (bytes < 1 || bytes > 5)
+                       return bad_offset;
+               /* Do we *have* that many bytes? */
+               if (len < bytes)
+                       return bad_offset;
+               offset += bytes;
+               len -= bytes;
+               /* And verify that they are good continuation bytes */
+               do {
+                       if ((*buf++ & 0xc0) != 0x80)
+                               return bad_offset;
+               } while (--bytes);
+               /* We could/should check the value and length here too */
+       }
+       return -1; /* reached the end without finding a malformed sequence */
+ }
+ /*
+  * This verifies that the buffer is in proper utf8 format.
+  *
+  * If it isn't, it assumes any non-utf8 characters are Latin1,
+  * and does the conversion.
+  *
+  * The conversion happens in place, one offending byte at a time:
+  * the byte flagged by find_invalid_utf8() is taken out with
+  * strbuf_remove(), a two-byte sequence is put in its place with
+  * strbuf_insert(), and scanning resumes right after those two bytes.
+  *
+  * Returns 1 if the buffer needed no change, 0 if at least one byte
+  * was converted.
+  *
+  * Fixme: we should probably also disallow overlong forms and
+  * invalid characters. But we don't do that currently.
+  */
+ static int verify_utf8(struct strbuf *buf)
+ {
+       int ok = 1;
+       long pos = 0;
+       for (;;) {
+               int bad;
+               unsigned char c;
+               unsigned char replace[2];
+               bad = find_invalid_utf8(buf->buf + pos, buf->len - pos);
+               if (bad < 0)
+                       return ok;
+               pos += bad; /* 'bad' is relative to buf->buf + pos */
+               ok = 0;
+               c = buf->buf[pos];
+               strbuf_remove(buf, pos, 1);
+               /*
+                * We know 'c' must be in the range 128-255, so c >> 6
+                * is 2 or 3 and the lead byte below is 0xc2 or 0xc3.
+                */
+               replace[0] = 0xc0 + (c >> 6);
+               replace[1] = 0x80 + (c & 0x3f);
+               strbuf_insert(buf, pos, replace, 2);
+               pos += 2; /* step over the two bytes just inserted */
+       }
+ }
  /* Warning text written to stderr when the UTF-8 check of the commit buffer fails. */
  static const char commit_utf8_warn[] =
- "Warning: commit message does not conform to UTF-8.\n"
+ "Warning: commit message did not conform to UTF-8.\n"
  "You may want to amend it after fixing the message, or set the config\n"
  "variable i18n.commitencoding to the encoding your project uses.\n";
  
@@@ -1170,7 -1254,7 +1254,7 @@@ int commit_tree_extended(const struct s
        strbuf_addbuf(&buffer, msg);
  
        /* And check the encoding */
-       if (encoding_is_utf8 && !is_utf8(buffer.buf))
+       if (encoding_is_utf8 && !verify_utf8(&buffer))
                fprintf(stderr, commit_utf8_warn);
  
        if (sign_commit && do_sign_commit(&buffer, sign_commit))