From: Junio C Hamano <gitster@pobox.com>
Date: Fri, 7 Sep 2012 18:08:38 +0000 (-0700)
Subject: Merge branch 'lt/commit-tree-guess-utf-8'
X-Git-Tag: v1.8.0-rc0~99
X-Git-Url: https://git.lorimer.id.au/gitweb.git/diff_plain/ae80b5a8928770a2b56c390d13bd3c48ebd2255f?ds=inline;hp=-c

Merge branch 'lt/commit-tree-guess-utf-8'

Teach "git commit" and "git commit-tree" the "we are told to use
utf-8 in log message, but this does not look like utf-8---attempt to
pass it through convert-from-latin1-to-utf8 and see if it makes
sense" heuristics "git mailinfo" already uses.

* lt/commit-tree-guess-utf-8:
commit/commit-tree: correct latin1 to utf-8
---

ae80b5a8928770a2b56c390d13bd3c48ebd2255f
diff --combined commit.c
index 42af4c1f23,1360bbd2cb..87268682f9
--- a/commit.c
+++ b/commit.c
@@@ -68,7 -68,7 +68,7 @@@ struct commit *lookup_commit_reference_
  	unsigned char sha1[20];
  	struct commit *commit;
  
 -	if (get_sha1(name, sha1))
 +	if (get_sha1_committish(name, sha1))
  		return NULL;
  	commit = lookup_commit_reference(sha1);
  	if (!commit || parse_commit(commit))
@@@ -1112,8 -1112,92 +1112,92 @@@ int commit_tree(const struct strbuf *ms
  	return result;
  }
  
+ static int find_invalid_utf8(const char *buf, int len)
+ {	/*
+ 	 * Scan the first 'len' bytes at 'buf' and return the offset
+ 	 * (relative to 'buf') of the lead byte of the first sequence
+ 	 * that fails the structural UTF-8 checks below, or -1 when
+ 	 * every byte passes.  Bytes below 0x80, including NUL, are
+ 	 * accepted as-is; the scan is bounded by 'len' only.
+ 	 */
+ 	int offset = 0;	/* bytes of 'buf' accounted for so far */
+ 
+ 	while (len) {
+ 		unsigned char c = *buf++;
+ 		int bytes, bad_offset;
+ 
+ 		len--;		/* the lead byte has been consumed ... */
+ 		offset++;	/* ... so 'offset' now points just past it */
+ 
+ 		/* Simple US-ASCII? No worries. */
+ 		if (c < 0x80)
+ 			continue;
+ 
+ 		bad_offset = offset-1;	/* position of the lead byte read above */
+ 
+ 		/*
+ 		 * Count how many more high bits set: that's how
+ 		 * many more bytes this sequence should have.
+ 		 *
+ 		 * Bit 7 is known to be set here; each set bit from
+ 		 * bit 6 downwards adds one expected continuation byte.
+ 		 * 'c' is an unsigned char, so with 8-bit bytes the
+ 		 * left shift discards the top bit and the loop ends
+ 		 * after at most 7 rounds (for 0xff).
+ 		 */
+ 		bytes = 0;
+ 		while (c & 0x40) {
+ 			c <<= 1;
+ 			bytes++;
+ 		}
+ 
+ 		/*
+ 		 * Must be between 1 and 5 more bytes.
+ 		 *
+ 		 * bytes == 0 means the byte was 10xxxxxx, i.e. a
+ 		 * continuation byte with no lead byte before it;
+ 		 * bytes > 5 means the byte was 0xfe or 0xff.
+ 		 */
+ 		if (bytes < 1 || bytes > 5)
+ 			return bad_offset;
+ 
+ 		/* Do we *have* that many bytes? (a truncated tail is invalid) */
+ 		if (len < bytes)
+ 			return bad_offset;
+ 
+ 		offset += bytes;	/* accounted for up front; 'bytes' is */
+ 		len -= bytes;		/* counted down to zero by the loop below */
+ 
+ 		/*
+ 		 * And verify that they are good continuation bytes,
+ 		 * i.e. of the form 10xxxxxx.  'bytes' is at least 1
+ 		 * here, so the do/while never runs with a zero count.
+ 		 */
+ 		do {
+ 			if ((*buf++ & 0xc0) != 0x80)
+ 				return bad_offset;
+ 		} while (--bytes);
+ 
+ 		/*
+ 		 * We could/should check the value and length here too
+ 		 * (as written, overlong encodings and out-of-range
+ 		 * code points are not rejected).
+ 		 */
+ 	}
+ 	return -1;	/* reached the end without finding a bad sequence */
+ }
+ 
+ /*
+  * This verifies that the buffer is in proper utf8 format.
+  *
+  * If it isn't, it assumes any non-utf8 characters are Latin1,
+  * and does the conversion.
+  *
+  * The buffer is modified in place: each byte whose offset is
+  * reported by find_invalid_utf8() is replaced by the two-byte
+  * UTF-8 encoding of the code point with the same numeric value,
+  * and scanning then resumes right after the replacement.  Only
+  * that single byte is converted per round; the bytes following
+  * it are examined again by the next scan.
+  *
+  * Returns 1 if nothing had to be converted, 0 if at least one
+  * byte was rewritten.
+  *
+  * NOTE(review): the remaining length is handed to
+  * find_invalid_utf8() as an int while 'pos' is a long; the type
+  * of buf->len is not visible here -- confirm that very large
+  * buffers cannot overflow the int.
+  *
+  * Fixme: we should probably also disallow overlong forms and
+  * invalid characters. But we don't do that currently.
+  */
+ static int verify_utf8(struct strbuf *buf)
+ {
+ 	int ok = 1;	/* cleared as soon as one byte needs converting */
+ 	long pos = 0;	/* everything before 'pos' has already been checked */
+ 
+ 	for (;;) {
+ 		int bad;
+ 		unsigned char c;
+ 		unsigned char replace[2];
+ 
+ 		bad = find_invalid_utf8(buf->buf + pos, buf->len - pos);
+ 		if (bad < 0)
+ 			return ok;	/* the rest of the buffer passed the check */
+ 		pos += bad;	/* 'bad' is relative to the scan start; make it absolute */
+ 		ok = 0;
+ 		c = buf->buf[pos];
+ 		strbuf_remove(buf, pos, 1);	/* presumably drops the one offending byte; helper defined elsewhere */
+ 
+ 		/*
+ 		 * We know 'c' must be in the range 128-255, because
+ 		 * find_invalid_utf8() only ever reports the offset of
+ 		 * a byte that is >= 0x80.  Hence c >> 6 is 2 or 3 and
+ 		 * the lead byte built below is 0xc2 or 0xc3, followed
+ 		 * by one continuation byte carrying the low six bits.
+ 		 */
+ 		replace[0] = 0xc0 + (c >> 6);
+ 		replace[1] = 0x80 + (c & 0x3f);
+ 		strbuf_insert(buf, pos, replace, 2);	/* presumably puts the pair where the byte was */
+ 		pos += 2;	/* resume scanning after the two bytes just written */
+ 	}
+ }
+ 
  static const char commit_utf8_warn[] =
- "Warning: commit message does not conform to UTF-8.\n"
+ "Warning: commit message did not conform to UTF-8.\n"
  "You may want to amend it after fixing the message, or set the config\n"
  "variable i18n.commitencoding to the encoding your project uses.\n";
  
@@@ -1170,7 -1254,7 +1254,7 @@@ int commit_tree_extended(const struct s
  	strbuf_addbuf(&buffer, msg);
  
  	/* And check the encoding */
- 	if (encoding_is_utf8 && !is_utf8(buffer.buf))
+ 	if (encoding_is_utf8 && !verify_utf8(&buffer))
  		fprintf(stderr, commit_utf8_warn);
  
  	if (sign_commit && do_sign_commit(&buffer, sign_commit))