From: Junio C Hamano <gitster@pobox.com>
Date: Fri, 7 Dec 2012 22:10:56 +0000 (-0800)
Subject: Merge branch 'jc/same-encoding' into maint
X-Git-Tag: v1.8.1-rc1~2^2~2
X-Git-Url: https://git.lorimer.id.au/gitweb.git/diff_plain/fff26a68053022a2cbc39142c7c6f6d016837058?ds=inline;hp=-c

Merge branch 'jc/same-encoding' into maint

Various codepaths checked if two encoding names are the same using
ad-hoc code and some of them ended up asking iconv() to convert
between "utf8" and "UTF-8".  The former is not a valid way to spell
the encoding name, but often people use it by mistake, and we
equated them in some but not all codepaths. Introduce a new helper
function to make these codepaths consistent.

* jc/same-encoding:
reencode_string(): introduce and use same_encoding()
---

fff26a68053022a2cbc39142c7c6f6d016837058
diff --combined builtin/mailinfo.c
index da231400b3,90b158d4f5..24a772d8e1
--- a/builtin/mailinfo.c
+++ b/builtin/mailinfo.c
@@@ -477,13 -477,37 +477,14 @@@ static struct strbuf *decode_b_segment(
  	return out;
  }
  
 -/*
 - * When there is no known charset, guess.
 - *
 - * Right now we assume that if the target is UTF-8 (the default),
 - * and it already looks like UTF-8 (which includes US-ASCII as its
 - * subset, of course) then that is what it is and there is nothing
 - * to do.
 - *
 - * Otherwise, we default to assuming it is Latin1 for historical
 - * reasons.
 - */
 -static const char *guess_charset(const struct strbuf *line, const char *target_charset)
 -{
 -	if (is_encoding_utf8(target_charset)) {
 -		if (is_utf8(line->buf))
 -			return NULL;
 -	}
 -	return "ISO8859-1";
 -}
 -
  static void convert_to_utf8(struct strbuf *line, const char *charset)
  {
  	char *out;
  
 -	if (!charset || !*charset) {
 -		charset = guess_charset(line, metainfo_charset);
 -		if (!charset)
 -			return;
 -	}
 +	if (!charset || !*charset)
 +		return;
- 	if (!strcasecmp(metainfo_charset, charset))
+ 
+ 	if (same_encoding(metainfo_charset, charset))
  		return;
  	out = reencode_string(line->buf, metainfo_charset, charset);
  	if (!out)
diff --combined notes.c
index bc454e1eab,e48f6604d8..ee8f01f1d5
--- a/notes.c
+++ b/notes.c
@@@ -1196,18 -1196,8 +1196,18 @@@ void free_notes(struct notes_tree *t
  	memset(t, 0, sizeof(struct notes_tree));
  }
  
 -void format_note(struct notes_tree *t, const unsigned char *object_sha1,
 -		struct strbuf *sb, const char *output_encoding, int flags)
 +/*
 + * Fill the given strbuf with the notes associated with the given object.
 + *
 + * If the given notes_tree structure is not initialized, it will be auto-
 + * initialized to the default value (see documentation for init_notes() above).
 + * If the given notes_tree is NULL, the internal/default notes_tree will be
 + * used instead.
 + *
 + * 'flags' is a bitwise combination of the flags for format_display_notes.
 + */
 +static void format_note(struct notes_tree *t, const unsigned char *object_sha1,
 +			struct strbuf *sb, const char *output_encoding, int flags)
  {
  	static const char utf8[] = "utf-8";
  	const unsigned char *sha1;
@@@ -1231,7 -1221,7 +1231,7 @@@
  	}
  
  	if (output_encoding && *output_encoding &&
- 			strcmp(utf8, output_encoding)) {
+ 	    !is_encoding_utf8(output_encoding)) {
  		char *reencoded = reencode_string(msg, output_encoding, utf8);
  		if (reencoded) {
  			free(msg);
diff --combined pretty.c
index 413e7587b6,e87fe9fec3..dba682828c
--- a/pretty.c
+++ b/pretty.c
@@@ -231,7 -231,7 +231,7 @@@ static int is_rfc822_special(char ch
  	}
  }
  
 -static int has_rfc822_specials(const char *s, int len)
 +static int needs_rfc822_quoting(const char *s, int len)
  {
  	int i;
  	for (i = 0; i < len; i++)
@@@ -240,17 -240,6 +240,17 @@@
  	return 0;
  }
  
 +static int last_line_length(struct strbuf *sb)
 +{
 +	int i;
 +
 +	/* How many bytes are already used on the last line? */
 +	for (i = sb->len - 1; i >= 0; i--)
 +		if (sb->buf[i] == '\n')
 +			break;
 +	return sb->len - (i + 1);
 +}
 +
  static void add_rfc822_quoted(struct strbuf *out, const char *s, int len)
  {
  	int i;
@@@ -272,110 -261,57 +272,110 @@@
  	strbuf_addch(out, '"');
  }
  
 -static int is_rfc2047_special(char ch)
 +enum rfc2047_type {
 +	RFC2047_SUBJECT,
 +	RFC2047_ADDRESS,
 +};
 +
 +static int is_rfc2047_special(char ch, enum rfc2047_type type)
  {
 -	return (non_ascii(ch) || (ch == '=') || (ch == '?') || (ch == '_'));
 +	/*
 +	 * rfc2047, section 4.2:
 +	 *
 +	 *    8-bit values which correspond to printable ASCII characters other
 +	 *    than "=", "?", and "_" (underscore), MAY be represented as those
 +	 *    characters.  (But see section 5 for restrictions.)  In
 +	 *    particular, SPACE and TAB MUST NOT be represented as themselves
 +	 *    within encoded words.
 +	 */
 +
 +	/*
 +	 * rule out non-ASCII characters and non-printable characters (the
 +	 * non-ASCII check should be redundant as isprint() is not localized
 +	 * and only knows about ASCII, but be defensive about that)
 +	 */
 +	if (non_ascii(ch) || !isprint(ch))
 +		return 1;
 +
 +	/*
 +	 * rule out special printable characters (' ' should be the only
 +	 * whitespace character considered printable, but be defensive and use
 +	 * isspace())
 +	 */
 +	if (isspace(ch) || ch == '=' || ch == '?' || ch == '_')
 +		return 1;
 +
 +	/*
 +	 * rfc2047, section 5.3:
 +	 *
 +	 *    As a replacement for a 'word' entity within a 'phrase', for example,
 +	 *    one that precedes an address in a From, To, or Cc header.  The ABNF
 +	 *    definition for 'phrase' from RFC 822 thus becomes:
 +	 *
 +	 *    phrase = 1*( encoded-word / word )
 +	 *
 +	 *    In this case the set of characters that may be used in a "Q"-encoded
 +	 *    'encoded-word' is restricted to: <upper and lower case ASCII
 +	 *    letters, decimal digits, "!", "*", "+", "-", "/", "=", and "_"
 +	 *    (underscore, ASCII 95.)>.  An 'encoded-word' that appears within a
 +	 *    'phrase' MUST be separated from any adjacent 'word', 'text' or
 +	 *    'special' by 'linear-white-space'.
 +	 */
 +
 +	if (type != RFC2047_ADDRESS)
 +		return 0;
 +
 +	/* '=' and '_' are special cases and have been checked above */
 +	return !(isalnum(ch) || ch == '!' || ch == '*' || ch == '+' || ch == '-' || ch == '/');
  }
  
 -static void add_rfc2047(struct strbuf *sb, const char *line, int len,
 -		       const char *encoding)
 +static int needs_rfc2047_encoding(const char *line, int len,
 +				  enum rfc2047_type type)
  {
 -	static const int max_length = 78; /* per rfc2822 */
  	int i;
 -	int line_len;
 -
 -	/* How many bytes are already used on the current line? */
 -	for (i = sb->len - 1; i >= 0; i--)
 -		if (sb->buf[i] == '\n')
 -			break;
 -	line_len = sb->len - (i+1);
  
  	for (i = 0; i < len; i++) {
  		int ch = line[i];
  		if (non_ascii(ch) || ch == '\n')
 -			goto needquote;
 +			return 1;
  		if ((i + 1 < len) && (ch == '=' && line[i+1] == '?'))
 -			goto needquote;
 +			return 1;
  	}
 -	strbuf_add_wrapped_bytes(sb, line, len, 0, 1, max_length - line_len);
 -	return;
  
 -needquote:
 +	return 0;
 +}
 +
 +static void add_rfc2047(struct strbuf *sb, const char *line, int len,
 +		       const char *encoding, enum rfc2047_type type)
 +{
 +	static const int max_encoded_length = 76; /* per rfc2047 */
 +	int i;
 +	int line_len = last_line_length(sb);
 +
  	strbuf_grow(sb, len * 3 + strlen(encoding) + 100);
  	strbuf_addf(sb, "=?%s?q?", encoding);
  	line_len += strlen(encoding) + 5; /* 5 for =??q? */
  	for (i = 0; i < len; i++) {
  		unsigned ch = line[i] & 0xFF;
 +		int is_special = is_rfc2047_special(ch, type);
 +
 +		/*
 +		 * According to RFC 2047, we could encode the special character
 +		 * ' ' (space) with '_' (underscore) for readability. But many
 +		 * programs do not understand this and just leave the
 +		 * underscore in place. Thus, we do nothing special here, which
 +		 * causes ' ' to be encoded as '=20', avoiding this problem.
 +		 */
  
 -		if (line_len >= max_length - 2) {
 +		if (line_len + 2 + (is_special ? 3 : 1) > max_encoded_length) {
  			strbuf_addf(sb, "?=\n =?%s?q?", encoding);
  			line_len = strlen(encoding) + 5 + 1; /* =??q? plus SP */
  		}
  
 -		/*
 -		 * We encode ' ' using '=20' even though rfc2047
 -		 * allows using '_' for readability.  Unfortunately,
 -		 * many programs do not understand this and just
 -		 * leave the underscore in place.
 -		 */
 -		if (is_rfc2047_special(ch) || ch == ' ' || ch == '\n') {
 +		if (is_special) {
  			strbuf_addf(sb, "=%02X", ch);
  			line_len += 3;
 -		}
 -		else {
 +		} else {
  			strbuf_addch(sb, ch);
  			line_len++;
  		}
@@@ -387,7 -323,6 +387,7 @@@ void pp_user_info(const struct pretty_p
  		  const char *what, struct strbuf *sb,
  		  const char *line, const char *encoding)
  {
 +	int max_length = 78; /* per rfc2822 */
  	char *date;
  	int namelen;
  	unsigned long time;
@@@ -405,27 -340,25 +405,27 @@@
  	if (pp->fmt == CMIT_FMT_EMAIL) {
  		char *name_tail = strchr(line, '<');
  		int display_name_length;
 -		int final_line;
  		if (!name_tail)
  			return;
  		while (line < name_tail && isspace(name_tail[-1]))
  			name_tail--;
  		display_name_length = name_tail - line;
  		strbuf_addstr(sb, "From: ");
 -		if (!has_rfc822_specials(line, display_name_length)) {
 -			add_rfc2047(sb, line, display_name_length, encoding);
 -		} else {
 +		if (needs_rfc2047_encoding(line, display_name_length, RFC2047_ADDRESS)) {
 +			add_rfc2047(sb, line, display_name_length,
 +						encoding, RFC2047_ADDRESS);
 +			max_length = 76; /* per rfc2047 */
 +		} else if (needs_rfc822_quoting(line, display_name_length)) {
  			struct strbuf quoted = STRBUF_INIT;
  			add_rfc822_quoted(&quoted, line, display_name_length);
 -			add_rfc2047(sb, quoted.buf, quoted.len, encoding);
 +			strbuf_add_wrapped_bytes(sb, quoted.buf, quoted.len,
 +							-6, 1, max_length);
  			strbuf_release(&quoted);
 +		} else {
 +			strbuf_add_wrapped_bytes(sb, line, display_name_length,
 +							-6, 1, max_length);
  		}
 -		for (final_line = 0; final_line < sb->len; final_line++)
 -			if (sb->buf[sb->len - final_line - 1] == '\n')
 -				break;
 -		if (namelen - display_name_length + final_line > 78) {
 +		if (namelen - display_name_length + last_line_length(sb) > max_length) {
  			strbuf_addch(sb, '\n');
  			if (!isspace(name_tail[0]))
  				strbuf_addch(sb, ' ');
@@@ -571,7 -504,7 +571,7 @@@ char *logmsg_reencode(const struct comm
  		return NULL;
  	encoding = get_header(commit, "encoding");
  	use_encoding = encoding ? encoding : utf8;
- 	if (!strcmp(use_encoding, output_encoding))
+ 	if (same_encoding(use_encoding, output_encoding))
  		if (encoding) /* we'll strip encoding header later */
  			out = xstrdup(commit->buffer);
  		else
@@@ -1345,7 -1278,6 +1345,7 @@@ void pp_title_line(const struct pretty_
  		   const char *encoding,
  		   int need_8bit_cte)
  {
 +	static const int max_length = 78; /* per rfc2822 */
  	struct strbuf title;
  
  	strbuf_init(&title, 80);
@@@ -1355,12 -1287,7 +1355,12 @@@
  	strbuf_grow(sb, title.len + 1024);
  	if (pp->subject) {
  		strbuf_addstr(sb, pp->subject);
 -		add_rfc2047(sb, title.buf, title.len, encoding);
 +		if (needs_rfc2047_encoding(title.buf, title.len, RFC2047_SUBJECT))
 +			add_rfc2047(sb, title.buf, title.len,
 +						encoding, RFC2047_SUBJECT);
 +		else
 +			strbuf_add_wrapped_bytes(sb, title.buf, title.len,
 +					 -last_line_length(sb), 1, max_length);
  	} else {
  		strbuf_addbuf(sb, &title);
  	}
diff --combined sequencer.c
index e3723d2095,f2f5b137ea..73c396bd89
--- a/sequencer.c
+++ b/sequencer.c
@@@ -17,9 -17,7 +17,9 @@@
  
  #define GIT_REFLOG_ACTION "GIT_REFLOG_ACTION"
  
 -void remove_sequencer_state(void)
 +const char sign_off_header[] = "Signed-off-by: ";
 +
 +static void remove_sequencer_state(void)
  {
  	struct strbuf seq_dir = STRBUF_INIT;
  
@@@ -60,7 -58,7 +60,7 @@@ static int get_message(struct commit *c
  
  	out->reencoded_message = NULL;
  	out->message = commit->buffer;
- 	if (strcmp(encoding, git_commit_encoding))
+ 	if (!same_encoding(encoding, git_commit_encoding))
  		out->reencoded_message = reencode_string(commit->buffer,
  					git_commit_encoding, encoding);
  	if (out->reencoded_message)
@@@ -235,9 -233,6 +235,9 @@@ static int do_recursive_merge(struct co
  		die(_("%s: Unable to write new index file"), action_name(opts));
  	rollback_lock_file(&index_lock);
  
 +	if (opts->signoff)
 +		append_signoff(msgbuf, 0);
 +
  	if (!clean) {
  		int i;
  		strbuf_addstr(msgbuf, "\nConflicts:\n");
@@@ -316,9 -311,6 +316,9 @@@ static int run_git_commit(const char *d
  	if (allow_empty)
  		argv_array_push(&array, "--allow-empty");
  
 +	if (opts->allow_empty_message)
 +		argv_array_push(&array, "--allow-empty-message");
 +
  	rc = run_command_v_opt(array.argv, RUN_GIT_CMD);
  	argv_array_clear(&array);
  	return rc;
@@@ -1016,63 -1008,3 +1016,63 @@@ int sequencer_pick_revisions(struct rep
  	save_opts(opts);
  	return pick_commits(todo_list, opts);
  }
 +
 +static int ends_rfc2822_footer(struct strbuf *sb, int ignore_footer)
 +{
 +	int ch;
 +	int hit = 0;
 +	int i, j, k;
 +	int len = sb->len - ignore_footer;
 +	int first = 1;
 +	const char *buf = sb->buf;
 +
 +	for (i = len - 1; i > 0; i--) {
 +		if (hit && buf[i] == '\n')
 +			break;
 +		hit = (buf[i] == '\n');
 +	}
 +
 +	while (i < len - 1 && buf[i] == '\n')
 +		i++;
 +
 +	for (; i < len; i = k) {
 +		for (k = i; k < len && buf[k] != '\n'; k++)
 +			; /* advance k to the end of this line */
 +		k++;
 +
 +		if (k < len && (buf[k] == ' ' || buf[k] == '\t') && !first)
 +			continue;
 +
 +		first = 0;
 +
 +		for (j = 0; i + j < len; j++) {
 +			ch = buf[i + j];
 +			if (ch == ':')
 +				break;
 +			if (isalnum(ch) ||
 +			    (ch == '-'))
 +				continue;
 +			return 0;
 +		}
 +	}
 +	return 1;
 +}
 +
 +void append_signoff(struct strbuf *msgbuf, int ignore_footer)
 +{
 +	struct strbuf sob = STRBUF_INIT;
 +	int i;
 +
 +	strbuf_addstr(&sob, sign_off_header);
 +	strbuf_addstr(&sob, fmt_name(getenv("GIT_COMMITTER_NAME"),
 +				getenv("GIT_COMMITTER_EMAIL")));
 +	strbuf_addch(&sob, '\n');
 +	for (i = msgbuf->len - 1 - ignore_footer; i > 0 && msgbuf->buf[i - 1] != '\n'; i--)
 +		; /* do nothing */
 +	if (prefixcmp(msgbuf->buf + i, sob.buf)) {
 +		if (!i || !ends_rfc2822_footer(msgbuf, ignore_footer))
 +			strbuf_splice(msgbuf, msgbuf->len - ignore_footer, 0, "\n", 1);
 +		strbuf_splice(msgbuf, msgbuf->len - ignore_footer, 0, sob.buf, sob.len);
 +	}
 +	strbuf_release(&sob);
 +}
diff --combined utf8.c
index 28791a7c31,6a52834576..5c61bbe113
--- a/utf8.c
+++ b/utf8.c
@@@ -353,7 -353,7 +353,7 @@@ retry
  
  		c = *text;
  		if (!c || isspace(c)) {
 -			if (w < width || !space) {
 +			if (w <= width || !space) {
  				const char *start = bol;
  				if (!c && text == start)
  					return w;
@@@ -423,6 -423,13 +423,13 @@@ int is_encoding_utf8(const char *name
  	return 0;
  }
  
+ int same_encoding(const char *src, const char *dst)
+ {
+ 	if (is_encoding_utf8(src) && is_encoding_utf8(dst))
+ 		return 1;
+ 	return !strcasecmp(src, dst);
+ }
+ 
  /*
   * Given a buffer and its encoding, return it re-encoded
   * with iconv.  If the conversion fails, returns NULL.