From: Junio C Hamano Date: Fri, 7 Dec 2012 22:10:56 +0000 (-0800) Subject: Merge branch 'jc/same-encoding' into maint X-Git-Tag: v1.8.1-rc1~2^2~2 X-Git-Url: https://git.lorimer.id.au/gitweb.git/diff_plain/fff26a68053022a2cbc39142c7c6f6d016837058?ds=inline;hp=-c Merge branch 'jc/same-encoding' into maint Various codepaths checked if two encoding names are the same using ad-hoc code and some of them ended up asking iconv() to convert between "utf8" and "UTF-8". The former is not a valid way to spell the encoding name, but often people use it by mistake, and we equated them in some but not all codepaths. Introduce a new helper function to make these codepaths consistent. * jc/same-encoding: reencode_string(): introduce and use same_encoding() --- fff26a68053022a2cbc39142c7c6f6d016837058 diff --combined builtin/mailinfo.c index da231400b3,90b158d4f5..24a772d8e1 --- a/builtin/mailinfo.c +++ b/builtin/mailinfo.c @@@ -477,13 -477,37 +477,14 @@@ static struct strbuf *decode_b_segment( return out; } -/* - * When there is no known charset, guess. - * - * Right now we assume that if the target is UTF-8 (the default), - * and it already looks like UTF-8 (which includes US-ASCII as its - * subset, of course) then that is what it is and there is nothing - * to do. - * - * Otherwise, we default to assuming it is Latin1 for historical - * reasons. - */ -static const char *guess_charset(const struct strbuf *line, const char *target_charset) -{ - if (is_encoding_utf8(target_charset)) { - if (is_utf8(line->buf)) - return NULL; - } - return "ISO8859-1"; -} - static void convert_to_utf8(struct strbuf *line, const char *charset) { char *out; - if (!charset || !*charset) { - charset = guess_charset(line, metainfo_charset); - if (!charset) - return; - } + if (!charset || !*charset) + return; - if (!strcasecmp(metainfo_charset, charset)) + + if (same_encoding(metainfo_charset, charset)) return; out = reencode_string(line->buf, metainfo_charset, charset); if (!out) diff --combined notes.c index bc454e1eab,e48f6604d8..ee8f01f1d5 --- a/notes.c +++ b/notes.c @@@ -1196,18 -1196,8 +1196,18 @@@ void free_notes(struct notes_tree *t memset(t, 0, sizeof(struct notes_tree)); } -void format_note(struct notes_tree *t, const unsigned char *object_sha1, - struct strbuf *sb, const char *output_encoding, int flags) +/* + * Fill the given strbuf with the notes associated with the given object. + * + * If the given notes_tree structure is not initialized, it will be auto- + * initialized to the default value (see documentation for init_notes() above). + * If the given notes_tree is NULL, the internal/default notes_tree will be + * used instead. + * + * 'flags' is a bitwise combination of the flags for format_display_notes. + */ +static void format_note(struct notes_tree *t, const unsigned char *object_sha1, + struct strbuf *sb, const char *output_encoding, int flags) { static const char utf8[] = "utf-8"; const unsigned char *sha1; @@@ -1231,7 -1221,7 +1231,7 @@@ } if (output_encoding && *output_encoding && - strcmp(utf8, output_encoding)) { + !is_encoding_utf8(output_encoding)) { char *reencoded = reencode_string(msg, output_encoding, utf8); if (reencoded) { free(msg); diff --combined pretty.c index 413e7587b6,e87fe9fec3..dba682828c --- a/pretty.c +++ b/pretty.c @@@ -231,7 -231,7 +231,7 @@@ static int is_rfc822_special(char ch } } -static int has_rfc822_specials(const char *s, int len) +static int needs_rfc822_quoting(const char *s, int len) { int i; for (i = 0; i < len; i++) @@@ -240,17 -240,6 +240,17 @@@ return 0; } +static int last_line_length(struct strbuf *sb) +{ + int i; + + /* How many bytes are already used on the last line? */ + for (i = sb->len - 1; i >= 0; i--) + if (sb->buf[i] == '\n') + break; + return sb->len - (i + 1); +} + static void add_rfc822_quoted(struct strbuf *out, const char *s, int len) { int i; @@@ -272,110 -261,57 +272,110 @@@ strbuf_addch(out, '"'); } -static int is_rfc2047_special(char ch) +enum rfc2047_type { + RFC2047_SUBJECT, + RFC2047_ADDRESS, +}; + +static int is_rfc2047_special(char ch, enum rfc2047_type type) { - return (non_ascii(ch) || (ch == '=') || (ch == '?') || (ch == '_')); + /* + * rfc2047, section 4.2: + * + * 8-bit values which correspond to printable ASCII characters other + * than "=", "?", and "_" (underscore), MAY be represented as those + * characters. (But see section 5 for restrictions.) In + * particular, SPACE and TAB MUST NOT be represented as themselves + * within encoded words. + */ + + /* + * rule out non-ASCII characters and non-printable characters (the + * non-ASCII check should be redundant as isprint() is not localized + * and only knows about ASCII, but be defensive about that) + */ + if (non_ascii(ch) || !isprint(ch)) + return 1; + + /* + * rule out special printable characters (' ' should be the only + * whitespace character considered printable, but be defensive and use + * isspace()) + */ + if (isspace(ch) || ch == '=' || ch == '?' || ch == '_') + return 1; + + /* + * rfc2047, section 5.3: + * + * As a replacement for a 'word' entity within a 'phrase', for example, + * one that precedes an address in a From, To, or Cc header. The ABNF + * definition for 'phrase' from RFC 822 thus becomes: + * + * phrase = 1*( encoded-word / word ) + * + * In this case the set of characters that may be used in a "Q"-encoded + * 'encoded-word' is restricted to: . An 'encoded-word' that appears within a + * 'phrase' MUST be separated from any adjacent 'word', 'text' or + * 'special' by 'linear-white-space'. + */ + + if (type != RFC2047_ADDRESS) + return 0; + + /* '=' and '_' are special cases and have been checked above */ + return !(isalnum(ch) || ch == '!' || ch == '*' || ch == '+' || ch == '-' || ch == '/'); } -static void add_rfc2047(struct strbuf *sb, const char *line, int len, - const char *encoding) +static int needs_rfc2047_encoding(const char *line, int len, + enum rfc2047_type type) { - static const int max_length = 78; /* per rfc2822 */ int i; - int line_len; - - /* How many bytes are already used on the current line? */ - for (i = sb->len - 1; i >= 0; i--) - if (sb->buf[i] == '\n') - break; - line_len = sb->len - (i+1); for (i = 0; i < len; i++) { int ch = line[i]; if (non_ascii(ch) || ch == '\n') - goto needquote; + return 1; if ((i + 1 < len) && (ch == '=' && line[i+1] == '?')) - goto needquote; + return 1; } - strbuf_add_wrapped_bytes(sb, line, len, 0, 1, max_length - line_len); - return; -needquote: + return 0; +} + +static void add_rfc2047(struct strbuf *sb, const char *line, int len, + const char *encoding, enum rfc2047_type type) +{ + static const int max_encoded_length = 76; /* per rfc2047 */ + int i; + int line_len = last_line_length(sb); + strbuf_grow(sb, len * 3 + strlen(encoding) + 100); strbuf_addf(sb, "=?%s?q?", encoding); line_len += strlen(encoding) + 5; /* 5 for =??q? */ for (i = 0; i < len; i++) { unsigned ch = line[i] & 0xFF; + int is_special = is_rfc2047_special(ch, type); + + /* + * According to RFC 2047, we could encode the special character + * ' ' (space) with '_' (underscore) for readability. But many + * programs do not understand this and just leave the + * underscore in place. Thus, we do nothing special here, which + * causes ' ' to be encoded as '=20', avoiding this problem. + */ - if (line_len >= max_length - 2) { + if (line_len + 2 + (is_special ? 3 : 1) > max_encoded_length) { strbuf_addf(sb, "?=\n =?%s?q?", encoding); line_len = strlen(encoding) + 5 + 1; /* =??q? plus SP */ } - /* - * We encode ' ' using '=20' even though rfc2047 - * allows using '_' for readability. Unfortunately, - * many programs do not understand this and just - * leave the underscore in place. - */ - if (is_rfc2047_special(ch) || ch == ' ' || ch == '\n') { + if (is_special) { strbuf_addf(sb, "=%02X", ch); line_len += 3; - } - else { + } else { strbuf_addch(sb, ch); line_len++; } @@@ -387,7 -323,6 +387,7 @@@ void pp_user_info(const struct pretty_p const char *what, struct strbuf *sb, const char *line, const char *encoding) { + int max_length = 78; /* per rfc2822 */ char *date; int namelen; unsigned long time; @@@ -405,27 -340,25 +405,27 @@@ if (pp->fmt == CMIT_FMT_EMAIL) { char *name_tail = strchr(line, '<'); int display_name_length; - int final_line; if (!name_tail) return; while (line < name_tail && isspace(name_tail[-1])) name_tail--; display_name_length = name_tail - line; strbuf_addstr(sb, "From: "); - if (!has_rfc822_specials(line, display_name_length)) { - add_rfc2047(sb, line, display_name_length, encoding); - } else { + if (needs_rfc2047_encoding(line, display_name_length, RFC2047_ADDRESS)) { + add_rfc2047(sb, line, display_name_length, + encoding, RFC2047_ADDRESS); + max_length = 76; /* per rfc2047 */ + } else if (needs_rfc822_quoting(line, display_name_length)) { struct strbuf quoted = STRBUF_INIT; add_rfc822_quoted("ed, line, display_name_length); - add_rfc2047(sb, quoted.buf, quoted.len, encoding); + strbuf_add_wrapped_bytes(sb, quoted.buf, quoted.len, + -6, 1, max_length); strbuf_release("ed); + } else { + strbuf_add_wrapped_bytes(sb, line, display_name_length, + -6, 1, max_length); } - for (final_line = 0; final_line < sb->len; final_line++) - if (sb->buf[sb->len - final_line - 1] == '\n') - break; - if (namelen - display_name_length + final_line > 78) { + if (namelen - display_name_length + last_line_length(sb) > max_length) { strbuf_addch(sb, '\n'); if (!isspace(name_tail[0])) strbuf_addch(sb, ' '); @@@ -571,7 -504,7 +571,7 @@@ char *logmsg_reencode(const struct comm return NULL; encoding = get_header(commit, "encoding"); use_encoding = encoding ? encoding : utf8; - if (!strcmp(use_encoding, output_encoding)) + if (same_encoding(use_encoding, output_encoding)) if (encoding) /* we'll strip encoding header later */ out = xstrdup(commit->buffer); else @@@ -1345,7 -1278,6 +1345,7 @@@ void pp_title_line(const struct pretty_ const char *encoding, int need_8bit_cte) { + static const int max_length = 78; /* per rfc2047 */ struct strbuf title; strbuf_init(&title, 80); @@@ -1355,12 -1287,7 +1355,12 @@@ strbuf_grow(sb, title.len + 1024); if (pp->subject) { strbuf_addstr(sb, pp->subject); - add_rfc2047(sb, title.buf, title.len, encoding); + if (needs_rfc2047_encoding(title.buf, title.len, RFC2047_SUBJECT)) + add_rfc2047(sb, title.buf, title.len, + encoding, RFC2047_SUBJECT); + else + strbuf_add_wrapped_bytes(sb, title.buf, title.len, + -last_line_length(sb), 1, max_length); } else { strbuf_addbuf(sb, &title); } diff --combined sequencer.c index e3723d2095,f2f5b137ea..73c396bd89 --- a/sequencer.c +++ b/sequencer.c @@@ -17,9 -17,7 +17,9 @@@ #define GIT_REFLOG_ACTION "GIT_REFLOG_ACTION" -void remove_sequencer_state(void) +const char sign_off_header[] = "Signed-off-by: "; + +static void remove_sequencer_state(void) { struct strbuf seq_dir = STRBUF_INIT; @@@ -60,7 -58,7 +60,7 @@@ static int get_message(struct commit *c out->reencoded_message = NULL; out->message = commit->buffer; - if (strcmp(encoding, git_commit_encoding)) + if (same_encoding(encoding, git_commit_encoding)) out->reencoded_message = reencode_string(commit->buffer, git_commit_encoding, encoding); if (out->reencoded_message) @@@ -235,9 -233,6 +235,9 @@@ static int do_recursive_merge(struct co die(_("%s: Unable to write new index file"), action_name(opts)); rollback_lock_file(&index_lock); + if (opts->signoff) + append_signoff(msgbuf, 0); + if (!clean) { int i; strbuf_addstr(msgbuf, "\nConflicts:\n"); @@@ -316,9 -311,6 +316,9 @@@ static int run_git_commit(const char *d if (allow_empty) argv_array_push(&array, "--allow-empty"); + if (opts->allow_empty_message) + argv_array_push(&array, "--allow-empty-message"); + rc = run_command_v_opt(array.argv, RUN_GIT_CMD); argv_array_clear(&array); return rc; @@@ -1016,63 -1008,3 +1016,63 @@@ int sequencer_pick_revisions(struct rep save_opts(opts); return pick_commits(todo_list, opts); } + +static int ends_rfc2822_footer(struct strbuf *sb, int ignore_footer) +{ + int ch; + int hit = 0; + int i, j, k; + int len = sb->len - ignore_footer; + int first = 1; + const char *buf = sb->buf; + + for (i = len - 1; i > 0; i--) { + if (hit && buf[i] == '\n') + break; + hit = (buf[i] == '\n'); + } + + while (i < len - 1 && buf[i] == '\n') + i++; + + for (; i < len; i = k) { + for (k = i; k < len && buf[k] != '\n'; k++) + ; /* do nothing */ + k++; + + if ((buf[k] == ' ' || buf[k] == '\t') && !first) + continue; + + first = 0; + + for (j = 0; i + j < len; j++) { + ch = buf[i + j]; + if (ch == ':') + break; + if (isalnum(ch) || + (ch == '-')) + continue; + return 0; + } + } + return 1; +} + +void append_signoff(struct strbuf *msgbuf, int ignore_footer) +{ + struct strbuf sob = STRBUF_INIT; + int i; + + strbuf_addstr(&sob, sign_off_header); + strbuf_addstr(&sob, fmt_name(getenv("GIT_COMMITTER_NAME"), + getenv("GIT_COMMITTER_EMAIL"))); + strbuf_addch(&sob, '\n'); + for (i = msgbuf->len - 1 - ignore_footer; i > 0 && msgbuf->buf[i - 1] != '\n'; i--) + ; /* do nothing */ + if (prefixcmp(msgbuf->buf + i, sob.buf)) { + if (!i || !ends_rfc2822_footer(msgbuf, ignore_footer)) + strbuf_splice(msgbuf, msgbuf->len - ignore_footer, 0, "\n", 1); + strbuf_splice(msgbuf, msgbuf->len - ignore_footer, 0, sob.buf, sob.len); + } + strbuf_release(&sob); +} diff --combined utf8.c index 28791a7c31,6a52834576..5c61bbe113 --- a/utf8.c +++ b/utf8.c @@@ -353,7 -353,7 +353,7 @@@ retry c = *text; if (!c || isspace(c)) { - if (w < width || !space) { + if (w <= width || !space) { const char *start = bol; if (!c && text == start) return w; @@@ -423,6 -423,13 +423,13 @@@ int is_encoding_utf8(const char *name return 0; } + int same_encoding(const char *src, const char *dst) + { + if (is_encoding_utf8(src) && is_encoding_utf8(dst)) + return 1; + return !strcasecmp(src, dst); + } + /* * Given a buffer and its encoding, return it re-encoded * with iconv. If the conversion fails, returns NULL.