Merge branch 'jc/utf8'
author Junio C Hamano <junkio@cox.net>
Fri, 29 Dec 2006 03:03:02 +0000 (19:03 -0800)
committer Junio C Hamano <junkio@cox.net>
Fri, 29 Dec 2006 03:03:02 +0000 (19:03 -0800)
* jc/utf8:
t3900: test conversion to non UTF-8 as well
Rename t3900 test vector file
UTF-8: introduce i18n.logoutputencoding.
Teach log family --encoding
i18n.logToUTF8: convert commit log message to UTF-8
Move encoding conversion routine out of mailinfo to utf8.c

Conflicts:

commit.c

18 files changed:
Documentation/config.txt
builtin-commit-tree.c
builtin-log.c
builtin-mailinfo.c
cache.h
commit.c
config.c
contrib/completion/git-completion.bash
environment.c
revision.h
t/t3900-i18n-commit.sh [new file with mode: 0755]
t/t3900/1-UTF-8.txt [new file with mode: 0644]
t/t3900/2-UTF-8.txt [new file with mode: 0644]
t/t3900/EUCJP.txt [new file with mode: 0644]
t/t3900/ISO-2022-JP.txt [new file with mode: 0644]
t/t3900/ISO-8859-1.txt [new file with mode: 0644]
utf8.c
utf8.h
index 6452a8be144f92f9140fb133acda4d9fd32f2b15..178e0e1e209e066ad220d6ba95be0728c2db5e99 100644 (file)
@@ -267,6 +267,10 @@ i18n.commitEncoding::
        browser (and possibly at other places in the future or in other
        porcelains). See e.g. gitlink:git-mailinfo[1]. Defaults to 'utf-8'.
 
+i18n.logOutputEncoding::
+       Character encoding the commit messages are converted to when
+       running `git-log` and friends.
+
 log.showroot::
        If true, the initial commit will be shown as a big creation event.
        This is equivalent to a diff against an empty tree.
index f641787988e197209f097cbc9d1b260a2cb6d9d8..146aaffd282987454c0910477cfe7a047f478e94 100644 (file)
@@ -92,6 +92,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
        char comment[1000];
        char *buffer;
        unsigned int size;
+       int encoding_is_utf8;
 
        setup_ident();
        git_config(git_default_config);
@@ -117,6 +118,10 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
                        parents++;
        }
 
+       /* Not having i18n.commitencoding is the same as having utf-8 */
+       encoding_is_utf8 = (!git_commit_encoding ||
+                           !strcmp(git_commit_encoding, "utf-8"));
+
        init_buffer(&buffer, &size);
        add_buffer(&buffer, &size, "tree %s\n", sha1_to_hex(tree_sha1));
 
@@ -130,7 +135,11 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
 
        /* Person/date information */
        add_buffer(&buffer, &size, "author %s\n", git_author_info(1));
-       add_buffer(&buffer, &size, "committer %s\n\n", git_committer_info(1));
+       add_buffer(&buffer, &size, "committer %s\n", git_committer_info(1));
+       if (!encoding_is_utf8)
+               add_buffer(&buffer, &size,
+                               "encoding %s\n", git_commit_encoding);
+       add_buffer(&buffer, &size, "\n");
 
        /* And add the comment */
        while (fgets(comment, sizeof(comment), stdin) != NULL)
@@ -138,7 +147,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
 
        /* And check the encoding */
        buffer[size] = '\0';
-       if (!strcmp(git_commit_encoding, "utf-8") && !is_utf8(buffer))
+       if (encoding_is_utf8 && !is_utf8(buffer))
                fprintf(stderr, commit_utf8_warn);
 
        if (!write_sha1_file(buffer, size, commit_type, commit_sha1)) {
index 8df3c1394a0b70548708137bfe20126fbea01709..a59b4acef1bc45f07f15e10c0501aa23de3eed7d 100644 (file)
@@ -20,6 +20,8 @@ void add_head(struct rev_info *revs);
 static void cmd_log_init(int argc, const char **argv, const char *prefix,
                      struct rev_info *rev)
 {
+       int i;
+
        rev->abbrev = DEFAULT_ABBREV;
        rev->commit_format = CMIT_FMT_DEFAULT;
        rev->verbose_header = 1;
@@ -27,8 +29,18 @@ static void cmd_log_init(int argc, const char **argv, const char *prefix,
        argc = setup_revisions(argc, argv, rev, "HEAD");
        if (rev->diffopt.pickaxe || rev->diffopt.filter)
                rev->always_show_header = 0;
-       if (argc > 1)
-               die("unrecognized argument: %s", argv[1]);
+       for (i = 1; i < argc; i++) {
+               const char *arg = argv[i];
+               if (!strncmp(arg, "--encoding=", 11)) {
+                       arg += 11;
+                       if (strcmp(arg, "none"))
+                               git_log_output_encoding = strdup(arg);
+                       else
+                               git_log_output_encoding = "";
+               }
+               else
+                       die("unrecognized argument: %s", arg);
+       }
 }
 
 static int cmd_log_walk(struct rev_info *rev)
index e6472293d47611d415276f6057227d9c93788f63..a67f3eb90b6f715714c6fa7bb931044630c74111 100644 (file)
@@ -4,6 +4,7 @@
  */
 #include "cache.h"
 #include "builtin.h"
+#include "utf8.h"
 
 static FILE *cmitmsg, *patchfile, *fin, *fout;
 
@@ -510,40 +511,18 @@ static int decode_b_segment(char *in, char *ot, char *ep)
 
 static void convert_to_utf8(char *line, char *charset)
 {
-#ifndef NO_ICONV
-       char *in, *out;
-       size_t insize, outsize, nrc;
-       char outbuf[4096]; /* cheat */
        static char latin_one[] = "latin1";
        char *input_charset = *charset ? charset : latin_one;
-       iconv_t conv = iconv_open(metainfo_charset, input_charset);
-
-       if (conv == (iconv_t) -1) {
-               static int warned_latin1_once = 0;
-               if (input_charset != latin_one) {
-                       fprintf(stderr, "cannot convert from %s to %s\n",
-                               input_charset, metainfo_charset);
-                       *charset = 0;
-               }
-               else if (!warned_latin1_once) {
-                       warned_latin1_once = 1;
-                       fprintf(stderr, "tried to convert from %s to %s, "
-                               "but your iconv does not work with it.\n",
-                               input_charset, metainfo_charset);
-               }
+       char *out = reencode_string(line, metainfo_charset, input_charset);
+
+       if (!out) {
+               fprintf(stderr, "cannot convert from %s to %s\n",
+                       input_charset, metainfo_charset);
+               *charset = 0;
                return;
        }
-       in = line;
-       insize = strlen(in);
-       out = outbuf;
-       outsize = sizeof(outbuf);
-       nrc = iconv(conv, &in, &insize, &out, &outsize);
-       iconv_close(conv);
-       if (nrc == (size_t) -1)
-               return;
-       *out = 0;
-       strcpy(line, outbuf);
-#endif
+       strcpy(line, out);
+       free(out);
 }
 
 static int decode_header_bq(char *it)
@@ -827,7 +806,8 @@ int cmd_mailinfo(int argc, const char **argv, const char *prefix)
                if (!strcmp(argv[1], "-k"))
                        keep_subject = 1;
                else if (!strcmp(argv[1], "-u"))
-                       metainfo_charset = git_commit_encoding;
+                       metainfo_charset = (git_commit_encoding
+                                           ? git_commit_encoding : "utf-8");
                else if (!strncmp(argv[1], "--encoding=", 11))
                        metainfo_charset = argv[1] + 11;
                else
diff --git a/cache.h b/cache.h
index 4943056c19ffb72a7cfb994daaa788ec1b01d60b..29dd290c9253bd96f086432e24d98cb0b43fa096 100644 (file)
--- a/cache.h
+++ b/cache.h
@@ -416,8 +416,8 @@ extern int check_repository_format_version(const char *var, const char *value);
 extern char git_default_email[MAX_GITNAME];
 extern char git_default_name[MAX_GITNAME];
 
-#define MAX_ENCODING_LENGTH 64
-extern char git_commit_encoding[MAX_ENCODING_LENGTH];
+extern char *git_commit_encoding;
+extern char *git_log_output_encoding;
 
 extern int copy_fd(int ifd, int ofd);
 extern void write_or_die(int fd, const void *buf, size_t count);
index 59ea77c577886a3ba5b4ef385099d7a564af173c..eb06afbbe0f00ac4f553e37c50eca290418a7907 100644 (file)
--- a/commit.c
+++ b/commit.c
@@ -2,6 +2,7 @@
 #include "tag.h"
 #include "commit.h"
 #include "pkt-line.h"
+#include "utf8.h"
 
 int save_commit_buffer = 1;
 
@@ -597,10 +598,61 @@ static int add_merge_info(enum cmit_fmt fmt, char *buf, const struct commit *com
        return offset;
 }
 
-unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit,
-                                 unsigned long len, char *buf, unsigned long space,
+/*
+ * Look for a "<key> <value>" line in the header part of commit->buffer
+ * (the lines before the first empty line) and return a newly allocated,
+ * NUL-terminated copy of <value>; the caller must free() it.
+ * Returns NULL when no such header line is found.
+ */
+static char *get_header(const struct commit *commit, const char *key)
+{
+       int key_len = strlen(key);
+       const char *line = commit->buffer;
+
+       /*
+        * "line" becomes NULL once a final line without a trailing
+        * newline has been examined; stop there instead of handing
+        * NULL to strchr().
+        */
+       while (line) {
+               const char *eol = strchr(line, '\n'), *next;
+
+               /* an empty line ends the header part */
+               if (line == eol)
+                       return NULL;
+               if (!eol) {
+                       eol = line + strlen(line);
+                       next = NULL;
+               } else
+                       next = eol + 1;
+               if (!strncmp(line, key, key_len) && line[key_len] == ' ') {
+                       /* the value, plus one byte for the NUL */
+                       int len = eol - line - key_len;
+                       char *ret = xmalloc(len);
+                       memcpy(ret, line + key_len + 1, len - 1);
+                       ret[len - 1] = '\0';
+                       return ret;
+               }
+               line = next;
+       }
+       return NULL;
+}
+
+/*
+ * Return a newly allocated copy of commit->buffer converted to the log
+ * output encoding (git_log_output_encoding if set, otherwise
+ * git_commit_encoding); the caller must free() it.
+ *
+ * Returns NULL, meaning "use commit->buffer as it is", when no output
+ * encoding is in effect, when the commit has no "encoding" header,
+ * when that header already names the output encoding, or when the
+ * conversion fails.
+ */
+static char *logmsg_reencode(const struct commit *commit)
+{
+       char *encoding;
+       char *out;
+       char *output_encoding = (git_log_output_encoding
+                                ? git_log_output_encoding
+                                : git_commit_encoding);
+
+       /*
+        * An empty string is what "--encoding=none" stores; it asks
+        * for no conversion and must not reach iconv_open() as the
+        * name of the target encoding.
+        */
+       if (!output_encoding || !*output_encoding)
+               return NULL;
+       encoding = get_header(commit, "encoding");
+       if (!encoding || !strcmp(encoding, output_encoding)) {
+               free(encoding);
+               return NULL;
+       }
+       out = reencode_string(commit->buffer, output_encoding, encoding);
+       free(encoding);
+       return out;
+}
+
+unsigned long pretty_print_commit(enum cmit_fmt fmt,
+                                 const struct commit *commit,
+                                 unsigned long len,
+                                 char *buf, unsigned long space,
                                  int abbrev, const char *subject,
-                                 const char *after_subject, int relative_date)
+                                 const char *after_subject,
+                                 int relative_date)
 {
        int hdr = 1, body = 0;
        unsigned long offset = 0;
@@ -608,6 +660,10 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit
        int parents_shown = 0;
        const char *msg = commit->buffer;
        int plain_non_ascii = 0;
+       char *reencoded = logmsg_reencode(commit);
+
+       if (reencoded)
+               msg = reencoded;
 
        if (fmt == CMIT_FMT_ONELINE || fmt == CMIT_FMT_EMAIL)
                indent = 0;
@@ -624,7 +680,7 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit
                for (in_body = i = 0; (ch = msg[i]) && i < len; i++) {
                        if (!in_body) {
                                /* author could be non 7-bit ASCII but
-                                * the log may so; skip over the
+                                * the log may be so; skip over the
                                 * header part first.
                                 */
                                if (ch == '\n' &&
@@ -755,6 +811,8 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit
        if (fmt == CMIT_FMT_EMAIL && !body)
                buf[offset++] = '\n';
        buf[offset] = '\0';
+
+       free(reencoded);
        return offset;
 }
 
index 1662a4626e569b07d96c622b357928216a24538c..fcccf7e2a4f3b7487af10d4f7b505c7ef492b9e8 100644 (file)
--- a/config.c
+++ b/config.c
@@ -309,10 +309,16 @@ int git_default_config(const char *var, const char *value)
        }
 
        if (!strcmp(var, "i18n.commitencoding")) {
-               strlcpy(git_commit_encoding, value, sizeof(git_commit_encoding));
+               git_commit_encoding = strdup(value);
                return 0;
        }
 
+       if (!strcmp(var, "i18n.logoutputencoding")) {
+               git_log_output_encoding = strdup(value);
+               return 0;
+       }
+
+
        if (!strcmp(var, "pager.color") || !strcmp(var, "color.pager")) {
                pager_use_color = git_config_bool(var,value);
                return 0;
index 234cd0954b888d814d8d4d86bb41983b80fddade..7c7520ea29dfb26ffafc355ea739046b6d821e28 100755 (executable)
@@ -711,6 +711,7 @@ _git_repo_config ()
                core.compression
                core.legacyHeaders
                i18n.commitEncoding
+               i18n.logOutputEncoding
                diff.color
                color.diff
                diff.renameLimit
index f8c7dbceadf2190997816f9a21b211a6a97efe48..a1502c4e87c0067c8cc276006317005a0da21a49 100644 (file)
@@ -18,7 +18,8 @@ int prefer_symlink_refs;
 int log_all_ref_updates;
 int warn_ambiguous_refs = 1;
 int repository_format_version;
-char git_commit_encoding[MAX_ENCODING_LENGTH] = "utf-8";
+char *git_commit_encoding;
+char *git_log_output_encoding;
 int shared_repository = PERM_UMASK;
 const char *apply_default_whitespace;
 int zlib_compression_level = Z_DEFAULT_COMPRESSION;
index ec991e5c57039a57af7c63db483e5b108a25ad16..8f7907d7abdb764317612d868f58ab88f7175b59 100644 (file)
@@ -72,6 +72,7 @@ struct rev_info {
        const char      *ref_message_id;
        const char      *add_signoff;
        const char      *extra_headers;
+       const char      *log_reencode;
 
        /* Filter by commit log message */
        struct grep_opt *grep_filter;
diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh
new file mode 100755 (executable)
index 0000000..46fd47c
--- /dev/null
@@ -0,0 +1,115 @@
+#!/bin/sh
+#
+# Copyright (c) 2006 Junio C Hamano
+#
+
+test_description='commit and log output encodings'
+
+. ./test-lib.sh
+
+compare_with () {
+       git-show -s "$1" | sed -e '1,/^$/d' -e 's/^    //' -e '$d' >current &&
+       diff -u current "$2"
+}
+
+test_expect_success setup '
+       : >F &&
+       git-add F &&
+       T=$(git-write-tree) &&
+       C=$(git-commit-tree $T <../t3900/1-UTF-8.txt) &&
+       git-update-ref HEAD $C &&
+       git-tag C0
+'
+
+test_expect_success 'no encoding header for base case' '
+       E=$(git-cat-file commit C0 | sed -ne "s/^encoding //p") &&
+       test z = "z$E"
+'
+
+for H in ISO-8859-1 EUCJP ISO-2022-JP
+do
+       test_expect_success "$H setup" '
+               git-repo-config i18n.commitencoding $H &&
+               git-checkout -b $H C0 &&
+               echo $H >F &&
+               git-commit -a -F ../t3900/$H.txt
+       '
+done
+
+for H in ISO-8859-1 EUCJP ISO-2022-JP
+do
+       test_expect_success "check encoding header for $H" '
+               E=$(git-cat-file commit '$H' | sed -ne "s/^encoding //p") &&
+               test "z$E" = "z'$H'"
+       '
+done
+
+test_expect_success 'repo-config to remove customization' '
+       git-repo-config --unset-all i18n.commitencoding &&
+       if Z=$(git-repo-config --get-all i18n.commitencoding)
+       then
+               echo Oops, should have failed.
+               false
+       else
+               test z = "z$Z"
+       fi &&
+       git-repo-config i18n.commitencoding utf-8
+'
+
+test_expect_success 'ISO-8859-1 should be shown in UTF-8 now' '
+       compare_with ISO-8859-1 ../t3900/1-UTF-8.txt
+'
+
+for H in EUCJP ISO-2022-JP
+do
+       test_expect_success "$H should be shown in UTF-8 now" '
+               compare_with '$H' ../t3900/2-UTF-8.txt
+       '
+done
+
+test_expect_success 'repo-config to add customization' '
+       git-repo-config --unset-all i18n.commitencoding &&
+       if Z=$(git-repo-config --get-all i18n.commitencoding)
+       then
+               echo Oops, should have failed.
+               false
+       else
+               test z = "z$Z"
+       fi
+'
+
+for H in ISO-8859-1 EUCJP ISO-2022-JP
+do
+       test_expect_success "$H should be shown in itself now" '
+               git-repo-config i18n.commitencoding '$H' &&
+               compare_with '$H' ../t3900/'$H'.txt
+       '
+done
+
+test_expect_success 'repo-config to tweak customization' '
+       git-repo-config i18n.logoutputencoding utf-8
+'
+
+test_expect_success 'ISO-8859-1 should be shown in UTF-8 now' '
+       compare_with ISO-8859-1 ../t3900/1-UTF-8.txt
+'
+
+for H in EUCJP ISO-2022-JP
+do
+       test_expect_success "$H should be shown in UTF-8 now" '
+               compare_with '$H' ../t3900/2-UTF-8.txt
+       '
+done
+
+for J in EUCJP ISO-2022-JP
+do
+       git-repo-config i18n.logoutputencoding $J
+       for H in EUCJP ISO-2022-JP
+       do
+               test_expect_success "$H should be shown in $J now" '
+                       compare_with '$H' ../t3900/'$J'.txt
+               '
+       done
+done
+
+test_done
diff --git a/t/t3900/1-UTF-8.txt b/t/t3900/1-UTF-8.txt
new file mode 100644 (file)
index 0000000..ee31e19
--- /dev/null
@@ -0,0 +1,3 @@
+ÄËÑÏÖ
+
+Ábçdèfg
diff --git a/t/t3900/2-UTF-8.txt b/t/t3900/2-UTF-8.txt
new file mode 100644 (file)
index 0000000..63f4f8f
--- /dev/null
@@ -0,0 +1,4 @@
+はれひほふ
+
+しているのが、いるので。
+濱浜ほれぷりぽれまびぐりろへ。
diff --git a/t/t3900/EUCJP.txt b/t/t3900/EUCJP.txt
new file mode 100644 (file)
index 0000000..546f2aa
--- /dev/null
@@ -0,0 +1,4 @@
+¤Ï¤ì¤Ò¤Û¤Õ
+
+¤·¤Æ¤¤¤ë¤Î¤¬¡¢¤¤¤ë¤Î¤Ç¡£
+ßÀÉͤۤì¤×¤ê¤Ý¤ì¤Þ¤Ó¤°¤ê¤í¤Ø¡£
diff --git a/t/t3900/ISO-2022-JP.txt b/t/t3900/ISO-2022-JP.txt
new file mode 100644 (file)
index 0000000..74b5330
--- /dev/null
@@ -0,0 +1,4 @@
+\e$B$O$l$R$[$U\e(B
+
+\e$B$7$F$$$k$N$,!"$$$k$N$G!#\e(B
+\e$B_@IM$[$l$W$j$]$l$^$S$0$j$m$X!#\e(B
diff --git a/t/t3900/ISO-8859-1.txt b/t/t3900/ISO-8859-1.txt
new file mode 100644 (file)
index 0000000..7cbef0e
--- /dev/null
@@ -0,0 +1,3 @@
+ÄËÑÏÖ
+
+Ábçdèfg
diff --git a/utf8.c b/utf8.c
index 8fa62571aa959897275e694a4370e99c46cccd6a..1eedd8b61aeed9867366df0b70ac849cdef985b9 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -276,3 +276,57 @@ void print_wrapped_text(const char *text, int indent, int indent2, int width)
                }
        }
 }
+
+/*
+ * Given a NUL-terminated buffer and its encoding, return a newly
+ * allocated, NUL-terminated copy re-encoded with iconv; the caller
+ * must free() it.  Returns NULL if in_encoding is NULL, if
+ * iconv_open() cannot set up the conversion, or if the conversion
+ * fails for any reason other than lack of output space.
+ */
+#ifndef NO_ICONV
+char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
+{
+       iconv_t conv;
+       size_t insz, outsz, outalloc;
+       char *out, *outpos, *cp;
+       int flushing = 0;
+
+       if (!in_encoding)
+               return NULL;
+       conv = iconv_open(out_encoding, in_encoding);
+       if (conv == (iconv_t) -1)
+               return NULL;
+       insz = strlen(in);
+       outsz = insz;
+       outalloc = outsz + 1; /* for terminating NUL */
+       out = xmalloc(outalloc);
+       outpos = out;
+       cp = (char *)in;
+
+       while (1) {
+               size_t cnt;
+
+               /*
+                * Once all the input has been converted, call iconv()
+                * with a NULL input so that a stateful output encoding
+                * (e.g. ISO-2022-JP) gets the bytes that bring it back
+                * to its initial shift state.
+                */
+               if (!flushing)
+                       cnt = iconv(conv, &cp, &insz, &outpos, &outsz);
+               else
+                       cnt = iconv(conv, NULL, NULL, &outpos, &outsz);
+
+               if (cnt == (size_t) -1) {
+                       size_t sofar;
+                       if (errno != E2BIG) {
+                               free(out);
+                               iconv_close(conv);
+                               return NULL;
+                       }
+                       /* The output buffer is full.  insz has the
+                        * number of input bytes still to convert (zero
+                        * while flushing); make room for twice that
+                        * plus some slack, keep what has been written
+                        * so far, and try again.
+                        */
+                       sofar = outpos - out;
+                       outalloc = sofar + insz * 2 + 32;
+                       out = xrealloc(out, outalloc);
+                       outpos = out + sofar;
+                       outsz = outalloc - sofar - 1;
+               }
+               else if (!flushing) {
+                       flushing = 1;
+               }
+               else {
+                       *outpos = '\0';
+                       break;
+               }
+       }
+       iconv_close(conv);
+       return out;
+}
+#endif
diff --git a/utf8.h b/utf8.h
index a0d7f591adc3d86e5c813d79059c7c0335b32dc3..cae2a8e665c2cbe7bf31a49deed84250eaa37a33 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -5,4 +5,10 @@ int utf8_width(const char **start);
 int is_utf8(const char *text);
 void print_wrapped_text(const char *text, int indent, int indent2, int len);
 
+#ifndef NO_ICONV
+char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding);
+#else
+#define reencode_string(a,b,c) NULL
+#endif
+
 #endif