Merge branch 'en/fast-export-encoding'
author Junio C Hamano <gitster@pobox.com>
Thu, 13 Jun 2019 20:19:39 +0000 (13:19 -0700)
committer Junio C Hamano <gitster@pobox.com>
Thu, 13 Jun 2019 20:19:40 +0000 (13:19 -0700)
The "git fast-export/import" pair has been taught to handle commits
with log messages in encoding other than UTF-8 better.

* en/fast-export-encoding:
fast-export: do automatic reencoding of commit messages only if requested
fast-export: differentiate between explicitly UTF-8 and implicitly UTF-8
fast-export: avoid stripping encoding header if we cannot reencode
fast-import: support 'encoding' commit header
t9350: fix encoding test to actually test reencoding

Documentation/git-fast-export.txt
Documentation/git-fast-import.txt
builtin/fast-export.c
fast-import.c
t/t9300-fast-import.sh
t/t9350-fast-export.sh
t/t9350/broken-iso-8859-7-commit-message.txt [new file with mode: 0644]
t/t9350/simple-iso-8859-7-commit-message.txt [new file with mode: 0644]
index 64c01ba91884df1ec8e49ddc8fe852f1fb2a9425..11427acdde68e659ca5d443beff037df8c9f3ebb 100644 (file)
@@ -129,6 +129,13 @@ marks the same across runs.
        for intermediary filters (e.g. for rewriting commit messages
        which refer to older commits, or for stripping blobs by id).
 
+--reencode=(yes|no|abort)::
+       Specify how to handle the `encoding` header in commit objects.  When
+       asking to 'abort' (which is the default), this program will die
+       when encountering such a commit object.  With 'yes', the commit
+       message will be reencoded into UTF-8.  With 'no', the original
+       encoding will be preserved.
+
 --refspec::
        Apply the specified refspec to each ref exported. Multiple of them can
        be specified.
index d65cdb3d08fd745bd4996d0e45259077ea8eb000..7baf9e47b5e61391fbfd5fe44acbfa9943bbe397 100644 (file)
@@ -388,6 +388,7 @@ change to the project.
        original-oid?
        ('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
        'committer' (SP <name>)? SP LT <email> GT SP <when> LF
+       ('encoding' SP <encoding> LF)?
        data
        ('from' SP <commit-ish> LF)?
        ('merge' SP <commit-ish> LF)?
@@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
 See ``Date Formats'' above for the set of supported formats, and
 their syntax.
 
+`encoding`
+^^^^^^^^^^
+The optional `encoding` command indicates the encoding of the commit
+message.  Most commits are UTF-8 and the encoding is omitted, but this
+allows importing commit messages into git without first reencoding them.
+
 `from`
 ^^^^^^
 The `from` command is used to specify the commit to initialize
index 9e283482efcfa6de0376cc9306061cea149b12df..c22cef3b2faff945148029197a32253540c24f73 100644 (file)
@@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
 static int progress;
 static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
 static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
+static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
 static int fake_missing_tagger;
 static int use_done_feature;
 static int no_data;
@@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
        return 0;
 }
 
+static int parse_opt_reencode_mode(const struct option *opt,
+                                  const char *arg, int unset)
+{
+       if (unset) {
+               reencode_mode = REENCODE_ABORT;
+               return 0;
+       }
+
+       switch (git_parse_maybe_bool(arg)) {
+       case 0:
+               reencode_mode = REENCODE_NO;
+               break;
+       case 1:
+               reencode_mode = REENCODE_YES;
+               break;
+       default:
+               if (!strcasecmp(arg, "abort"))
+                       reencode_mode = REENCODE_ABORT;
+               else
+                       return error("Unknown reencoding mode: %s", arg);
+       }
+
+       return 0;
+}
+
 static struct decoration idnums;
 static uint32_t last_idnum;
 
@@ -453,7 +479,7 @@ static const char *find_encoding(const char *begin, const char *end)
        bol = memmem(begin, end ? end - begin : strlen(begin),
                     needle, strlen(needle));
        if (!bol)
-               return git_commit_encoding;
+               return NULL;
        bol += strlen(needle);
        eol = strchrnul(bol, '\n');
        *eol = '\0';
@@ -633,18 +659,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
        }
 
        mark_next_object(&commit->object);
-       if (anonymize)
+       if (anonymize) {
                reencoded = anonymize_commit_message(message);
-       else if (!is_encoding_utf8(encoding))
-               reencoded = reencode_string(message, "UTF-8", encoding);
+       } else if (encoding) {
+               switch(reencode_mode) {
+               case REENCODE_YES:
+                       reencoded = reencode_string(message, "UTF-8", encoding);
+                       break;
+               case REENCODE_NO:
+                       break;
+               case REENCODE_ABORT:
+                       die("Encountered commit-specific encoding %s in commit "
+                           "%s; use --reencode=[yes|no] to handle it",
+                           encoding, oid_to_hex(&commit->object.oid));
+               }
+       }
        if (!commit->parents)
                printf("reset %s\n", refname);
        printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
        if (show_original_ids)
                printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
-       printf("%.*s\n%.*s\ndata %u\n%s",
+       printf("%.*s\n%.*s\n",
               (int)(author_end - author), author,
-              (int)(committer_end - committer), committer,
+              (int)(committer_end - committer), committer);
+       if (!reencoded && encoding)
+               printf("encoding %s\n", encoding);
+       printf("data %u\n%s",
               (unsigned)(reencoded
                          ? strlen(reencoded) : message
                          ? strlen(message) : 0),
@@ -1088,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
                OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
                             N_("select handling of tags that tag filtered objects"),
                             parse_opt_tag_of_filtered_mode),
+               OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
+                            N_("select handling of commit messages in an alternate encoding"),
+                            parse_opt_reencode_mode),
                OPT_STRING(0, "export-marks", &export_filename, N_("file"),
                             N_("Dump marks to this file")),
                OPT_STRING(0, "import-marks", &import_filename, N_("file"),
index f38d04fa58510bb7ab35caf4c43d5b2d954cc292..76a7bd369987f7fed63c0f602efe59e0edbd01b8 100644 (file)
@@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
        struct branch *b;
        char *author = NULL;
        char *committer = NULL;
+       const char *encoding = NULL;
        struct hash_list *merge_list = NULL;
        unsigned int merge_count;
        unsigned char prev_fanout, new_fanout;
@@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
        }
        if (!committer)
                die("Expected committer but didn't get one");
+       if (skip_prefix(command_buf.buf, "encoding ", &encoding))
+               read_next_command();
        parse_data(&msg, 0, NULL);
        read_next_command();
        parse_from(b);
@@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
        }
        strbuf_addf(&new_data,
                "author %s\n"
-               "committer %s\n"
-               "\n",
+               "committer %s\n",
                author ? author : committer, committer);
+       if (encoding)
+               strbuf_addf(&new_data,
+                       "encoding %s\n",
+                       encoding);
+       strbuf_addch(&new_data, '\n');
        strbuf_addbuf(&new_data, &msg);
        free(author);
        free(committer);
index 3668263c4046d96fdc79ea3ebe0c28bcb1f2de24..141b7fa35e74b860d91ea7cdabf48730442ed635 100755 (executable)
@@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
        sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
 '
 
+###
+### series X (other new features)
+###
+
+test_expect_success 'X: handling encoding' '
+       test_tick &&
+       cat >input <<-INPUT_END &&
+       commit refs/heads/encoding
+       committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+       encoding iso-8859-7
+       data <<COMMIT
+       INPUT_END
+
+       printf "Pi: \360\nCOMMIT\n" >>input &&
+
+       git fast-import <input &&
+       git cat-file -p encoding | grep $(printf "\360") &&
+       git log -1 --format=%B encoding | grep $(printf "\317\200")
+'
+
 test_done
index 5690fe28106624f3d666ccdda1dbcdd58673fb13..b4004e05c2a72c4daf95c4ce74b2145a9486efc4 100755 (executable)
@@ -94,22 +94,83 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
        test $MUSS = $(git rev-parse --verify refs/tags/muss)
 '
 
-test_expect_success 'iso-8859-1' '
+test_expect_success 'reencoding iso-8859-7' '
 
-       git config i18n.commitencoding ISO8859-1 &&
-       # use author and committer name in ISO-8859-1 to match it.
-       . "$TEST_DIRECTORY"/t3901/8859-1.txt &&
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
        test_tick &&
        echo rosten >file &&
-       git commit -s -m den file &&
-       git fast-export wer^..wer >iso8859-1.fi &&
-       sed "s/wer/i18n/" iso8859-1.fi |
+       git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
+       sed "s/wer/i18n/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&
+                # The commit object, if not re-encoded, would be 240 bytes.
+                # Removing the "encoding iso-8859-7\n" header drops 20 bytes.
+                # Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
+                # to \xCF\x80 (\317\200) in UTF-8 adds a byte.  Check for
+                # the expected size.
+                test 221 -eq "$(git cat-file -s i18n)" &&
+                # ...and for the expected translation of bytes.
                 git cat-file commit i18n >actual &&
-                grep "Áéí óú" actual)
+                grep $(printf "\317\200") actual &&
+                # Also make sure the commit does not have the "encoding" header
+                ! grep ^encoding actual)
+'
+
+test_expect_success 'aborting on iso-8859-7' '
 
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
 '
+
+test_expect_success 'preserving iso-8859-7' '
+
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
+       sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
+               (cd new &&
+                git fast-import &&
+                # The commit object, if not re-encoded, is 240 bytes.
+                # Removing the "encoding iso-8859-7\n" header would drop 20
+                # bytes.  Re-encoding the Pi character from \xF0 (\360) in
+                # iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte.
+                # Check for the expected size...
+                test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
+                # ...as well as the expected byte.
+                git cat-file commit i18n-no-recoding >actual &&
+                grep $(printf "\360") actual &&
+                # Also make sure the commit has the "encoding" header
+                grep ^encoding actual)
+'
+
+test_expect_success 'encoding preserved if reencoding fails' '
+
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
+       git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
+       sed "s/wer/i18n-invalid/" iso-8859-7.fi |
+               (cd new &&
+                git fast-import &&
+                git cat-file commit i18n-invalid >actual &&
+                # Make sure the commit still has the encoding header
+                grep ^encoding actual &&
+                # Verify that the commit has the expected size; i.e.
+                # that no bytes were re-encoded to a different encoding.
+                test 252 -eq "$(git cat-file -s i18n-invalid)" &&
+                # ...and check for the original special bytes
+                grep $(printf "\360") actual &&
+                grep $(printf "\377") actual)
+'
+
 test_expect_success 'import/export-marks' '
 
        git checkout -b marks master &&
@@ -224,7 +285,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME
 
 test_expect_success 'setup copies' '
 
-       git config --unset i18n.commitencoding &&
        git checkout -b copy rein &&
        git mv file file3 &&
        git commit -m move1 &&
diff --git a/t/t9350/broken-iso-8859-7-commit-message.txt b/t/t9350/broken-iso-8859-7-commit-message.txt
new file mode 100644 (file)
index 0000000..d06ad75
--- /dev/null
@@ -0,0 +1 @@
+Pi: ð; Invalid: ÿ
\ No newline at end of file
diff --git a/t/t9350/simple-iso-8859-7-commit-message.txt b/t/t9350/simple-iso-8859-7-commit-message.txt
new file mode 100644 (file)
index 0000000..8b3f0c3
--- /dev/null
@@ -0,0 +1 @@
+Pi: ð
\ No newline at end of file