fast-export: do automatic reencoding of commit messages only if requested
authorElijah Newren <newren@gmail.com>
Tue, 14 May 2019 04:31:02 +0000 (21:31 -0700)
committerJunio C Hamano <gitster@pobox.com>
Tue, 14 May 2019 07:48:56 +0000 (16:48 +0900)
Automatic re-encoding of commit messages (and dropping of the encoding
header) hurts attempts to do reversible history rewrites (e.g. sha1sum
<-> sha256sum transitions, some subtree rewrites), and seems
inconsistent with the general principle followed elsewhere in
fast-export of requiring explicit user requests to modify the output
(e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite). Add a
--reencode flag that the user can use to specify, and like other
fast-export flags, default it to 'abort'.

Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Documentation/git-fast-export.txt
builtin/fast-export.c
t/t9350-fast-export.sh
index 64c01ba91884df1ec8e49ddc8fe852f1fb2a9425..11427acdde68e659ca5d443beff037df8c9f3ebb 100644 (file)
@@ -129,6 +129,13 @@ marks the same across runs.
        for intermediary filters (e.g. for rewriting commit messages
        which refer to older commits, or for stripping blobs by id).
 
        for intermediary filters (e.g. for rewriting commit messages
        which refer to older commits, or for stripping blobs by id).
 
+--reencode=(yes|no|abort)::
+       Specify how to handle `encoding` header in commit objects.  When
+       asking to 'abort' (which is the default), this program will die
+       when encountering such a commit object.  With 'yes', the commit
+       message will be reencoded into UTF-8.  With 'no', the original
+       encoding will be preserved.
+
 --refspec::
        Apply the specified refspec to each ref exported. Multiple of them can
        be specified.
 --refspec::
        Apply the specified refspec to each ref exported. Multiple of them can
        be specified.
index 66331fa4010c6c2a66feefd7734bba426f3062af..c22cef3b2faff945148029197a32253540c24f73 100644 (file)
@@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
 static int progress;
 static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
 static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
 static int progress;
 static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
 static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
+static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
 static int fake_missing_tagger;
 static int use_done_feature;
 static int no_data;
 static int fake_missing_tagger;
 static int use_done_feature;
 static int no_data;
@@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
        return 0;
 }
 
        return 0;
 }
 
+static int parse_opt_reencode_mode(const struct option *opt,
+                                  const char *arg, int unset)
+{
+       if (unset) {
+               reencode_mode = REENCODE_ABORT;
+               return 0;
+       }
+
+       switch (git_parse_maybe_bool(arg)) {
+       case 0:
+               reencode_mode = REENCODE_NO;
+               break;
+       case 1:
+               reencode_mode = REENCODE_YES;
+               break;
+       default:
+               if (!strcasecmp(arg, "abort"))
+                       reencode_mode = REENCODE_ABORT;
+               else
+                       return error("Unknown reencoding mode: %s", arg);
+       }
+
+       return 0;
+}
+
 static struct decoration idnums;
 static uint32_t last_idnum;
 
 static struct decoration idnums;
 static uint32_t last_idnum;
 
@@ -633,10 +659,21 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
        }
 
        mark_next_object(&commit->object);
        }
 
        mark_next_object(&commit->object);
-       if (anonymize)
+       if (anonymize) {
                reencoded = anonymize_commit_message(message);
                reencoded = anonymize_commit_message(message);
-       else if (!is_encoding_utf8(encoding))
-               reencoded = reencode_string(message, "UTF-8", encoding);
+       } else if (encoding) {
+               switch(reencode_mode) {
+               case REENCODE_YES:
+                       reencoded = reencode_string(message, "UTF-8", encoding);
+                       break;
+               case REENCODE_NO:
+                       break;
+               case REENCODE_ABORT:
+                       die("Encountered commit-specific encoding %s in commit "
+                           "%s; use --reencode=[yes|no] to handle it",
+                           encoding, oid_to_hex(&commit->object.oid));
+               }
+       }
        if (!commit->parents)
                printf("reset %s\n", refname);
        printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
        if (!commit->parents)
                printf("reset %s\n", refname);
        printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
@@ -1091,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
                OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
                             N_("select handling of tags that tag filtered objects"),
                             parse_opt_tag_of_filtered_mode),
                OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
                             N_("select handling of tags that tag filtered objects"),
                             parse_opt_tag_of_filtered_mode),
+               OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
+                            N_("select handling of commit messages in an alternate encoding"),
+                            parse_opt_reencode_mode),
                OPT_STRING(0, "export-marks", &export_filename, N_("file"),
                             N_("Dump marks to this file")),
                OPT_STRING(0, "import-marks", &import_filename, N_("file"),
                OPT_STRING(0, "export-marks", &export_filename, N_("file"),
                             N_("Dump marks to this file")),
                OPT_STRING(0, "import-marks", &import_filename, N_("file"),
index e2ab8eddc06509a25a25062760ca9ff8cb0000c5..b4004e05c2a72c4daf95c4ce74b2145a9486efc4 100755 (executable)
@@ -94,14 +94,14 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
        test $MUSS = $(git rev-parse --verify refs/tags/muss)
 '
 
        test $MUSS = $(git rev-parse --verify refs/tags/muss)
 '
 
-test_expect_success 'iso-8859-7' '
+test_expect_success 'reencoding iso-8859-7' '
 
        test_when_finished "git reset --hard HEAD~1" &&
        test_config i18n.commitencoding iso-8859-7 &&
        test_tick &&
        echo rosten >file &&
        git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
 
        test_when_finished "git reset --hard HEAD~1" &&
        test_config i18n.commitencoding iso-8859-7 &&
        test_tick &&
        echo rosten >file &&
        git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
-       git fast-export wer^..wer >iso-8859-7.fi &&
+       git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
        sed "s/wer/i18n/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&
        sed "s/wer/i18n/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&
@@ -118,13 +118,45 @@ test_expect_success 'iso-8859-7' '
                 ! grep ^encoding actual)
 '
 
                 ! grep ^encoding actual)
 '
 
+test_expect_success 'aborting on iso-8859-7' '
+
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
+'
+
+test_expect_success 'preserving iso-8859-7' '
+
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
+       sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
+               (cd new &&
+                git fast-import &&
+                # The commit object, if not re-encoded, is 240 bytes.
+                # Removing the "encoding iso-8859-7\n" header would drops 20
+                # bytes.  Re-encoding the Pi character from \xF0 (\360) in
+                # iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte.
+                # Check for the expected size...
+                test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
+                # ...as well as the expected byte.
+                git cat-file commit i18n-no-recoding >actual &&
+                grep $(printf "\360") actual &&
+                # Also make sure the commit has the "encoding" header
+                grep ^encoding actual)
+'
+
 test_expect_success 'encoding preserved if reencoding fails' '
 
        test_when_finished "git reset --hard HEAD~1" &&
        test_config i18n.commitencoding iso-8859-7 &&
        echo rosten >file &&
        git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
 test_expect_success 'encoding preserved if reencoding fails' '
 
        test_when_finished "git reset --hard HEAD~1" &&
        test_config i18n.commitencoding iso-8859-7 &&
        echo rosten >file &&
        git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
-       git fast-export wer^..wer >iso-8859-7.fi &&
+       git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
        sed "s/wer/i18n-invalid/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&
        sed "s/wer/i18n-invalid/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&