git-archimport.perlon commit Merge branch 'jc/forbid-symbolic-ref-d-HEAD' into maint (d664531)
   1#!/usr/bin/perl
   2#
   3# This tool is copyright (c) 2005, Martin Langhoff.
   4# It is released under the Gnu Public License, version 2.
   5#
   6# The basic idea is to walk the output of tla abrowse,
   7# fetch the changesets and apply them.
   8#
   9
  10=head1 Invocation
  11
  12    git archimport [ -h ] [ -v ] [ -o ] [ -a ] [ -f ] [ -T ]
  13        [ -D depth] [ -t tempdir ] <archive>/<branch> [ <archive>/<branch> ]
  14
  15Imports a project from one or more Arch repositories. It will follow branches
  16and repositories within the namespaces defined by the <archive/branch>
  17parameters supplied. If it cannot find the remote branch a merge comes from
  18it will just import it as a regular commit. If it can find it, it will mark it
  19as a merge whenever possible.
  20
  21See man (1) git-archimport for more details.
  22
  23=head1 TODO
  24
  25 - create tag objects instead of ref tags
  26 - audit shell-escaping of filenames
  27 - hide our private tags somewhere smarter
  28 - find a way to make "cat *patches | patch" safe even when patchfiles are missing newlines
  29 - sort and apply patches by graphing ancestry relations instead of just
  30   relying in dates supplied in the changeset itself.
  31   tla ancestry-graph -m could be helpful here...
  32
  33=head1 Devel tricks
  34
  35Add print in front of the shell commands invoked via backticks.
  36
  37=head1 Devel Notes
  38
  39There are several places where Arch and git terminology are intermixed
  40and potentially confused.
  41
  42The notion of a "branch" in git is approximately equivalent to
  43a "archive/category--branch--version" in Arch.  Also, it should be noted
  44that the "--branch" portion of "archive/category--branch--version" is really
  45optional in Arch although not many people (nor tools!) seem to know this.
  46This means that "archive/category--version" is also a valid "branch"
  47in git terms.
  48
  49We always refer to Arch names by their fully qualified variant (which
  50means the "archive" name is prefixed.
  51
  52For people unfamiliar with Arch, an "archive" is the term for "repository",
  53and can contain multiple, unrelated branches.
  54
  55=cut
  56
  57use 5.008;
  58use strict;
  59use warnings;
  60use Getopt::Std;
  61use File::Temp qw(tempdir);
  62use File::Path qw(mkpath rmtree);
  63use File::Basename qw(basename dirname);
  64use Data::Dumper qw/ Dumper /;
  65use IPC::Open2;
  66
  67$SIG{'PIPE'}="IGNORE";
  68$ENV{'TZ'}="UTC";
  69
  70my $git_dir = $ENV{"GIT_DIR"} || ".git";
  71$ENV{"GIT_DIR"} = $git_dir;
  72my $ptag_dir = "$git_dir/archimport/tags";
  73
  74our($opt_h,$opt_f,$opt_v,$opt_T,$opt_t,$opt_D,$opt_a,$opt_o);
  75
  76sub usage() {
  77    print STDERR <<END;
  78usage: git archimport     # fetch/update GIT from Arch
  79       [ -h ] [ -v ] [ -o ] [ -a ] [ -f ] [ -T ] [ -D depth ] [ -t tempdir ]
  80       repository/arch-branch [ repository/arch-branch] ...
  81END
  82    exit(1);
  83}
  84
  85getopts("fThvat:D:") or usage();
  86usage if $opt_h;
  87
  88@ARGV >= 1 or usage();
  89# $arch_branches:
  90# values associated with keys:
  91#   =1 - Arch version / git 'branch' detected via abrowse on a limit
  92#   >1 - Arch version / git 'branch' of an auxiliary branch we've merged
  93my %arch_branches = map { my $branch = $_; $branch =~ s/:[^:]*$//; $branch => 1 } @ARGV;
  94
  95# $branch_name_map:
  96# maps arch branches to git branch names
  97my %branch_name_map = map { m/^(.*):([^:]*)$/; $1 => $2 } grep { m/:/ } @ARGV;
  98
  99$ENV{'TMPDIR'} = $opt_t if $opt_t; # $ENV{TMPDIR} will affect tempdir() calls:
 100my $tmp = tempdir('git-archimport-XXXXXX', TMPDIR => 1, CLEANUP => 1);
 101$opt_v && print "+ Using $tmp as temporary directory\n";
 102
 103unless (-d $git_dir) { # initial import needs empty directory
 104    opendir DIR, '.' or die "Unable to open current directory: $!\n";
 105    while (my $entry = readdir DIR) {
 106        $entry =~ /^\.\.?$/ or
 107            die "Initial import needs an empty current working directory.\n"
 108    }
 109    closedir DIR
 110}
 111
 112my $default_archive;            # default Arch archive
 113my %reachable = ();             # Arch repositories we can access
 114my %unreachable = ();           # Arch repositories we can't access :<
 115my @psets  = ();                # the collection
 116my %psets  = ();                # the collection, by name
 117my %stats  = (                  # Track which strategy we used to import:
 118        get_tag => 0, replay => 0, get_new => 0, get_delta => 0,
 119        simple_changeset => 0, import_or_tag => 0
 120);
 121
 122my %rptags = ();                # my reverse private tags
 123                                # to map a SHA1 to a commitid
 124my $TLA = $ENV{'ARCH_CLIENT'} || 'tla';
 125
 126sub do_abrowse {
 127    my $stage = shift;
 128    while (my ($limit, $level) = each %arch_branches) {
 129        next unless $level == $stage;
 130
 131        open ABROWSE, "$TLA abrowse -fkD --merges $limit |"
 132                                or die "Problems with tla abrowse: $!";
 133
 134        my %ps        = ();         # the current one
 135        my $lastseen  = '';
 136
 137        while (<ABROWSE>) {
 138            chomp;
 139
 140            # first record padded w 8 spaces
 141            if (s/^\s{8}\b//) {
 142                my ($id, $type) = split(m/\s+/, $_, 2);
 143
 144                my %last_ps;
 145                # store the record we just captured
 146                if (%ps && !exists $psets{ $ps{id} }) {
 147                    %last_ps = %ps; # break references
 148                    push (@psets, \%last_ps);
 149                    $psets{ $last_ps{id} } = \%last_ps;
 150                }
 151
 152                my $branch = extract_versionname($id);
 153                %ps = ( id => $id, branch => $branch );
 154                if (%last_ps && ($last_ps{branch} eq $branch)) {
 155                    $ps{parent_id} = $last_ps{id};
 156                }
 157
 158                $arch_branches{$branch} = 1;
 159                $lastseen = 'id';
 160
 161                # deal with types (should work with baz or tla):
 162                if ($type =~ m/\(.*changeset\)/) {
 163                    $ps{type} = 's';
 164                } elsif ($type =~ /\(.*import\)/) {
 165                    $ps{type} = 'i';
 166                } elsif ($type =~ m/\(tag.*?(\S+\@\S+).*?\)/) {
 167                    $ps{type} = 't';
 168                    # read which revision we've tagged when we parse the log
 169                    $ps{tag}  = $1;
 170                } else {
 171                    warn "Unknown type $type";
 172                }
 173
 174                $arch_branches{$branch} = 1;
 175                $lastseen = 'id';
 176            } elsif (s/^\s{10}//) {
 177                # 10 leading spaces or more
 178                # indicate commit metadata
 179
 180                # date
 181                if ($lastseen eq 'id' && m/^(\d{4}-\d\d-\d\d \d\d:\d\d:\d\d)/){
 182                    $ps{date}   = $1;
 183                    $lastseen = 'date';
 184                } elsif ($_ eq 'merges in:') {
 185                    $ps{merges} = [];
 186                    $lastseen = 'merges';
 187                } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
 188                    my $id = $_;
 189                    push (@{$ps{merges}}, $id);
 190
 191                    # aggressive branch finding:
 192                    if ($opt_D) {
 193                        my $branch = extract_versionname($id);
 194                        my $repo = extract_reponame($branch);
 195
 196                        if (archive_reachable($repo) &&
 197                                !defined $arch_branches{$branch}) {
 198                            $arch_branches{$branch} = $stage + 1;
 199                        }
 200                    }
 201                } else {
 202                    warn "more metadata after merges!?: $_\n" unless /^\s*$/;
 203                }
 204            }
 205        }
 206
 207        if (%ps && !exists $psets{ $ps{id} }) {
 208            my %temp = %ps;         # break references
 209            if (@psets && $psets[$#psets]{branch} eq $ps{branch}) {
 210                $temp{parent_id} = $psets[$#psets]{id};
 211            }
 212            push (@psets, \%temp);
 213            $psets{ $temp{id} } = \%temp;
 214        }
 215
 216        close ABROWSE or die "$TLA abrowse failed on $limit\n";
 217    }
 218}                               # end foreach $root
 219
 220do_abrowse(1);
 221my $depth = 2;
 222$opt_D ||= 0;
 223while ($depth <= $opt_D) {
 224    do_abrowse($depth);
 225    $depth++;
 226}
 227
 228## Order patches by time
 229# FIXME see if we can find a more optimal way to do this by graphing
 230# the ancestry data and walking it, that way we won't have to rely on
 231# client-supplied dates
 232@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
 233
 234#print Dumper \@psets;
 235
 236##
 237## TODO cleanup irrelevant patches
 238##      and put an initial import
 239##      or a full tag
 240my $import = 0;
 241unless (-d $git_dir) { # initial import
 242    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
 243        print "Starting import from $psets[0]{id}\n";
 244        `git-init`;
 245        die $! if $?;
 246        $import = 1;
 247    } else {
 248        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
 249    }
 250} else {    # progressing an import
 251    # load the rptags
 252    opendir(DIR, $ptag_dir)
 253        || die "can't opendir: $!";
 254    while (my $file = readdir(DIR)) {
 255        # skip non-interesting-files
 256        next unless -f "$ptag_dir/$file";
 257
 258        # convert first '--' to '/' from old git-archimport to use
 259        # as an archivename/c--b--v private tag
 260        if ($file !~ m!,!) {
 261            my $oldfile = $file;
 262            $file =~ s!--!,!;
 263            print STDERR "converting old tag $oldfile to $file\n";
 264            rename("$ptag_dir/$oldfile", "$ptag_dir/$file") or die $!;
 265        }
 266        my $sha = ptag($file);
 267        chomp $sha;
 268        $rptags{$sha} = $file;
 269    }
 270    closedir DIR;
 271}
 272
 273# process patchsets
 274# extract the Arch repository name (Arch "archive" in Arch-speak)
 275sub extract_reponame {
 276    my $fq_cvbr = shift; # archivename/[[[[category]branch]version]revision]
 277    return (split(/\//, $fq_cvbr))[0];
 278}
 279
 280sub extract_versionname {
 281    my $name = shift;
 282    $name =~ s/--(?:patch|version(?:fix)?|base)-\d+$//;
 283    return $name;
 284}
 285
 286# convert a fully-qualified revision or version to a unique dirname:
 287#   normalperson@yhbt.net-05/mpd--uclinux--1--patch-2
 288# becomes: normalperson@yhbt.net-05,mpd--uclinux--1
 289#
 290# the git notion of a branch is closer to
 291# archive/category--branch--version than archive/category--branch, so we
 292# use this to convert to git branch names.
 293# Also, keep archive names but replace '/' with ',' since it won't require
 294# subdirectories, and is safer than swapping '--' which could confuse
 295# reverse-mapping when dealing with bastard branches that
 296# are just archive/category--version  (no --branch)
 297sub tree_dirname {
 298    my $revision = shift;
 299    my $name = extract_versionname($revision);
 300    $name =~ s#/#,#;
 301    return $name;
 302}
 303
 304# old versions of git-archimport just use the <category--branch> part:
 305sub old_style_branchname {
 306    my $id = shift;
 307    my $ret = safe_pipe_capture($TLA,'parse-package-name','-p',$id);
 308    chomp $ret;
 309    return $ret;
 310}
 311
 312*git_default_branchname = $opt_o ? *old_style_branchname : *tree_dirname;
 313
 314# retrieve default archive, since $branch_name_map keys might not include it
 315sub get_default_archive {
 316    if (!defined $default_archive) {
 317        $default_archive = safe_pipe_capture($TLA,'my-default-archive');
 318        chomp $default_archive;
 319    }
 320    return $default_archive;
 321}
 322
 323sub git_branchname {
 324    my $revision = shift;
 325    my $name = extract_versionname($revision);
 326
 327    if (exists $branch_name_map{$name}) {
 328        return $branch_name_map{$name};
 329
 330    } elsif ($name =~ m#^([^/]*)/(.*)$#
 331             && $1 eq get_default_archive()
 332             && exists $branch_name_map{$2}) {
 333        # the names given in the command-line lacked the archive.
 334        return $branch_name_map{$2};
 335
 336    } else {
 337        return git_default_branchname($revision);
 338    }
 339}
 340
 341sub process_patchset_accurate {
 342    my $ps = shift;
 343
 344    # switch to that branch if we're not already in that branch:
 345    if (-e "$git_dir/refs/heads/$ps->{branch}") {
 346       system('git-checkout','-f',$ps->{branch}) == 0 or die "$! $?\n";
 347
 348       # remove any old stuff that got leftover:
 349       my $rm = safe_pipe_capture('git-ls-files','--others','-z');
 350       rmtree(split(/\0/,$rm)) if $rm;
 351    }
 352
 353    # Apply the import/changeset/merge into the working tree
 354    my $dir = sync_to_ps($ps);
 355    # read the new log entry:
 356    my @commitlog = safe_pipe_capture($TLA,'cat-log','-d',$dir,$ps->{id});
 357    die "Error in cat-log: $!" if $?;
 358    chomp @commitlog;
 359
 360    # grab variables we want from the log, new fields get added to $ps:
 361    # (author, date, email, summary, message body ...)
 362    parselog($ps, \@commitlog);
 363
 364    if ($ps->{id} =~ /--base-0$/ && $ps->{id} ne $psets[0]{id}) {
 365        # this should work when importing continuations
 366        if ($ps->{tag} && (my $branchpoint = eval { ptag($ps->{tag}) })) {
 367
 368            # find where we are supposed to branch from
 369            if (! -e "$git_dir/refs/heads/$ps->{branch}") {
 370                system('git-branch',$ps->{branch},$branchpoint) == 0 or die "$! $?\n";
 371
 372                # We trust Arch with the fact that this is just a tag,
 373                # and it does not affect the state of the tree, so
 374                # we just tag and move on.  If the user really wants us
 375                # to consolidate more branches into one, don't tag because
 376                # the tag name would be already taken.
 377                tag($ps->{id}, $branchpoint);
 378                ptag($ps->{id}, $branchpoint);
 379                print " * Tagged $ps->{id} at $branchpoint\n";
 380            }
 381            system('git-checkout','-f',$ps->{branch}) == 0 or die "$! $?\n";
 382
 383            # remove any old stuff that got leftover:
 384            my $rm = safe_pipe_capture('git-ls-files','--others','-z');
 385            rmtree(split(/\0/,$rm)) if $rm;
 386            return 0;
 387        } else {
 388            warn "Tagging from unknown id unsupported\n" if $ps->{tag};
 389        }
 390        # allow multiple bases/imports here since Arch supports cherry-picks
 391        # from unrelated trees
 392    }
 393
 394    # update the index with all the changes we got
 395    system('git-diff-files --name-only -z | '.
 396            'git-update-index --remove -z --stdin') == 0 or die "$! $?\n";
 397    system('git-ls-files --others -z | '.
 398            'git-update-index --add -z --stdin') == 0 or die "$! $?\n";
 399    return 1;
 400}
 401
 402# the native changeset processing strategy.  This is very fast, but
 403# does not handle permissions or any renames involving directories
 404sub process_patchset_fast {
 405    my $ps = shift;
 406    #
 407    # create the branch if needed
 408    #
 409    if ($ps->{type} eq 'i' && !$import) {
 410        die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
 411    }
 412
 413    unless ($import) { # skip for import
 414        if ( -e "$git_dir/refs/heads/$ps->{branch}") {
 415            # we know about this branch
 416            system('git-checkout',$ps->{branch});
 417        } else {
 418            # new branch! we need to verify a few things
 419            die "Branch on a non-tag!" unless $ps->{type} eq 't';
 420            my $branchpoint = ptag($ps->{tag});
 421            die "Tagging from unknown id unsupported: $ps->{tag}"
 422                unless $branchpoint;
 423
 424            # find where we are supposed to branch from
 425            if (! -e "$git_dir/refs/heads/$ps->{branch}") {
 426                system('git-branch',$ps->{branch},$branchpoint) == 0 or die "$! $?\n";
 427
 428                # We trust Arch with the fact that this is just a tag,
 429                # and it does not affect the state of the tree, so
 430                # we just tag and move on.  If the user really wants us
 431                # to consolidate more branches into one, don't tag because
 432                # the tag name would be already taken.
 433                tag($ps->{id}, $branchpoint);
 434                ptag($ps->{id}, $branchpoint);
 435                print " * Tagged $ps->{id} at $branchpoint\n";
 436            }
 437            system('git-checkout',$ps->{branch}) == 0 or die "$! $?\n";
 438            return 0;
 439        }
 440        die $! if $?;
 441    }
 442
 443    #
 444    # Apply the import/changeset/merge into the working tree
 445    #
 446    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 447        apply_import($ps) or die $!;
 448        $stats{import_or_tag}++;
 449        $import=0;
 450    } elsif ($ps->{type} eq 's') {
 451        apply_cset($ps);
 452        $stats{simple_changeset}++;
 453    }
 454
 455    #
 456    # prepare update git's index, based on what arch knows
 457    # about the pset, resolve parents, etc
 458    #
 459
 460    my @commitlog = safe_pipe_capture($TLA,'cat-archive-log',$ps->{id});
 461    die "Error in cat-archive-log: $!" if $?;
 462
 463    parselog($ps,\@commitlog);
 464
 465    # imports don't give us good info
 466    # on added files. Shame on them
 467    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 468        system('git-ls-files --deleted -z | '.
 469                'git-update-index --remove -z --stdin') == 0 or die "$! $?\n";
 470        system('git-ls-files --others -z | '.
 471                'git-update-index --add -z --stdin') == 0 or die "$! $?\n";
 472    }
 473
 474    # TODO: handle removed_directories and renamed_directories:
 475
 476    if (my $del = $ps->{removed_files}) {
 477        unlink @$del;
 478        while (@$del) {
 479            my @slice = splice(@$del, 0, 100);
 480            system('git-update-index','--remove','--',@slice) == 0 or
 481                            die "Error in git-update-index --remove: $! $?\n";
 482        }
 483    }
 484
 485    if (my $ren = $ps->{renamed_files}) {                # renamed
 486        if (@$ren % 2) {
 487            die "Odd number of entries in rename!?";
 488        }
 489
 490        while (@$ren) {
 491            my $from = shift @$ren;
 492            my $to   = shift @$ren;
 493
 494            unless (-d dirname($to)) {
 495                mkpath(dirname($to)); # will die on err
 496            }
 497            # print "moving $from $to";
 498            rename($from, $to) or die "Error renaming '$from' '$to': $!\n";
 499            system('git-update-index','--remove','--',$from) == 0 or
 500                            die "Error in git-update-index --remove: $! $?\n";
 501            system('git-update-index','--add','--',$to) == 0 or
 502                            die "Error in git-update-index --add: $! $?\n";
 503        }
 504    }
 505
 506    if (my $add = $ps->{new_files}) {
 507        while (@$add) {
 508            my @slice = splice(@$add, 0, 100);
 509            system('git-update-index','--add','--',@slice) == 0 or
 510                            die "Error in git-update-index --add: $! $?\n";
 511        }
 512    }
 513
 514    if (my $mod = $ps->{modified_files}) {
 515        while (@$mod) {
 516            my @slice = splice(@$mod, 0, 100);
 517            system('git-update-index','--',@slice) == 0 or
 518                            die "Error in git-update-index: $! $?\n";
 519        }
 520    }
 521    return 1; # we successfully applied the changeset
 522}
 523
 524if ($opt_f) {
 525    print "Will import patchsets using the fast strategy\n",
 526            "Renamed directories and permission changes will be missed\n";
 527    *process_patchset = *process_patchset_fast;
 528} else {
 529    print "Using the default (accurate) import strategy.\n",
 530            "Things may be a bit slow\n";
 531    *process_patchset = *process_patchset_accurate;
 532}
 533
 534foreach my $ps (@psets) {
 535    # process patchsets
 536    $ps->{branch} = git_branchname($ps->{id});
 537
 538    #
 539    # ensure we have a clean state
 540    #
 541    if (my $dirty = `git-diff-files`) {
 542        die "Unclean tree when about to process $ps->{id} " .
 543            " - did we fail to commit cleanly before?\n$dirty";
 544    }
 545    die $! if $?;
 546
 547    #
 548    # skip commits already in repo
 549    #
 550    if (ptag($ps->{id})) {
 551      $opt_v && print " * Skipping already imported: $ps->{id}\n";
 552      next;
 553    }
 554
 555    print " * Starting to work on $ps->{id}\n";
 556
 557    process_patchset($ps) or next;
 558
 559    # warn "errors when running git-update-index! $!";
 560    my $tree = `git-write-tree`;
 561    die "cannot write tree $!" if $?;
 562    chomp $tree;
 563
 564    #
 565    # Who's your daddy?
 566    #
 567    my @par;
 568    if ( -e "$git_dir/refs/heads/$ps->{branch}") {
 569        if (open HEAD, "<","$git_dir/refs/heads/$ps->{branch}") {
 570            my $p = <HEAD>;
 571            close HEAD;
 572            chomp $p;
 573            push @par, '-p', $p;
 574        } else {
 575            if ($ps->{type} eq 's') {
 576                warn "Could not find the right head for the branch $ps->{branch}";
 577            }
 578        }
 579    }
 580
 581    if ($ps->{merges}) {
 582        push @par, find_parents($ps);
 583    }
 584
 585    #
 586    # Commit, tag and clean state
 587    #
 588    $ENV{TZ}                  = 'GMT';
 589    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
 590    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
 591    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
 592    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
 593    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
 594    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
 595
 596    my $pid = open2(*READER, *WRITER,'git-commit-tree',$tree,@par)
 597        or die $!;
 598    print WRITER $ps->{summary},"\n\n";
 599
 600    # only print message if it's not empty, to avoid a spurious blank line;
 601    # also append an extra newline, so there's a blank line before the
 602    # following "git-archimport-id:" line.
 603    print WRITER $ps->{message},"\n\n" if ($ps->{message} ne "");
 604
 605    # make it easy to backtrack and figure out which Arch revision this was:
 606    print WRITER 'git-archimport-id: ',$ps->{id},"\n";
 607
 608    close WRITER;
 609    my $commitid = <READER>;    # read
 610    chomp $commitid;
 611    close READER;
 612    waitpid $pid,0;             # close;
 613
 614    if (length $commitid != 40) {
 615        die "Something went wrong with the commit! $! $commitid";
 616    }
 617    #
 618    # Update the branch
 619    #
 620    open  HEAD, ">","$git_dir/refs/heads/$ps->{branch}";
 621    print HEAD $commitid;
 622    close HEAD;
 623    system('git-update-ref', 'HEAD', "$ps->{branch}");
 624
 625    # tag accordingly
 626    ptag($ps->{id}, $commitid); # private tag
 627    if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
 628        tag($ps->{id}, $commitid);
 629    }
 630    print " * Committed $ps->{id}\n";
 631    print "   + tree   $tree\n";
 632    print "   + commit $commitid\n";
 633    $opt_v && print "   + commit date is  $ps->{date} \n";
 634    $opt_v && print "   + parents:  ",join(' ',@par),"\n";
 635}
 636
 637if ($opt_v) {
 638    foreach (sort keys %stats) {
 639        print" $_: $stats{$_}\n";
 640    }
 641}
 642exit 0;
 643
 644# used by the accurate strategy:
 645sub sync_to_ps {
 646    my $ps = shift;
 647    my $tree_dir = $tmp.'/'.tree_dirname($ps->{id});
 648
 649    $opt_v && print "sync_to_ps($ps->{id}) method: ";
 650
 651    if (-d $tree_dir) {
 652        if ($ps->{type} eq 't') {
 653            $opt_v && print "get (tag)\n";
 654            # looks like a tag-only or (worse,) a mixed tags/changeset branch,
 655            # can't rely on replay to work correctly on these
 656            rmtree($tree_dir);
 657            safe_pipe_capture($TLA,'get','--no-pristine',$ps->{id},$tree_dir);
 658            $stats{get_tag}++;
 659        } else {
 660                my $tree_id = arch_tree_id($tree_dir);
 661                if ($ps->{parent_id} && ($ps->{parent_id} eq $tree_id)) {
 662                    # the common case (hopefully)
 663                    $opt_v && print "replay\n";
 664                    safe_pipe_capture($TLA,'replay','-d',$tree_dir,$ps->{id});
 665                    $stats{replay}++;
 666                } else {
 667                    # getting one tree is usually faster than getting two trees
 668                    # and applying the delta ...
 669                    rmtree($tree_dir);
 670                    $opt_v && print "apply-delta\n";
 671                    safe_pipe_capture($TLA,'get','--no-pristine',
 672                                        $ps->{id},$tree_dir);
 673                    $stats{get_delta}++;
 674                }
 675        }
 676    } else {
 677        # new branch work
 678        $opt_v && print "get (new tree)\n";
 679        safe_pipe_capture($TLA,'get','--no-pristine',$ps->{id},$tree_dir);
 680        $stats{get_new}++;
 681    }
 682
 683    # added -I flag to rsync since we're going to fast! AIEEEEE!!!!
 684    system('rsync','-aI','--delete','--exclude',$git_dir,
 685#               '--exclude','.arch-inventory',
 686                '--exclude','.arch-ids','--exclude','{arch}',
 687                '--exclude','+*','--exclude',',*',
 688                "$tree_dir/",'./') == 0 or die "Cannot rsync $tree_dir: $! $?";
 689    return $tree_dir;
 690}
 691
 692sub apply_import {
 693    my $ps = shift;
 694    my $bname = git_branchname($ps->{id});
 695
 696    mkpath($tmp);
 697
 698    safe_pipe_capture($TLA,'get','-s','--no-pristine',$ps->{id},"$tmp/import");
 699    die "Cannot get import: $!" if $?;
 700    system('rsync','-aI','--delete', '--exclude',$git_dir,
 701                '--exclude','.arch-ids','--exclude','{arch}',
 702                "$tmp/import/", './');
 703    die "Cannot rsync import:$!" if $?;
 704
 705    rmtree("$tmp/import");
 706    die "Cannot remove tempdir: $!" if $?;
 707
 708
 709    return 1;
 710}
 711
 712sub apply_cset {
 713    my $ps = shift;
 714
 715    mkpath($tmp);
 716
 717    # get the changeset
 718    safe_pipe_capture($TLA,'get-changeset',$ps->{id},"$tmp/changeset");
 719    die "Cannot get changeset: $!" if $?;
 720
 721    # apply patches
 722    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
 723        # this can be sped up considerably by doing
 724        #    (find | xargs cat) | patch
 725        # but that can get mucked up by patches
 726        # with missing trailing newlines or the standard
 727        # 'missing newline' flag in the patch - possibly
 728        # produced with an old/buggy diff.
 729        # slow and safe, we invoke patch once per patchfile
 730        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
 731        die "Problem applying patches! $!" if $?;
 732    }
 733
 734    # apply changed binary files
 735    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
 736        foreach my $mod (@modified) {
 737            chomp $mod;
 738            my $orig = $mod;
 739            $orig =~ s/\.modified$//; # lazy
 740            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
 741            #print "rsync -p '$mod' '$orig'";
 742            system('rsync','-p',$mod,"./$orig");
 743            die "Problem applying binary changes! $!" if $?;
 744        }
 745    }
 746
 747    # bring in new files
 748    system('rsync','-aI','--exclude',$git_dir,
 749                '--exclude','.arch-ids',
 750                '--exclude', '{arch}',
 751                "$tmp/changeset/new-files-archive/",'./');
 752
 753    # deleted files are hinted from the commitlog processing
 754
 755    rmtree("$tmp/changeset");
 756}
 757
 758
 759# =for reference
 760# notes: *-files/-directories keys cannot have spaces, they're always
 761# pika-escaped.  Everything after the first newline
 762# A log entry looks like:
 763# Revision: moodle-org--moodle--1.3.3--patch-15
 764# Archive: arch-eduforge@catalyst.net.nz--2004
 765# Creator: Penny Leach <penny@catalyst.net.nz>
 766# Date: Wed May 25 14:15:34 NZST 2005
 767# Standard-date: 2005-05-25 02:15:34 GMT
 768# New-files: lang/de/.arch-ids/block_glossary_random.php.id
 769#     lang/de/.arch-ids/block_html.php.id
 770# New-directories: lang/de/help/questionnaire
 771#     lang/de/help/questionnaire/.arch-ids
 772# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
 773#    db_sears.sql db/db_sears.sql
 774# Removed-files: lang/be/docs/.arch-ids/release.html.id
 775#     lang/be/docs/.arch-ids/releaseold.html.id
 776# Modified-files: admin/cron.php admin/delete.php
 777#     admin/editor.html backup/lib.php backup/restore.php
 778# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
 779# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
 780#   summary can be multiline with a leading space just like the above fields
 781# Keywords:
 782#
 783# Updating yadda tadda tadda madda
 784sub parselog {
 785    my ($ps, $log) = @_;
 786    my $key = undef;
 787
 788    # headers we want that contain filenames:
 789    my %want_headers = (
 790        new_files => 1,
 791        modified_files => 1,
 792        renamed_files => 1,
 793        renamed_directories => 1,
 794        removed_files => 1,
 795        removed_directories => 1,
 796    );
 797
 798    chomp (@$log);
 799    while ($_ = shift @$log) {
 800        if (/^Continuation-of:\s*(.*)/) {
 801            $ps->{tag} = $1;
 802            $key = undef;
 803        } elsif (/^Summary:\s*(.*)$/ ) {
 804            # summary can be multiline as long as it has a leading space.
 805            # we squeeze it onto a single line, though.
 806            $ps->{summary} = [ $1 ];
 807            $key = 'summary';
 808        } elsif (/^Creator: (.*)\s*<([^\>]+)>/) {
 809            $ps->{author} = $1;
 810            $ps->{email} = $2;
 811            $key = undef;
 812        # any *-files or *-directories can be read here:
 813        } elsif (/^([A-Z][a-z\-]+):\s*(.*)$/) {
 814            my $val = $2;
 815            $key = lc $1;
 816            $key =~ tr/-/_/; # too lazy to quote :P
 817            if ($want_headers{$key}) {
 818                push @{$ps->{$key}}, split(/\s+/, $val);
 819            } else {
 820                $key = undef;
 821            }
 822        } elsif (/^$/) {
 823            last; # remainder of @$log that didn't get shifted off is message
 824        } elsif ($key) {
 825            if (/^\s+(.*)$/) {
 826                if ($key eq 'summary') {
 827                    push @{$ps->{$key}}, $1;
 828                } else { # files/directories:
 829                    push @{$ps->{$key}}, split(/\s+/, $1);
 830                }
 831            } else {
 832                $key = undef;
 833            }
 834        }
 835    }
 836
 837    # drop leading empty lines from the log message
 838    while (@$log && $log->[0] eq '') {
 839        shift @$log;
 840    }
 841    if (exists $ps->{summary} && @{$ps->{summary}}) {
 842        $ps->{summary} = join(' ', @{$ps->{summary}});
 843    }
 844    elsif (@$log == 0) {
 845        $ps->{summary} = 'empty commit message';
 846    } else {
 847        $ps->{summary} = $log->[0] . '...';
 848    }
 849    $ps->{message} = join("\n",@$log);
 850
 851    # skip Arch control files, unescape pika-escaped files
 852    foreach my $k (keys %want_headers) {
 853        next unless (defined $ps->{$k});
 854        my @tmp = ();
 855        foreach my $t (@{$ps->{$k}}) {
 856           next unless length ($t);
 857           next if $t =~ m!\{arch\}/!;
 858           next if $t =~ m!\.arch-ids/!;
 859           # should we skip this?
 860           next if $t =~ m!\.arch-inventory$!;
 861           # tla cat-archive-log will give us filenames with spaces as file\(sp)name - why?
 862           # we can assume that any filename with \ indicates some pika escaping that we want to get rid of.
 863           if ($t =~ /\\/ ){
 864               $t = (safe_pipe_capture($TLA,'escape','--unescaped',$t))[0];
 865           }
 866           push @tmp, $t;
 867        }
 868        $ps->{$k} = \@tmp;
 869    }
 870}
 871
 872# write/read a tag
 873sub tag {
 874    my ($tag, $commit) = @_;
 875
 876    if ($opt_o) {
 877        $tag =~ s|/|--|g;
 878    } else {
 879        my $patchname = $tag;
 880        $patchname =~ s/.*--//;
 881        $tag = git_branchname ($tag) . '--' . $patchname;
 882    }
 883
 884    if ($commit) {
 885        open(C,">","$git_dir/refs/tags/$tag")
 886            or die "Cannot create tag $tag: $!\n";
 887        print C "$commit\n"
 888            or die "Cannot write tag $tag: $!\n";
 889        close(C)
 890            or die "Cannot write tag $tag: $!\n";
 891        print " * Created tag '$tag' on '$commit'\n" if $opt_v;
 892    } else {                    # read
 893        open(C,"<","$git_dir/refs/tags/$tag")
 894            or die "Cannot read tag $tag: $!\n";
 895        $commit = <C>;
 896        chomp $commit;
 897        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 898        close(C)
 899            or die "Cannot read tag $tag: $!\n";
 900        return $commit;
 901    }
 902}
 903
 904# write/read a private tag
 905# reads fail softly if the tag isn't there
 906sub ptag {
 907    my ($tag, $commit) = @_;
 908
 909    # don't use subdirs for tags yet, it could screw up other porcelains
 910    $tag =~ s|/|,|g;
 911
 912    my $tag_file = "$ptag_dir/$tag";
 913    my $tag_branch_dir = dirname($tag_file);
 914    mkpath($tag_branch_dir) unless (-d $tag_branch_dir);
 915
 916    if ($commit) {              # write
 917        open(C,">",$tag_file)
 918            or die "Cannot create tag $tag: $!\n";
 919        print C "$commit\n"
 920            or die "Cannot write tag $tag: $!\n";
 921        close(C)
 922            or die "Cannot write tag $tag: $!\n";
 923        $rptags{$commit} = $tag
 924            unless $tag =~ m/--base-0$/;
 925    } else {                    # read
 926        # if the tag isn't there, return 0
 927        unless ( -s $tag_file) {
 928            return 0;
 929        }
 930        open(C,"<",$tag_file)
 931            or die "Cannot read tag $tag: $!\n";
 932        $commit = <C>;
 933        chomp $commit;
 934        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 935        close(C)
 936            or die "Cannot read tag $tag: $!\n";
 937        unless (defined $rptags{$commit}) {
 938            $rptags{$commit} = $tag;
 939        }
 940        return $commit;
 941    }
 942}
 943
 944sub find_parents {
 945    #
 946    # Identify what branches are merging into me
 947    # and whether we are fully merged
 948    # git-merge-base <headsha> <headsha> should tell
 949    # me what the base of the merge should be
 950    #
 951    my $ps = shift;
 952
 953    my %branches; # holds an arrayref per branch
 954                  # the arrayref contains a list of
 955                  # merged patches between the base
 956                  # of the merge and the current head
 957
 958    my @parents;  # parents found for this commit
 959
 960    # simple loop to split the merges
 961    # per branch
 962    foreach my $merge (@{$ps->{merges}}) {
 963        my $branch = git_branchname($merge);
 964        unless (defined $branches{$branch} ){
 965            $branches{$branch} = [];
 966        }
 967        push @{$branches{$branch}}, $merge;
 968    }
 969
 970    #
 971    # foreach branch find a merge base and walk it to the
 972    # head where we are, collecting the merged patchsets that
 973    # Arch has recorded. Keep that in @have
 974    # Compare that with the commits on the other branch
 975    # between merge-base and the tip of the branch (@need)
 976    # and see if we have a series of consecutive patches
 977    # starting from the merge base. The tip of the series
 978    # of consecutive patches merged is our new parent for
 979    # that branch.
 980    #
 981    foreach my $branch (keys %branches) {
 982
 983        # check that we actually know about the branch
 984        next unless -e "$git_dir/refs/heads/$branch";
 985
 986        my $mergebase = `git-merge-base $branch $ps->{branch}`;
 987        if ($?) {
 988            # Don't die here, Arch supports one-way cherry-picking
 989            # between branches with no common base (or any relationship
 990            # at all beforehand)
 991            warn "Cannot find merge base for $branch and $ps->{branch}";
 992            next;
 993        }
 994        chomp $mergebase;
 995
 996        # now walk up to the mergepoint collecting what patches we have
 997        my $branchtip = git_rev_parse($ps->{branch});
 998        my @ancestors = `git-rev-list --topo-order $branchtip ^$mergebase`;
 999        my %have; # collected merges this branch has
1000        foreach my $merge (@{$ps->{merges}}) {
1001            $have{$merge} = 1;
1002        }
1003        my %ancestorshave;
1004        foreach my $par (@ancestors) {
1005            $par = commitid2pset($par);
1006            if (defined $par->{merges}) {
1007                foreach my $merge (@{$par->{merges}}) {
1008                    $ancestorshave{$merge}=1;
1009                }
1010            }
1011        }
1012        # print "++++ Merges in $ps->{id} are....\n";
1013        # my @have = sort keys %have;   print Dumper(\@have);
1014
1015        # merge what we have with what ancestors have
1016        %have = (%have, %ancestorshave);
1017
1018        # see what the remote branch has - these are the merges we
1019        # will want to have in a consecutive series from the mergebase
1020        my $otherbranchtip = git_rev_parse($branch);
1021        my @needraw = `git-rev-list --topo-order $otherbranchtip ^$mergebase`;
1022        my @need;
1023        foreach my $needps (@needraw) {         # get the psets
1024            $needps = commitid2pset($needps);
1025            # git-rev-list will also
1026            # list commits merged in via earlier
1027            # merges. we are only interested in commits
1028            # from the branch we're looking at
1029            if ($branch eq $needps->{branch}) {
1030                push @need, $needps->{id};
1031            }
1032        }
1033
1034        # print "++++ Merges from $branch we want are....\n";
1035        # print Dumper(\@need);
1036
1037        my $newparent;
1038        while (my $needed_commit = pop @need) {
1039            if ($have{$needed_commit}) {
1040                $newparent = $needed_commit;
1041            } else {
1042                last; # break out of the while
1043            }
1044        }
1045        if ($newparent) {
1046            push @parents, $newparent;
1047        }
1048
1049
1050    } # end foreach branch
1051
1052    # prune redundant parents
1053    my %parents;
1054    foreach my $p (@parents) {
1055        $parents{$p} = 1;
1056    }
1057    foreach my $p (@parents) {
1058        next unless exists $psets{$p}{merges};
1059        next unless ref    $psets{$p}{merges};
1060        my @merges = @{$psets{$p}{merges}};
1061        foreach my $merge (@merges) {
1062            if ($parents{$merge}) {
1063                delete $parents{$merge};
1064            }
1065        }
1066    }
1067
1068    @parents = ();
1069    foreach (keys %parents) {
1070        push @parents, '-p', ptag($_);
1071    }
1072    return @parents;
1073}
1074
1075sub git_rev_parse {
1076    my $name = shift;
1077    my $val  = `git-rev-parse $name`;
1078    die "Error: git-rev-parse $name" if $?;
1079    chomp $val;
1080    return $val;
1081}
1082
1083# resolve a SHA1 to a known patchset
1084sub commitid2pset {
1085    my $commitid = shift;
1086    chomp $commitid;
1087    my $name = $rptags{$commitid}
1088        || die "Cannot find reverse tag mapping for $commitid";
1089    $name =~ s|,|/|;
1090    my $ps   = $psets{$name}
1091        || (print Dumper(sort keys %psets)) && die "Cannot find patchset for $name";
1092    return $ps;
1093}
1094
1095
1096# an alternative to `command` that allows input to be passed as an array
1097# to work around shell problems with weird characters in arguments
1098sub safe_pipe_capture {
1099    my @output;
1100    if (my $pid = open my $child, '-|') {
1101        @output = (<$child>);
1102        close $child or die join(' ',@_).": $! $?";
1103    } else {
1104        exec(@_) or die "$! $?"; # exec() can fail the executable can't be found
1105    }
1106    return wantarray ? @output : join('',@output);
1107}
1108
1109# `tla logs -rf -d <dir> | head -n1` or `baz tree-id <dir>`
1110sub arch_tree_id {
1111    my $dir = shift;
1112    chomp( my $ret = (safe_pipe_capture($TLA,'logs','-rf','-d',$dir))[0] );
1113    return $ret;
1114}
1115
1116sub archive_reachable {
1117    my $archive = shift;
1118    return 1 if $reachable{$archive};
1119    return 0 if $unreachable{$archive};
1120
1121    if (system "$TLA whereis-archive $archive >/dev/null") {
1122        if ($opt_a && (system($TLA,'register-archive',
1123                      "http://mirrors.sourcecontrol.net/$archive") == 0)) {
1124            $reachable{$archive} = 1;
1125            return 1;
1126        }
1127        print STDERR "Archive is unreachable: $archive\n";
1128        $unreachable{$archive} = 1;
1129        return 0;
1130    } else {
1131        $reachable{$archive} = 1;
1132        return 1;
1133    }
1134}