git-archimport.perlon commit t3901: Use ISO8859-1 instead of ISO-8859-1 for backward compatibility (1e6bca0)
   1#!/usr/bin/perl -w
   2#
   3# This tool is copyright (c) 2005, Martin Langhoff.
   4# It is released under the Gnu Public License, version 2.
   5#
   6# The basic idea is to walk the output of tla abrowse,
   7# fetch the changesets and apply them.
   8#
   9
  10=head1 Invocation
  11
  12    git archimport [ -h ] [ -v ] [ -o ] [ -a ] [ -f ] [ -T ]
  13        [ -D depth] [ -t tempdir ] <archive>/<branch> [ <archive>/<branch> ]
  14
  15Imports a project from one or more Arch repositories. It will follow branches
  16and repositories within the namespaces defined by the <archive/branch>
  17parameters supplied. If it cannot find the remote branch a merge comes from
  18it will just import it as a regular commit. If it can find it, it will mark it
  19as a merge whenever possible.
  20
  21See man (1) git-archimport for more details.
  22
  23=head1 TODO
  24
  25 - create tag objects instead of ref tags
  26 - audit shell-escaping of filenames
  27 - hide our private tags somewhere smarter
  28 - find a way to make "cat *patches | patch" safe even when patchfiles are missing newlines
  29 - sort and apply patches by graphing ancestry relations instead of just
  30   relying in dates supplied in the changeset itself.
  31   tla ancestry-graph -m could be helpful here...
  32
  33=head1 Devel tricks
  34
  35Add print in front of the shell commands invoked via backticks.
  36
  37=head1 Devel Notes
  38
  39There are several places where Arch and git terminology are intermixed
  40and potentially confused.
  41
  42The notion of a "branch" in git is approximately equivalent to
  43a "archive/category--branch--version" in Arch.  Also, it should be noted
  44that the "--branch" portion of "archive/category--branch--version" is really
  45optional in Arch although not many people (nor tools!) seem to know this.
  46This means that "archive/category--version" is also a valid "branch"
  47in git terms.
  48
  49We always refer to Arch names by their fully qualified variant (which
  50means the "archive" name is prefixed.
  51
  52For people unfamiliar with Arch, an "archive" is the term for "repository",
  53and can contain multiple, unrelated branches.
  54
  55=cut
  56
  57use strict;
  58use warnings;
  59use Getopt::Std;
  60use File::Temp qw(tempdir);
  61use File::Path qw(mkpath rmtree);
  62use File::Basename qw(basename dirname);
  63use Data::Dumper qw/ Dumper /;
  64use IPC::Open2;
  65
  66$SIG{'PIPE'}="IGNORE";
  67$ENV{'TZ'}="UTC";
  68
  69my $git_dir = $ENV{"GIT_DIR"} || ".git";
  70$ENV{"GIT_DIR"} = $git_dir;
  71my $ptag_dir = "$git_dir/archimport/tags";
  72
  73our($opt_h,$opt_f,$opt_v,$opt_T,$opt_t,$opt_D,$opt_a,$opt_o);
  74
  75sub usage() {
  76    print STDERR <<END;
  77Usage: git archimport     # fetch/update GIT from Arch
  78       [ -h ] [ -v ] [ -o ] [ -a ] [ -f ] [ -T ] [ -D depth ] [ -t tempdir ]
  79       repository/arch-branch [ repository/arch-branch] ...
  80END
  81    exit(1);
  82}
  83
  84getopts("fThvat:D:") or usage();
  85usage if $opt_h;
  86
  87@ARGV >= 1 or usage();
  88# $arch_branches:
  89# values associated with keys:
  90#   =1 - Arch version / git 'branch' detected via abrowse on a limit
  91#   >1 - Arch version / git 'branch' of an auxiliary branch we've merged
  92my %arch_branches = map { my $branch = $_; $branch =~ s/:[^:]*$//; $branch => 1 } @ARGV;
  93
  94# $branch_name_map:
  95# maps arch branches to git branch names
  96my %branch_name_map = map { m/^(.*):([^:]*)$/; $1 => $2 } grep { m/:/ } @ARGV;
  97
  98$ENV{'TMPDIR'} = $opt_t if $opt_t; # $ENV{TMPDIR} will affect tempdir() calls:
  99my $tmp = tempdir('git-archimport-XXXXXX', TMPDIR => 1, CLEANUP => 1);
 100$opt_v && print "+ Using $tmp as temporary directory\n";
 101
 102unless (-d $git_dir) { # initial import needs empty directory
 103    opendir DIR, '.' or die "Unable to open current directory: $!\n";
 104    while (my $entry = readdir DIR) {
 105        $entry =~ /^\.\.?$/ or
 106            die "Initial import needs an empty current working directory.\n"
 107    }
 108    closedir DIR
 109}
 110
 111my $default_archive;            # default Arch archive
 112my %reachable = ();             # Arch repositories we can access
 113my %unreachable = ();           # Arch repositories we can't access :<
 114my @psets  = ();                # the collection
 115my %psets  = ();                # the collection, by name
 116my %stats  = (                  # Track which strategy we used to import:
 117        get_tag => 0, replay => 0, get_new => 0, get_delta => 0,
 118        simple_changeset => 0, import_or_tag => 0
 119);
 120
 121my %rptags = ();                # my reverse private tags
 122                                # to map a SHA1 to a commitid
 123my $TLA = $ENV{'ARCH_CLIENT'} || 'tla';
 124
 125sub do_abrowse {
 126    my $stage = shift;
 127    while (my ($limit, $level) = each %arch_branches) {
 128        next unless $level == $stage;
 129
 130        open ABROWSE, "$TLA abrowse -fkD --merges $limit |"
 131                                or die "Problems with tla abrowse: $!";
 132
 133        my %ps        = ();         # the current one
 134        my $lastseen  = '';
 135
 136        while (<ABROWSE>) {
 137            chomp;
 138
 139            # first record padded w 8 spaces
 140            if (s/^\s{8}\b//) {
 141                my ($id, $type) = split(m/\s+/, $_, 2);
 142
 143                my %last_ps;
 144                # store the record we just captured
 145                if (%ps && !exists $psets{ $ps{id} }) {
 146                    %last_ps = %ps; # break references
 147                    push (@psets, \%last_ps);
 148                    $psets{ $last_ps{id} } = \%last_ps;
 149                }
 150
 151                my $branch = extract_versionname($id);
 152                %ps = ( id => $id, branch => $branch );
 153                if (%last_ps && ($last_ps{branch} eq $branch)) {
 154                    $ps{parent_id} = $last_ps{id};
 155                }
 156
 157                $arch_branches{$branch} = 1;
 158                $lastseen = 'id';
 159
 160                # deal with types (should work with baz or tla):
 161                if ($type =~ m/\(.*changeset\)/) {
 162                    $ps{type} = 's';
 163                } elsif ($type =~ /\(.*import\)/) {
 164                    $ps{type} = 'i';
 165                } elsif ($type =~ m/\(tag.*?(\S+\@\S+).*?\)/) {
 166                    $ps{type} = 't';
 167                    # read which revision we've tagged when we parse the log
 168                    $ps{tag}  = $1;
 169                } else {
 170                    warn "Unknown type $type";
 171                }
 172
 173                $arch_branches{$branch} = 1;
 174                $lastseen = 'id';
 175            } elsif (s/^\s{10}//) {
 176                # 10 leading spaces or more
 177                # indicate commit metadata
 178
 179                # date
 180                if ($lastseen eq 'id' && m/^(\d{4}-\d\d-\d\d \d\d:\d\d:\d\d)/){
 181                    $ps{date}   = $1;
 182                    $lastseen = 'date';
 183                } elsif ($_ eq 'merges in:') {
 184                    $ps{merges} = [];
 185                    $lastseen = 'merges';
 186                } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
 187                    my $id = $_;
 188                    push (@{$ps{merges}}, $id);
 189
 190                    # aggressive branch finding:
 191                    if ($opt_D) {
 192                        my $branch = extract_versionname($id);
 193                        my $repo = extract_reponame($branch);
 194
 195                        if (archive_reachable($repo) &&
 196                                !defined $arch_branches{$branch}) {
 197                            $arch_branches{$branch} = $stage + 1;
 198                        }
 199                    }
 200                } else {
 201                    warn "more metadata after merges!?: $_\n" unless /^\s*$/;
 202                }
 203            }
 204        }
 205
 206        if (%ps && !exists $psets{ $ps{id} }) {
 207            my %temp = %ps;         # break references
 208            if (@psets && $psets[$#psets]{branch} eq $ps{branch}) {
 209                $temp{parent_id} = $psets[$#psets]{id};
 210            }
 211            push (@psets, \%temp);
 212            $psets{ $temp{id} } = \%temp;
 213        }
 214
 215        close ABROWSE or die "$TLA abrowse failed on $limit\n";
 216    }
 217}                               # end foreach $root
 218
 219do_abrowse(1);
 220my $depth = 2;
 221$opt_D ||= 0;
 222while ($depth <= $opt_D) {
 223    do_abrowse($depth);
 224    $depth++;
 225}
 226
 227## Order patches by time
 228# FIXME see if we can find a more optimal way to do this by graphing
 229# the ancestry data and walking it, that way we won't have to rely on
 230# client-supplied dates
 231@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
 232
 233#print Dumper \@psets;
 234
 235##
 236## TODO cleanup irrelevant patches
 237##      and put an initial import
 238##      or a full tag
 239my $import = 0;
 240unless (-d $git_dir) { # initial import
 241    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
 242        print "Starting import from $psets[0]{id}\n";
 243        `git-init`;
 244        die $! if $?;
 245        $import = 1;
 246    } else {
 247        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
 248    }
 249} else {    # progressing an import
 250    # load the rptags
 251    opendir(DIR, $ptag_dir)
 252        || die "can't opendir: $!";
 253    while (my $file = readdir(DIR)) {
 254        # skip non-interesting-files
 255        next unless -f "$ptag_dir/$file";
 256
 257        # convert first '--' to '/' from old git-archimport to use
 258        # as an archivename/c--b--v private tag
 259        if ($file !~ m!,!) {
 260            my $oldfile = $file;
 261            $file =~ s!--!,!;
 262            print STDERR "converting old tag $oldfile to $file\n";
 263            rename("$ptag_dir/$oldfile", "$ptag_dir/$file") or die $!;
 264        }
 265        my $sha = ptag($file);
 266        chomp $sha;
 267        $rptags{$sha} = $file;
 268    }
 269    closedir DIR;
 270}
 271
 272# process patchsets
 273# extract the Arch repository name (Arch "archive" in Arch-speak)
 274sub extract_reponame {
 275    my $fq_cvbr = shift; # archivename/[[[[category]branch]version]revision]
 276    return (split(/\//, $fq_cvbr))[0];
 277}
 278
 279sub extract_versionname {
 280    my $name = shift;
 281    $name =~ s/--(?:patch|version(?:fix)?|base)-\d+$//;
 282    return $name;
 283}
 284
 285# convert a fully-qualified revision or version to a unique dirname:
 286#   normalperson@yhbt.net-05/mpd--uclinux--1--patch-2
 287# becomes: normalperson@yhbt.net-05,mpd--uclinux--1
 288#
 289# the git notion of a branch is closer to
 290# archive/category--branch--version than archive/category--branch, so we
 291# use this to convert to git branch names.
 292# Also, keep archive names but replace '/' with ',' since it won't require
 293# subdirectories, and is safer than swapping '--' which could confuse
 294# reverse-mapping when dealing with bastard branches that
 295# are just archive/category--version  (no --branch)
 296sub tree_dirname {
 297    my $revision = shift;
 298    my $name = extract_versionname($revision);
 299    $name =~ s#/#,#;
 300    return $name;
 301}
 302
 303# old versions of git-archimport just use the <category--branch> part:
 304sub old_style_branchname {
 305    my $id = shift;
 306    my $ret = safe_pipe_capture($TLA,'parse-package-name','-p',$id);
 307    chomp $ret;
 308    return $ret;
 309}
 310
 311*git_default_branchname = $opt_o ? *old_style_branchname : *tree_dirname;
 312
 313# retrieve default archive, since $branch_name_map keys might not include it
 314sub get_default_archive {
 315    if (!defined $default_archive) {
 316        $default_archive = safe_pipe_capture($TLA,'my-default-archive');
 317        chomp $default_archive;
 318    }
 319    return $default_archive;
 320}
 321
 322sub git_branchname {
 323    my $revision = shift;
 324    my $name = extract_versionname($revision);
 325
 326    if (exists $branch_name_map{$name}) {
 327        return $branch_name_map{$name};
 328
 329    } elsif ($name =~ m#^([^/]*)/(.*)$#
 330             && $1 eq get_default_archive()
 331             && exists $branch_name_map{$2}) {
 332        # the names given in the command-line lacked the archive.
 333        return $branch_name_map{$2};
 334
 335    } else {
 336        return git_default_branchname($revision);
 337    }
 338}
 339
 340sub process_patchset_accurate {
 341    my $ps = shift;
 342
 343    # switch to that branch if we're not already in that branch:
 344    if (-e "$git_dir/refs/heads/$ps->{branch}") {
 345       system('git-checkout','-f',$ps->{branch}) == 0 or die "$! $?\n";
 346
 347       # remove any old stuff that got leftover:
 348       my $rm = safe_pipe_capture('git-ls-files','--others','-z');
 349       rmtree(split(/\0/,$rm)) if $rm;
 350    }
 351
 352    # Apply the import/changeset/merge into the working tree
 353    my $dir = sync_to_ps($ps);
 354    # read the new log entry:
 355    my @commitlog = safe_pipe_capture($TLA,'cat-log','-d',$dir,$ps->{id});
 356    die "Error in cat-log: $!" if $?;
 357    chomp @commitlog;
 358
 359    # grab variables we want from the log, new fields get added to $ps:
 360    # (author, date, email, summary, message body ...)
 361    parselog($ps, \@commitlog);
 362
 363    if ($ps->{id} =~ /--base-0$/ && $ps->{id} ne $psets[0]{id}) {
 364        # this should work when importing continuations
 365        if ($ps->{tag} && (my $branchpoint = eval { ptag($ps->{tag}) })) {
 366
 367            # find where we are supposed to branch from
 368            if (! -e "$git_dir/refs/heads/$ps->{branch}") {
 369                system('git-branch',$ps->{branch},$branchpoint) == 0 or die "$! $?\n";
 370
 371                # We trust Arch with the fact that this is just a tag,
 372                # and it does not affect the state of the tree, so
 373                # we just tag and move on.  If the user really wants us
 374                # to consolidate more branches into one, don't tag because
 375                # the tag name would be already taken.
 376                tag($ps->{id}, $branchpoint);
 377                ptag($ps->{id}, $branchpoint);
 378                print " * Tagged $ps->{id} at $branchpoint\n";
 379            }
 380            system('git-checkout','-f',$ps->{branch}) == 0 or die "$! $?\n";
 381
 382            # remove any old stuff that got leftover:
 383            my $rm = safe_pipe_capture('git-ls-files','--others','-z');
 384            rmtree(split(/\0/,$rm)) if $rm;
 385            return 0;
 386        } else {
 387            warn "Tagging from unknown id unsupported\n" if $ps->{tag};
 388        }
 389        # allow multiple bases/imports here since Arch supports cherry-picks
 390        # from unrelated trees
 391    }
 392
 393    # update the index with all the changes we got
 394    system('git-diff-files --name-only -z | '.
 395            'git-update-index --remove -z --stdin') == 0 or die "$! $?\n";
 396    system('git-ls-files --others -z | '.
 397            'git-update-index --add -z --stdin') == 0 or die "$! $?\n";
 398    return 1;
 399}
 400
 401# the native changeset processing strategy.  This is very fast, but
 402# does not handle permissions or any renames involving directories
 403sub process_patchset_fast {
 404    my $ps = shift;
 405    #
 406    # create the branch if needed
 407    #
 408    if ($ps->{type} eq 'i' && !$import) {
 409        die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
 410    }
 411
 412    unless ($import) { # skip for import
 413        if ( -e "$git_dir/refs/heads/$ps->{branch}") {
 414            # we know about this branch
 415            system('git-checkout',$ps->{branch});
 416        } else {
 417            # new branch! we need to verify a few things
 418            die "Branch on a non-tag!" unless $ps->{type} eq 't';
 419            my $branchpoint = ptag($ps->{tag});
 420            die "Tagging from unknown id unsupported: $ps->{tag}"
 421                unless $branchpoint;
 422
 423            # find where we are supposed to branch from
 424            if (! -e "$git_dir/refs/heads/$ps->{branch}") {
 425                system('git-branch',$ps->{branch},$branchpoint) == 0 or die "$! $?\n";
 426
 427                # We trust Arch with the fact that this is just a tag,
 428                # and it does not affect the state of the tree, so
 429                # we just tag and move on.  If the user really wants us
 430                # to consolidate more branches into one, don't tag because
 431                # the tag name would be already taken.
 432                tag($ps->{id}, $branchpoint);
 433                ptag($ps->{id}, $branchpoint);
 434                print " * Tagged $ps->{id} at $branchpoint\n";
 435            }
 436            system('git-checkout',$ps->{branch}) == 0 or die "$! $?\n";
 437            return 0;
 438        }
 439        die $! if $?;
 440    }
 441
 442    #
 443    # Apply the import/changeset/merge into the working tree
 444    #
 445    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 446        apply_import($ps) or die $!;
 447        $stats{import_or_tag}++;
 448        $import=0;
 449    } elsif ($ps->{type} eq 's') {
 450        apply_cset($ps);
 451        $stats{simple_changeset}++;
 452    }
 453
 454    #
 455    # prepare update git's index, based on what arch knows
 456    # about the pset, resolve parents, etc
 457    #
 458
 459    my @commitlog = safe_pipe_capture($TLA,'cat-archive-log',$ps->{id});
 460    die "Error in cat-archive-log: $!" if $?;
 461
 462    parselog($ps,\@commitlog);
 463
 464    # imports don't give us good info
 465    # on added files. Shame on them
 466    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 467        system('git-ls-files --deleted -z | '.
 468                'git-update-index --remove -z --stdin') == 0 or die "$! $?\n";
 469        system('git-ls-files --others -z | '.
 470                'git-update-index --add -z --stdin') == 0 or die "$! $?\n";
 471    }
 472
 473    # TODO: handle removed_directories and renamed_directories:
 474
 475    if (my $del = $ps->{removed_files}) {
 476        unlink @$del;
 477        while (@$del) {
 478            my @slice = splice(@$del, 0, 100);
 479            system('git-update-index','--remove','--',@slice) == 0 or
 480                            die "Error in git-update-index --remove: $! $?\n";
 481        }
 482    }
 483
 484    if (my $ren = $ps->{renamed_files}) {                # renamed
 485        if (@$ren % 2) {
 486            die "Odd number of entries in rename!?";
 487        }
 488
 489        while (@$ren) {
 490            my $from = shift @$ren;
 491            my $to   = shift @$ren;
 492
 493            unless (-d dirname($to)) {
 494                mkpath(dirname($to)); # will die on err
 495            }
 496            # print "moving $from $to";
 497            rename($from, $to) or die "Error renaming '$from' '$to': $!\n";
 498            system('git-update-index','--remove','--',$from) == 0 or
 499                            die "Error in git-update-index --remove: $! $?\n";
 500            system('git-update-index','--add','--',$to) == 0 or
 501                            die "Error in git-update-index --add: $! $?\n";
 502        }
 503    }
 504
 505    if (my $add = $ps->{new_files}) {
 506        while (@$add) {
 507            my @slice = splice(@$add, 0, 100);
 508            system('git-update-index','--add','--',@slice) == 0 or
 509                            die "Error in git-update-index --add: $! $?\n";
 510        }
 511    }
 512
 513    if (my $mod = $ps->{modified_files}) {
 514        while (@$mod) {
 515            my @slice = splice(@$mod, 0, 100);
 516            system('git-update-index','--',@slice) == 0 or
 517                            die "Error in git-update-index: $! $?\n";
 518        }
 519    }
 520    return 1; # we successfully applied the changeset
 521}
 522
 523if ($opt_f) {
 524    print "Will import patchsets using the fast strategy\n",
 525            "Renamed directories and permission changes will be missed\n";
 526    *process_patchset = *process_patchset_fast;
 527} else {
 528    print "Using the default (accurate) import strategy.\n",
 529            "Things may be a bit slow\n";
 530    *process_patchset = *process_patchset_accurate;
 531}
 532
 533foreach my $ps (@psets) {
 534    # process patchsets
 535    $ps->{branch} = git_branchname($ps->{id});
 536
 537    #
 538    # ensure we have a clean state
 539    #
 540    if (my $dirty = `git-diff-files`) {
 541        die "Unclean tree when about to process $ps->{id} " .
 542            " - did we fail to commit cleanly before?\n$dirty";
 543    }
 544    die $! if $?;
 545
 546    #
 547    # skip commits already in repo
 548    #
 549    if (ptag($ps->{id})) {
 550      $opt_v && print " * Skipping already imported: $ps->{id}\n";
 551      next;
 552    }
 553
 554    print " * Starting to work on $ps->{id}\n";
 555
 556    process_patchset($ps) or next;
 557
 558    # warn "errors when running git-update-index! $!";
 559    my $tree = `git-write-tree`;
 560    die "cannot write tree $!" if $?;
 561    chomp $tree;
 562
 563    #
 564    # Who's your daddy?
 565    #
 566    my @par;
 567    if ( -e "$git_dir/refs/heads/$ps->{branch}") {
 568        if (open HEAD, "<","$git_dir/refs/heads/$ps->{branch}") {
 569            my $p = <HEAD>;
 570            close HEAD;
 571            chomp $p;
 572            push @par, '-p', $p;
 573        } else {
 574            if ($ps->{type} eq 's') {
 575                warn "Could not find the right head for the branch $ps->{branch}";
 576            }
 577        }
 578    }
 579
 580    if ($ps->{merges}) {
 581        push @par, find_parents($ps);
 582    }
 583
 584    #
 585    # Commit, tag and clean state
 586    #
 587    $ENV{TZ}                  = 'GMT';
 588    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
 589    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
 590    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
 591    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
 592    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
 593    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
 594
 595    my $pid = open2(*READER, *WRITER,'git-commit-tree',$tree,@par)
 596        or die $!;
 597    print WRITER $ps->{summary},"\n\n";
 598
 599    # only print message if it's not empty, to avoid a spurious blank line;
 600    # also append an extra newline, so there's a blank line before the
 601    # following "git-archimport-id:" line.
 602    print WRITER $ps->{message},"\n\n" if ($ps->{message} ne "");
 603
 604    # make it easy to backtrack and figure out which Arch revision this was:
 605    print WRITER 'git-archimport-id: ',$ps->{id},"\n";
 606
 607    close WRITER;
 608    my $commitid = <READER>;    # read
 609    chomp $commitid;
 610    close READER;
 611    waitpid $pid,0;             # close;
 612
 613    if (length $commitid != 40) {
 614        die "Something went wrong with the commit! $! $commitid";
 615    }
 616    #
 617    # Update the branch
 618    #
 619    open  HEAD, ">","$git_dir/refs/heads/$ps->{branch}";
 620    print HEAD $commitid;
 621    close HEAD;
 622    system('git-update-ref', 'HEAD', "$ps->{branch}");
 623
 624    # tag accordingly
 625    ptag($ps->{id}, $commitid); # private tag
 626    if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
 627        tag($ps->{id}, $commitid);
 628    }
 629    print " * Committed $ps->{id}\n";
 630    print "   + tree   $tree\n";
 631    print "   + commit $commitid\n";
 632    $opt_v && print "   + commit date is  $ps->{date} \n";
 633    $opt_v && print "   + parents:  ",join(' ',@par),"\n";
 634}
 635
 636if ($opt_v) {
 637    foreach (sort keys %stats) {
 638        print" $_: $stats{$_}\n";
 639    }
 640}
 641exit 0;
 642
 643# used by the accurate strategy:
 644sub sync_to_ps {
 645    my $ps = shift;
 646    my $tree_dir = $tmp.'/'.tree_dirname($ps->{id});
 647
 648    $opt_v && print "sync_to_ps($ps->{id}) method: ";
 649
 650    if (-d $tree_dir) {
 651        if ($ps->{type} eq 't') {
 652            $opt_v && print "get (tag)\n";
 653            # looks like a tag-only or (worse,) a mixed tags/changeset branch,
 654            # can't rely on replay to work correctly on these
 655            rmtree($tree_dir);
 656            safe_pipe_capture($TLA,'get','--no-pristine',$ps->{id},$tree_dir);
 657            $stats{get_tag}++;
 658        } else {
 659                my $tree_id = arch_tree_id($tree_dir);
 660                if ($ps->{parent_id} && ($ps->{parent_id} eq $tree_id)) {
 661                    # the common case (hopefully)
 662                    $opt_v && print "replay\n";
 663                    safe_pipe_capture($TLA,'replay','-d',$tree_dir,$ps->{id});
 664                    $stats{replay}++;
 665                } else {
 666                    # getting one tree is usually faster than getting two trees
 667                    # and applying the delta ...
 668                    rmtree($tree_dir);
 669                    $opt_v && print "apply-delta\n";
 670                    safe_pipe_capture($TLA,'get','--no-pristine',
 671                                        $ps->{id},$tree_dir);
 672                    $stats{get_delta}++;
 673                }
 674        }
 675    } else {
 676        # new branch work
 677        $opt_v && print "get (new tree)\n";
 678        safe_pipe_capture($TLA,'get','--no-pristine',$ps->{id},$tree_dir);
 679        $stats{get_new}++;
 680    }
 681
 682    # added -I flag to rsync since we're going to fast! AIEEEEE!!!!
 683    system('rsync','-aI','--delete','--exclude',$git_dir,
 684#               '--exclude','.arch-inventory',
 685                '--exclude','.arch-ids','--exclude','{arch}',
 686                '--exclude','+*','--exclude',',*',
 687                "$tree_dir/",'./') == 0 or die "Cannot rsync $tree_dir: $! $?";
 688    return $tree_dir;
 689}
 690
 691sub apply_import {
 692    my $ps = shift;
 693    my $bname = git_branchname($ps->{id});
 694
 695    mkpath($tmp);
 696
 697    safe_pipe_capture($TLA,'get','-s','--no-pristine',$ps->{id},"$tmp/import");
 698    die "Cannot get import: $!" if $?;
 699    system('rsync','-aI','--delete', '--exclude',$git_dir,
 700                '--exclude','.arch-ids','--exclude','{arch}',
 701                "$tmp/import/", './');
 702    die "Cannot rsync import:$!" if $?;
 703
 704    rmtree("$tmp/import");
 705    die "Cannot remove tempdir: $!" if $?;
 706
 707
 708    return 1;
 709}
 710
 711sub apply_cset {
 712    my $ps = shift;
 713
 714    mkpath($tmp);
 715
 716    # get the changeset
 717    safe_pipe_capture($TLA,'get-changeset',$ps->{id},"$tmp/changeset");
 718    die "Cannot get changeset: $!" if $?;
 719
 720    # apply patches
 721    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
 722        # this can be sped up considerably by doing
 723        #    (find | xargs cat) | patch
 724        # but that can get mucked up by patches
 725        # with missing trailing newlines or the standard
 726        # 'missing newline' flag in the patch - possibly
 727        # produced with an old/buggy diff.
 728        # slow and safe, we invoke patch once per patchfile
 729        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
 730        die "Problem applying patches! $!" if $?;
 731    }
 732
 733    # apply changed binary files
 734    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
 735        foreach my $mod (@modified) {
 736            chomp $mod;
 737            my $orig = $mod;
 738            $orig =~ s/\.modified$//; # lazy
 739            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
 740            #print "rsync -p '$mod' '$orig'";
 741            system('rsync','-p',$mod,"./$orig");
 742            die "Problem applying binary changes! $!" if $?;
 743        }
 744    }
 745
 746    # bring in new files
 747    system('rsync','-aI','--exclude',$git_dir,
 748                '--exclude','.arch-ids',
 749                '--exclude', '{arch}',
 750                "$tmp/changeset/new-files-archive/",'./');
 751
 752    # deleted files are hinted from the commitlog processing
 753
 754    rmtree("$tmp/changeset");
 755}
 756
 757
 758# =for reference
 759# notes: *-files/-directories keys cannot have spaces, they're always
 760# pika-escaped.  Everything after the first newline
 761# A log entry looks like:
 762# Revision: moodle-org--moodle--1.3.3--patch-15
 763# Archive: arch-eduforge@catalyst.net.nz--2004
 764# Creator: Penny Leach <penny@catalyst.net.nz>
 765# Date: Wed May 25 14:15:34 NZST 2005
 766# Standard-date: 2005-05-25 02:15:34 GMT
 767# New-files: lang/de/.arch-ids/block_glossary_random.php.id
 768#     lang/de/.arch-ids/block_html.php.id
 769# New-directories: lang/de/help/questionnaire
 770#     lang/de/help/questionnaire/.arch-ids
 771# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
 772#    db_sears.sql db/db_sears.sql
 773# Removed-files: lang/be/docs/.arch-ids/release.html.id
 774#     lang/be/docs/.arch-ids/releaseold.html.id
 775# Modified-files: admin/cron.php admin/delete.php
 776#     admin/editor.html backup/lib.php backup/restore.php
 777# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
 778# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
 779#   summary can be multiline with a leading space just like the above fields
 780# Keywords:
 781#
 782# Updating yadda tadda tadda madda
 783sub parselog {
 784    my ($ps, $log) = @_;
 785    my $key = undef;
 786
 787    # headers we want that contain filenames:
 788    my %want_headers = (
 789        new_files => 1,
 790        modified_files => 1,
 791        renamed_files => 1,
 792        renamed_directories => 1,
 793        removed_files => 1,
 794        removed_directories => 1,
 795    );
 796
 797    chomp (@$log);
 798    while ($_ = shift @$log) {
 799        if (/^Continuation-of:\s*(.*)/) {
 800            $ps->{tag} = $1;
 801            $key = undef;
 802        } elsif (/^Summary:\s*(.*)$/ ) {
 803            # summary can be multiline as long as it has a leading space.
 804            # we squeeze it onto a single line, though.
 805            $ps->{summary} = [ $1 ];
 806            $key = 'summary';
 807        } elsif (/^Creator: (.*)\s*<([^\>]+)>/) {
 808            $ps->{author} = $1;
 809            $ps->{email} = $2;
 810            $key = undef;
 811        # any *-files or *-directories can be read here:
 812        } elsif (/^([A-Z][a-z\-]+):\s*(.*)$/) {
 813            my $val = $2;
 814            $key = lc $1;
 815            $key =~ tr/-/_/; # too lazy to quote :P
 816            if ($want_headers{$key}) {
 817                push @{$ps->{$key}}, split(/\s+/, $val);
 818            } else {
 819                $key = undef;
 820            }
 821        } elsif (/^$/) {
 822            last; # remainder of @$log that didn't get shifted off is message
 823        } elsif ($key) {
 824            if (/^\s+(.*)$/) {
 825                if ($key eq 'summary') {
 826                    push @{$ps->{$key}}, $1;
 827                } else { # files/directories:
 828                    push @{$ps->{$key}}, split(/\s+/, $1);
 829                }
 830            } else {
 831                $key = undef;
 832            }
 833        }
 834    }
 835
 836    # drop leading empty lines from the log message
 837    while (@$log && $log->[0] eq '') {
 838        shift @$log;
 839    }
 840    if (exists $ps->{summary} && @{$ps->{summary}}) {
 841        $ps->{summary} = join(' ', @{$ps->{summary}});
 842    }
 843    elsif (@$log == 0) {
 844        $ps->{summary} = 'empty commit message';
 845    } else {
 846        $ps->{summary} = $log->[0] . '...';
 847    }
 848    $ps->{message} = join("\n",@$log);
 849
 850    # skip Arch control files, unescape pika-escaped files
 851    foreach my $k (keys %want_headers) {
 852        next unless (defined $ps->{$k});
 853        my @tmp = ();
 854        foreach my $t (@{$ps->{$k}}) {
 855           next unless length ($t);
 856           next if $t =~ m!\{arch\}/!;
 857           next if $t =~ m!\.arch-ids/!;
 858           # should we skip this?
 859           next if $t =~ m!\.arch-inventory$!;
 860           # tla cat-archive-log will give us filenames with spaces as file\(sp)name - why?
 861           # we can assume that any filename with \ indicates some pika escaping that we want to get rid of.
 862           if ($t =~ /\\/ ){
 863               $t = (safe_pipe_capture($TLA,'escape','--unescaped',$t))[0];
 864           }
 865           push @tmp, $t;
 866        }
 867        $ps->{$k} = \@tmp;
 868    }
 869}
 870
 871# write/read a tag
 872sub tag {
 873    my ($tag, $commit) = @_;
 874
 875    if ($opt_o) {
 876        $tag =~ s|/|--|g;
 877    } else {
 878        my $patchname = $tag;
 879        $patchname =~ s/.*--//;
 880        $tag = git_branchname ($tag) . '--' . $patchname;
 881    }
 882
 883    if ($commit) {
 884        open(C,">","$git_dir/refs/tags/$tag")
 885            or die "Cannot create tag $tag: $!\n";
 886        print C "$commit\n"
 887            or die "Cannot write tag $tag: $!\n";
 888        close(C)
 889            or die "Cannot write tag $tag: $!\n";
 890        print " * Created tag '$tag' on '$commit'\n" if $opt_v;
 891    } else {                    # read
 892        open(C,"<","$git_dir/refs/tags/$tag")
 893            or die "Cannot read tag $tag: $!\n";
 894        $commit = <C>;
 895        chomp $commit;
 896        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 897        close(C)
 898            or die "Cannot read tag $tag: $!\n";
 899        return $commit;
 900    }
 901}
 902
 903# write/read a private tag
 904# reads fail softly if the tag isn't there
 905sub ptag {
 906    my ($tag, $commit) = @_;
 907
 908    # don't use subdirs for tags yet, it could screw up other porcelains
 909    $tag =~ s|/|,|g;
 910
 911    my $tag_file = "$ptag_dir/$tag";
 912    my $tag_branch_dir = dirname($tag_file);
 913    mkpath($tag_branch_dir) unless (-d $tag_branch_dir);
 914
 915    if ($commit) {              # write
 916        open(C,">",$tag_file)
 917            or die "Cannot create tag $tag: $!\n";
 918        print C "$commit\n"
 919            or die "Cannot write tag $tag: $!\n";
 920        close(C)
 921            or die "Cannot write tag $tag: $!\n";
 922        $rptags{$commit} = $tag
 923            unless $tag =~ m/--base-0$/;
 924    } else {                    # read
 925        # if the tag isn't there, return 0
 926        unless ( -s $tag_file) {
 927            return 0;
 928        }
 929        open(C,"<",$tag_file)
 930            or die "Cannot read tag $tag: $!\n";
 931        $commit = <C>;
 932        chomp $commit;
 933        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 934        close(C)
 935            or die "Cannot read tag $tag: $!\n";
 936        unless (defined $rptags{$commit}) {
 937            $rptags{$commit} = $tag;
 938        }
 939        return $commit;
 940    }
 941}
 942
 943sub find_parents {
 944    #
 945    # Identify what branches are merging into me
 946    # and whether we are fully merged
 947    # git-merge-base <headsha> <headsha> should tell
 948    # me what the base of the merge should be
 949    #
 950    my $ps = shift;
 951
 952    my %branches; # holds an arrayref per branch
 953                  # the arrayref contains a list of
 954                  # merged patches between the base
 955                  # of the merge and the current head
 956
 957    my @parents;  # parents found for this commit
 958
 959    # simple loop to split the merges
 960    # per branch
 961    foreach my $merge (@{$ps->{merges}}) {
 962        my $branch = git_branchname($merge);
 963        unless (defined $branches{$branch} ){
 964            $branches{$branch} = [];
 965        }
 966        push @{$branches{$branch}}, $merge;
 967    }
 968
 969    #
 970    # foreach branch find a merge base and walk it to the
 971    # head where we are, collecting the merged patchsets that
 972    # Arch has recorded. Keep that in @have
 973    # Compare that with the commits on the other branch
 974    # between merge-base and the tip of the branch (@need)
 975    # and see if we have a series of consecutive patches
 976    # starting from the merge base. The tip of the series
 977    # of consecutive patches merged is our new parent for
 978    # that branch.
 979    #
 980    foreach my $branch (keys %branches) {
 981
 982        # check that we actually know about the branch
 983        next unless -e "$git_dir/refs/heads/$branch";
 984
 985        my $mergebase = `git-merge-base $branch $ps->{branch}`;
 986        if ($?) {
 987            # Don't die here, Arch supports one-way cherry-picking
 988            # between branches with no common base (or any relationship
 989            # at all beforehand)
 990            warn "Cannot find merge base for $branch and $ps->{branch}";
 991            next;
 992        }
 993        chomp $mergebase;
 994
 995        # now walk up to the mergepoint collecting what patches we have
 996        my $branchtip = git_rev_parse($ps->{branch});
 997        my @ancestors = `git-rev-list --topo-order $branchtip ^$mergebase`;
 998        my %have; # collected merges this branch has
 999        foreach my $merge (@{$ps->{merges}}) {
1000            $have{$merge} = 1;
1001        }
1002        my %ancestorshave;
1003        foreach my $par (@ancestors) {
1004            $par = commitid2pset($par);
1005            if (defined $par->{merges}) {
1006                foreach my $merge (@{$par->{merges}}) {
1007                    $ancestorshave{$merge}=1;
1008                }
1009            }
1010        }
1011        # print "++++ Merges in $ps->{id} are....\n";
1012        # my @have = sort keys %have;   print Dumper(\@have);
1013
1014        # merge what we have with what ancestors have
1015        %have = (%have, %ancestorshave);
1016
1017        # see what the remote branch has - these are the merges we
1018        # will want to have in a consecutive series from the mergebase
1019        my $otherbranchtip = git_rev_parse($branch);
1020        my @needraw = `git-rev-list --topo-order $otherbranchtip ^$mergebase`;
1021        my @need;
1022        foreach my $needps (@needraw) {         # get the psets
1023            $needps = commitid2pset($needps);
1024            # git-rev-list will also
1025            # list commits merged in via earlier
1026            # merges. we are only interested in commits
1027            # from the branch we're looking at
1028            if ($branch eq $needps->{branch}) {
1029                push @need, $needps->{id};
1030            }
1031        }
1032
1033        # print "++++ Merges from $branch we want are....\n";
1034        # print Dumper(\@need);
1035
1036        my $newparent;
1037        while (my $needed_commit = pop @need) {
1038            if ($have{$needed_commit}) {
1039                $newparent = $needed_commit;
1040            } else {
1041                last; # break out of the while
1042            }
1043        }
1044        if ($newparent) {
1045            push @parents, $newparent;
1046        }
1047
1048
1049    } # end foreach branch
1050
1051    # prune redundant parents
1052    my %parents;
1053    foreach my $p (@parents) {
1054        $parents{$p} = 1;
1055    }
1056    foreach my $p (@parents) {
1057        next unless exists $psets{$p}{merges};
1058        next unless ref    $psets{$p}{merges};
1059        my @merges = @{$psets{$p}{merges}};
1060        foreach my $merge (@merges) {
1061            if ($parents{$merge}) {
1062                delete $parents{$merge};
1063            }
1064        }
1065    }
1066
1067    @parents = ();
1068    foreach (keys %parents) {
1069        push @parents, '-p', ptag($_);
1070    }
1071    return @parents;
1072}
1073
1074sub git_rev_parse {
1075    my $name = shift;
1076    my $val  = `git-rev-parse $name`;
1077    die "Error: git-rev-parse $name" if $?;
1078    chomp $val;
1079    return $val;
1080}
1081
1082# resolve a SHA1 to a known patchset
1083sub commitid2pset {
1084    my $commitid = shift;
1085    chomp $commitid;
1086    my $name = $rptags{$commitid}
1087        || die "Cannot find reverse tag mapping for $commitid";
1088    $name =~ s|,|/|;
1089    my $ps   = $psets{$name}
1090        || (print Dumper(sort keys %psets)) && die "Cannot find patchset for $name";
1091    return $ps;
1092}
1093
1094
1095# an alternative to `command` that allows input to be passed as an array
1096# to work around shell problems with weird characters in arguments
1097sub safe_pipe_capture {
1098    my @output;
1099    if (my $pid = open my $child, '-|') {
1100        @output = (<$child>);
1101        close $child or die join(' ',@_).": $! $?";
1102    } else {
1103        exec(@_) or die "$! $?"; # exec() can fail the executable can't be found
1104    }
1105    return wantarray ? @output : join('',@output);
1106}
1107
1108# `tla logs -rf -d <dir> | head -n1` or `baz tree-id <dir>`
1109sub arch_tree_id {
1110    my $dir = shift;
1111    chomp( my $ret = (safe_pipe_capture($TLA,'logs','-rf','-d',$dir))[0] );
1112    return $ret;
1113}
1114
1115sub archive_reachable {
1116    my $archive = shift;
1117    return 1 if $reachable{$archive};
1118    return 0 if $unreachable{$archive};
1119
1120    if (system "$TLA whereis-archive $archive >/dev/null") {
1121        if ($opt_a && (system($TLA,'register-archive',
1122                      "http://mirrors.sourcecontrol.net/$archive") == 0)) {
1123            $reachable{$archive} = 1;
1124            return 1;
1125        }
1126        print STDERR "Archive is unreachable: $archive\n";
1127        $unreachable{$archive} = 1;
1128        return 0;
1129    } else {
1130        $reachable{$archive} = 1;
1131        return 1;
1132    }
1133}