git-archimport.perlon commit git-merge: finish when git-read-tree fails (2be08a8)
   1#!/usr/bin/perl -w
   2#
   3# This tool is copyright (c) 2005, Martin Langhoff.
   4# It is released under the Gnu Public License, version 2.
   5#
   6# The basic idea is to walk the output of tla abrowse, 
   7# fetch the changesets and apply them. 
   8#
   9
  10=head1 Invocation
  11
  12    git-archimport [ -h ] [ -v ] [ -o ] [ -a ] [ -f ] [ -T ] 
  13        [ -D depth] [ -t tempdir ] <archive>/<branch> [ <archive>/<branch> ]
  14
  15Imports a project from one or more Arch repositories. It will follow branches
  16and repositories within the namespaces defined by the <archive/branch>
  17parameters supplied. If it cannot find the remote branch a merge comes from
  18it will just import it as a regular commit. If it can find it, it will mark it 
  19as a merge whenever possible.
  20
  21See man (1) git-archimport for more details.
  22
  23=head1 TODO
  24
  25 - create tag objects instead of ref tags
  26 - audit shell-escaping of filenames
  27 - hide our private tags somewhere smarter
  28 - find a way to make "cat *patches | patch" safe even when patchfiles are missing newlines  
  29 - sort and apply patches by graphing ancestry relations instead of just
  30   relying in dates supplied in the changeset itself.
  31   tla ancestry-graph -m could be helpful here...
  32
  33=head1 Devel tricks
  34
  35Add print in front of the shell commands invoked via backticks. 
  36
  37=head1 Devel Notes
  38
  39There are several places where Arch and git terminology are intermixed
  40and potentially confused.
  41
  42The notion of a "branch" in git is approximately equivalent to
  43a "archive/category--branch--version" in Arch.  Also, it should be noted
  44that the "--branch" portion of "archive/category--branch--version" is really
  45optional in Arch although not many people (nor tools!) seem to know this.
  46This means that "archive/category--version" is also a valid "branch"
  47in git terms.
  48
  49We always refer to Arch names by their fully qualified variant (which
  50means the "archive" name is prefixed.
  51
  52For people unfamiliar with Arch, an "archive" is the term for "repository",
  53and can contain multiple, unrelated branches.
  54
  55=cut
  56
  57use strict;
  58use warnings;
  59use Getopt::Std;
  60use File::Temp qw(tempdir);
  61use File::Path qw(mkpath rmtree);
  62use File::Basename qw(basename dirname);
  63use Data::Dumper qw/ Dumper /;
  64use IPC::Open2;
  65
  66$SIG{'PIPE'}="IGNORE";
  67$ENV{'TZ'}="UTC";
  68
  69my $git_dir = $ENV{"GIT_DIR"} || ".git";
  70$ENV{"GIT_DIR"} = $git_dir;
  71my $ptag_dir = "$git_dir/archimport/tags";
  72
  73our($opt_h,$opt_f,$opt_v,$opt_T,$opt_t,$opt_D,$opt_a,$opt_o);
  74
  75sub usage() {
  76    print STDERR <<END;
  77Usage: ${\basename $0}     # fetch/update GIT from Arch
  78       [ -h ] [ -v ] [ -o ] [ -a ] [ -f ] [ -T ] [ -D depth ] [ -t tempdir ]
  79       repository/arch-branch [ repository/arch-branch] ...
  80END
  81    exit(1);
  82}
  83
  84getopts("fThvat:D:") or usage();
  85usage if $opt_h;
  86
  87@ARGV >= 1 or usage();
  88# $arch_branches:
  89# values associated with keys:
  90#   =1 - Arch version / git 'branch' detected via abrowse on a limit
  91#   >1 - Arch version / git 'branch' of an auxiliary branch we've merged
  92my %arch_branches = map { $_ => 1 } @ARGV;
  93
  94$ENV{'TMPDIR'} = $opt_t if $opt_t; # $ENV{TMPDIR} will affect tempdir() calls:
  95my $tmp = tempdir('git-archimport-XXXXXX', TMPDIR => 1, CLEANUP => 1);
  96$opt_v && print "+ Using $tmp as temporary directory\n";
  97
  98unless (-d $git_dir) { # initial import needs empty directory
  99    opendir DIR, '.' or die "Unable to open current directory: $!\n";
 100    while (my $entry = readdir DIR) {
 101        $entry =~ /^\.\.?$/ or
 102            die "Initial import needs an empty current working directory.\n"
 103    }
 104    closedir DIR
 105}
 106
 107my %reachable = ();             # Arch repositories we can access
 108my %unreachable = ();           # Arch repositories we can't access :<
 109my @psets  = ();                # the collection
 110my %psets  = ();                # the collection, by name
 111my %stats  = (                  # Track which strategy we used to import:
 112        get_tag => 0, replay => 0, get_new => 0, get_delta => 0,
 113        simple_changeset => 0, import_or_tag => 0
 114);
 115
 116my %rptags = ();                # my reverse private tags
 117                                # to map a SHA1 to a commitid
 118my $TLA = $ENV{'ARCH_CLIENT'} || 'tla';
 119
 120sub do_abrowse {
 121    my $stage = shift;
 122    while (my ($limit, $level) = each %arch_branches) {
 123        next unless $level == $stage;
 124        
 125        open ABROWSE, "$TLA abrowse -fkD --merges $limit |" 
 126                                or die "Problems with tla abrowse: $!";
 127    
 128        my %ps        = ();         # the current one
 129        my $lastseen  = '';
 130    
 131        while (<ABROWSE>) {
 132            chomp;
 133            
 134            # first record padded w 8 spaces
 135            if (s/^\s{8}\b//) {
 136                my ($id, $type) = split(m/\s+/, $_, 2);
 137
 138                my %last_ps;
 139                # store the record we just captured
 140                if (%ps && !exists $psets{ $ps{id} }) {
 141                    %last_ps = %ps; # break references
 142                    push (@psets, \%last_ps);
 143                    $psets{ $last_ps{id} } = \%last_ps;
 144                }
 145                
 146                my $branch = extract_versionname($id);
 147                %ps = ( id => $id, branch => $branch );
 148                if (%last_ps && ($last_ps{branch} eq $branch)) {
 149                    $ps{parent_id} = $last_ps{id};
 150                }
 151                
 152                $arch_branches{$branch} = 1;
 153                $lastseen = 'id';
 154
 155                # deal with types (should work with baz or tla):
 156                if ($type =~ m/\(.*changeset\)/) {
 157                    $ps{type} = 's';
 158                } elsif ($type =~ /\(.*import\)/) {
 159                    $ps{type} = 'i';
 160                } elsif ($type =~ m/\(tag.*?(\S+\@\S+).*?\)/) {
 161                    $ps{type} = 't';
 162                    # read which revision we've tagged when we parse the log
 163                    $ps{tag}  = $1;
 164                } else { 
 165                    warn "Unknown type $type";
 166                }
 167
 168                $arch_branches{$branch} = 1;
 169                $lastseen = 'id';
 170            } elsif (s/^\s{10}//) { 
 171                # 10 leading spaces or more 
 172                # indicate commit metadata
 173                
 174                # date
 175                if ($lastseen eq 'id' && m/^(\d{4}-\d\d-\d\d \d\d:\d\d:\d\d)/){
 176                    $ps{date}   = $1;
 177                    $lastseen = 'date';
 178                } elsif ($_ eq 'merges in:') {
 179                    $ps{merges} = [];
 180                    $lastseen = 'merges';
 181                } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
 182                    my $id = $_;
 183                    push (@{$ps{merges}}, $id);
 184                   
 185                    # aggressive branch finding:
 186                    if ($opt_D) {
 187                        my $branch = extract_versionname($id);
 188                        my $repo = extract_reponame($branch);
 189                        
 190                        if (archive_reachable($repo) &&
 191                                !defined $arch_branches{$branch}) {
 192                            $arch_branches{$branch} = $stage + 1;
 193                        }
 194                    }
 195                } else {
 196                    warn "more metadata after merges!?: $_\n" unless /^\s*$/;
 197                }
 198            }
 199        }
 200
 201        if (%ps && !exists $psets{ $ps{id} }) {
 202            my %temp = %ps;         # break references
 203            if (@psets && $psets[$#psets]{branch} eq $ps{branch}) {
 204                $temp{parent_id} = $psets[$#psets]{id};
 205            }
 206            push (@psets, \%temp);  
 207            $psets{ $temp{id} } = \%temp;
 208        }    
 209        
 210        close ABROWSE or die "$TLA abrowse failed on $limit\n";
 211    }
 212}                               # end foreach $root
 213
 214do_abrowse(1);
 215my $depth = 2;
 216$opt_D ||= 0;
 217while ($depth <= $opt_D) {
 218    do_abrowse($depth);
 219    $depth++;
 220}
 221
 222## Order patches by time
 223# FIXME see if we can find a more optimal way to do this by graphing
 224# the ancestry data and walking it, that way we won't have to rely on
 225# client-supplied dates
 226@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
 227
 228#print Dumper \@psets;
 229
 230##
 231## TODO cleanup irrelevant patches
 232##      and put an initial import
 233##      or a full tag
 234my $import = 0;
 235unless (-d $git_dir) { # initial import
 236    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
 237        print "Starting import from $psets[0]{id}\n";
 238        `git-init`;
 239        die $! if $?;
 240        $import = 1;
 241    } else {
 242        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
 243    }
 244} else {    # progressing an import
 245    # load the rptags
 246    opendir(DIR, $ptag_dir)
 247        || die "can't opendir: $!";
 248    while (my $file = readdir(DIR)) {
 249        # skip non-interesting-files
 250        next unless -f "$ptag_dir/$file";
 251   
 252        # convert first '--' to '/' from old git-archimport to use
 253        # as an archivename/c--b--v private tag
 254        if ($file !~ m!,!) {
 255            my $oldfile = $file;
 256            $file =~ s!--!,!;
 257            print STDERR "converting old tag $oldfile to $file\n";
 258            rename("$ptag_dir/$oldfile", "$ptag_dir/$file") or die $!;
 259        }
 260        my $sha = ptag($file);
 261        chomp $sha;
 262        $rptags{$sha} = $file;
 263    }
 264    closedir DIR;
 265}
 266
 267# process patchsets
 268# extract the Arch repository name (Arch "archive" in Arch-speak)
 269sub extract_reponame {
 270    my $fq_cvbr = shift; # archivename/[[[[category]branch]version]revision]
 271    return (split(/\//, $fq_cvbr))[0];
 272}
 273 
 274sub extract_versionname {
 275    my $name = shift;
 276    $name =~ s/--(?:patch|version(?:fix)?|base)-\d+$//;
 277    return $name;
 278}
 279
 280# convert a fully-qualified revision or version to a unique dirname:
 281#   normalperson@yhbt.net-05/mpd--uclinux--1--patch-2 
 282# becomes: normalperson@yhbt.net-05,mpd--uclinux--1
 283#
 284# the git notion of a branch is closer to
 285# archive/category--branch--version than archive/category--branch, so we
 286# use this to convert to git branch names.
 287# Also, keep archive names but replace '/' with ',' since it won't require
 288# subdirectories, and is safer than swapping '--' which could confuse
 289# reverse-mapping when dealing with bastard branches that
 290# are just archive/category--version  (no --branch)
 291sub tree_dirname {
 292    my $revision = shift;
 293    my $name = extract_versionname($revision);
 294    $name =~ s#/#,#;
 295    return $name;
 296}
 297
 298# old versions of git-archimport just use the <category--branch> part:
 299sub old_style_branchname {
 300    my $id = shift;
 301    my $ret = safe_pipe_capture($TLA,'parse-package-name','-p',$id);
 302    chomp $ret;
 303    return $ret;
 304}
 305
 306*git_branchname = $opt_o ? *old_style_branchname : *tree_dirname;
 307
 308sub process_patchset_accurate {
 309    my $ps = shift;
 310    
 311    # switch to that branch if we're not already in that branch:
 312    if (-e "$git_dir/refs/heads/$ps->{branch}") {
 313       system('git-checkout','-f',$ps->{branch}) == 0 or die "$! $?\n";
 314
 315       # remove any old stuff that got leftover:
 316       my $rm = safe_pipe_capture('git-ls-files','--others','-z');
 317       rmtree(split(/\0/,$rm)) if $rm;
 318    }
 319    
 320    # Apply the import/changeset/merge into the working tree
 321    my $dir = sync_to_ps($ps);
 322    # read the new log entry:
 323    my @commitlog = safe_pipe_capture($TLA,'cat-log','-d',$dir,$ps->{id});
 324    die "Error in cat-log: $!" if $?;
 325    chomp @commitlog;
 326
 327    # grab variables we want from the log, new fields get added to $ps:
 328    # (author, date, email, summary, message body ...)
 329    parselog($ps, \@commitlog);
 330
 331    if ($ps->{id} =~ /--base-0$/ && $ps->{id} ne $psets[0]{id}) {
 332        # this should work when importing continuations 
 333        if ($ps->{tag} && (my $branchpoint = eval { ptag($ps->{tag}) })) {
 334            
 335            # find where we are supposed to branch from
 336            system('git-checkout','-f','-b',$ps->{branch},
 337                            $branchpoint) == 0 or die "$! $?\n";
 338            
 339            # remove any old stuff that got leftover:
 340            my $rm = safe_pipe_capture('git-ls-files','--others','-z');
 341            rmtree(split(/\0/,$rm)) if $rm;
 342
 343            # If we trust Arch with the fact that this is just 
 344            # a tag, and it does not affect the state of the tree
 345            # then we just tag and move on
 346            tag($ps->{id}, $branchpoint);
 347            ptag($ps->{id}, $branchpoint);
 348            print " * Tagged $ps->{id} at $branchpoint\n";
 349            return 0;
 350        } else {
 351            warn "Tagging from unknown id unsupported\n" if $ps->{tag};
 352        }
 353        # allow multiple bases/imports here since Arch supports cherry-picks
 354        # from unrelated trees
 355    } 
 356    
 357    # update the index with all the changes we got
 358    system('git-diff-files --name-only -z | '.
 359            'git-update-index --remove -z --stdin') == 0 or die "$! $?\n";
 360    system('git-ls-files --others -z | '.
 361            'git-update-index --add -z --stdin') == 0 or die "$! $?\n";
 362    return 1;
 363}
 364
 365# the native changeset processing strategy.  This is very fast, but
 366# does not handle permissions or any renames involving directories
 367sub process_patchset_fast {
 368    my $ps = shift;
 369    # 
 370    # create the branch if needed
 371    #
 372    if ($ps->{type} eq 'i' && !$import) {
 373        die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
 374    }
 375
 376    unless ($import) { # skip for import
 377        if ( -e "$git_dir/refs/heads/$ps->{branch}") {
 378            # we know about this branch
 379            system('git-checkout',$ps->{branch});
 380        } else {
 381            # new branch! we need to verify a few things
 382            die "Branch on a non-tag!" unless $ps->{type} eq 't';
 383            my $branchpoint = ptag($ps->{tag});
 384            die "Tagging from unknown id unsupported: $ps->{tag}" 
 385                unless $branchpoint;
 386            
 387            # find where we are supposed to branch from
 388            system('git-checkout','-b',$ps->{branch},$branchpoint);
 389
 390            # If we trust Arch with the fact that this is just 
 391            # a tag, and it does not affect the state of the tree
 392            # then we just tag and move on
 393            tag($ps->{id}, $branchpoint);
 394            ptag($ps->{id}, $branchpoint);
 395            print " * Tagged $ps->{id} at $branchpoint\n";
 396            return 0;
 397        } 
 398        die $! if $?;
 399    } 
 400
 401    #
 402    # Apply the import/changeset/merge into the working tree
 403    # 
 404    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 405        apply_import($ps) or die $!;
 406        $stats{import_or_tag}++;
 407        $import=0;
 408    } elsif ($ps->{type} eq 's') {
 409        apply_cset($ps);
 410        $stats{simple_changeset}++;
 411    }
 412
 413    #
 414    # prepare update git's index, based on what arch knows
 415    # about the pset, resolve parents, etc
 416    #
 417    
 418    my @commitlog = safe_pipe_capture($TLA,'cat-archive-log',$ps->{id}); 
 419    die "Error in cat-archive-log: $!" if $?;
 420        
 421    parselog($ps,\@commitlog);
 422
 423    # imports don't give us good info
 424    # on added files. Shame on them
 425    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 426        system('git-ls-files --deleted -z | '.
 427                'git-update-index --remove -z --stdin') == 0 or die "$! $?\n";
 428        system('git-ls-files --others -z | '.
 429                'git-update-index --add -z --stdin') == 0 or die "$! $?\n";
 430    }
 431
 432    # TODO: handle removed_directories and renamed_directories:
 433
 434    if (my $del = $ps->{removed_files}) {
 435        unlink @$del;
 436        while (@$del) {
 437            my @slice = splice(@$del, 0, 100);
 438            system('git-update-index','--remove','--',@slice) == 0 or
 439                            die "Error in git-update-index --remove: $! $?\n";
 440        }
 441    }
 442
 443    if (my $ren = $ps->{renamed_files}) {                # renamed
 444        if (@$ren % 2) {
 445            die "Odd number of entries in rename!?";
 446        }
 447        
 448        while (@$ren) {
 449            my $from = shift @$ren;
 450            my $to   = shift @$ren;           
 451
 452            unless (-d dirname($to)) {
 453                mkpath(dirname($to)); # will die on err
 454            }
 455            # print "moving $from $to";
 456            rename($from, $to) or die "Error renaming '$from' '$to': $!\n";
 457            system('git-update-index','--remove','--',$from) == 0 or
 458                            die "Error in git-update-index --remove: $! $?\n";
 459            system('git-update-index','--add','--',$to) == 0 or
 460                            die "Error in git-update-index --add: $! $?\n";
 461        }
 462    }
 463
 464    if (my $add = $ps->{new_files}) {
 465        while (@$add) {
 466            my @slice = splice(@$add, 0, 100);
 467            system('git-update-index','--add','--',@slice) == 0 or
 468                            die "Error in git-update-index --add: $! $?\n";
 469        }
 470    }
 471
 472    if (my $mod = $ps->{modified_files}) {
 473        while (@$mod) {
 474            my @slice = splice(@$mod, 0, 100);
 475            system('git-update-index','--',@slice) == 0 or
 476                            die "Error in git-update-index: $! $?\n";
 477        }
 478    }
 479    return 1; # we successfully applied the changeset
 480}
 481
 482if ($opt_f) {
 483    print "Will import patchsets using the fast strategy\n",
 484            "Renamed directories and permission changes will be missed\n";
 485    *process_patchset = *process_patchset_fast;
 486} else {
 487    print "Using the default (accurate) import strategy.\n",
 488            "Things may be a bit slow\n";
 489    *process_patchset = *process_patchset_accurate;
 490}
 491    
 492foreach my $ps (@psets) {
 493    # process patchsets
 494    $ps->{branch} = git_branchname($ps->{id});
 495
 496    #
 497    # ensure we have a clean state 
 498    # 
 499    if (my $dirty = `git-diff-files`) {
 500        die "Unclean tree when about to process $ps->{id} " .
 501            " - did we fail to commit cleanly before?\n$dirty";
 502    }
 503    die $! if $?;
 504    
 505    #
 506    # skip commits already in repo
 507    #
 508    if (ptag($ps->{id})) {
 509      $opt_v && print " * Skipping already imported: $ps->{id}\n";
 510      next;
 511    }
 512
 513    print " * Starting to work on $ps->{id}\n";
 514
 515    process_patchset($ps) or next;
 516
 517    # warn "errors when running git-update-index! $!";
 518    my $tree = `git-write-tree`;
 519    die "cannot write tree $!" if $?;
 520    chomp $tree;
 521    
 522    #
 523    # Who's your daddy?
 524    #
 525    my @par;
 526    if ( -e "$git_dir/refs/heads/$ps->{branch}") {
 527        if (open HEAD, "<","$git_dir/refs/heads/$ps->{branch}") {
 528            my $p = <HEAD>;
 529            close HEAD;
 530            chomp $p;
 531            push @par, '-p', $p;
 532        } else { 
 533            if ($ps->{type} eq 's') {
 534                warn "Could not find the right head for the branch $ps->{branch}";
 535            }
 536        }
 537    }
 538    
 539    if ($ps->{merges}) {
 540        push @par, find_parents($ps);
 541    }
 542
 543    #    
 544    # Commit, tag and clean state
 545    #
 546    $ENV{TZ}                  = 'GMT';
 547    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
 548    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
 549    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
 550    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
 551    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
 552    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
 553
 554    my $pid = open2(*READER, *WRITER,'git-commit-tree',$tree,@par) 
 555        or die $!;
 556    print WRITER $ps->{summary},"\n\n";
 557    print WRITER $ps->{message},"\n";
 558    
 559    # make it easy to backtrack and figure out which Arch revision this was:
 560    print WRITER 'git-archimport-id: ',$ps->{id},"\n";
 561    
 562    close WRITER;
 563    my $commitid = <READER>;    # read
 564    chomp $commitid;
 565    close READER;
 566    waitpid $pid,0;             # close;
 567
 568    if (length $commitid != 40) {
 569        die "Something went wrong with the commit! $! $commitid";
 570    }
 571    #
 572    # Update the branch
 573    # 
 574    open  HEAD, ">","$git_dir/refs/heads/$ps->{branch}";
 575    print HEAD $commitid;
 576    close HEAD;
 577    system('git-update-ref', 'HEAD', "$ps->{branch}");
 578
 579    # tag accordingly
 580    ptag($ps->{id}, $commitid); # private tag
 581    if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
 582        tag($ps->{id}, $commitid);
 583    }
 584    print " * Committed $ps->{id}\n";
 585    print "   + tree   $tree\n";
 586    print "   + commit $commitid\n";
 587    $opt_v && print "   + commit date is  $ps->{date} \n";
 588    $opt_v && print "   + parents:  ",join(' ',@par),"\n";
 589}
 590
 591if ($opt_v) {
 592    foreach (sort keys %stats) {
 593        print" $_: $stats{$_}\n";
 594    }
 595}
 596exit 0;
 597
 598# used by the accurate strategy:
 599sub sync_to_ps {
 600    my $ps = shift;
 601    my $tree_dir = $tmp.'/'.tree_dirname($ps->{id});
 602    
 603    $opt_v && print "sync_to_ps($ps->{id}) method: ";
 604
 605    if (-d $tree_dir) {
 606        if ($ps->{type} eq 't') {
 607            $opt_v && print "get (tag)\n";
 608            # looks like a tag-only or (worse,) a mixed tags/changeset branch,
 609            # can't rely on replay to work correctly on these
 610            rmtree($tree_dir);
 611            safe_pipe_capture($TLA,'get','--no-pristine',$ps->{id},$tree_dir);
 612            $stats{get_tag}++;
 613        } else {
 614                my $tree_id = arch_tree_id($tree_dir);
 615                if ($ps->{parent_id} && ($ps->{parent_id} eq $tree_id)) {
 616                    # the common case (hopefully)
 617                    $opt_v && print "replay\n";
 618                    safe_pipe_capture($TLA,'replay','-d',$tree_dir,$ps->{id});
 619                    $stats{replay}++;
 620                } else {
 621                    # getting one tree is usually faster than getting two trees
 622                    # and applying the delta ...
 623                    rmtree($tree_dir);
 624                    $opt_v && print "apply-delta\n";
 625                    safe_pipe_capture($TLA,'get','--no-pristine',
 626                                        $ps->{id},$tree_dir);
 627                    $stats{get_delta}++;
 628                }
 629        }
 630    } else {
 631        # new branch work
 632        $opt_v && print "get (new tree)\n";
 633        safe_pipe_capture($TLA,'get','--no-pristine',$ps->{id},$tree_dir);
 634        $stats{get_new}++;
 635    }
 636   
 637    # added -I flag to rsync since we're going to fast! AIEEEEE!!!!
 638    system('rsync','-aI','--delete','--exclude',$git_dir,
 639#               '--exclude','.arch-inventory',
 640                '--exclude','.arch-ids','--exclude','{arch}',
 641                '--exclude','+*','--exclude',',*',
 642                "$tree_dir/",'./') == 0 or die "Cannot rsync $tree_dir: $! $?";
 643    return $tree_dir;
 644}
 645
 646sub apply_import {
 647    my $ps = shift;
 648    my $bname = git_branchname($ps->{id});
 649
 650    mkpath($tmp);
 651
 652    safe_pipe_capture($TLA,'get','-s','--no-pristine',$ps->{id},"$tmp/import");
 653    die "Cannot get import: $!" if $?;    
 654    system('rsync','-aI','--delete', '--exclude',$git_dir,
 655                '--exclude','.arch-ids','--exclude','{arch}',
 656                "$tmp/import/", './');
 657    die "Cannot rsync import:$!" if $?;
 658    
 659    rmtree("$tmp/import");
 660    die "Cannot remove tempdir: $!" if $?;
 661    
 662
 663    return 1;
 664}
 665
 666sub apply_cset {
 667    my $ps = shift;
 668
 669    mkpath($tmp);
 670
 671    # get the changeset
 672    safe_pipe_capture($TLA,'get-changeset',$ps->{id},"$tmp/changeset");
 673    die "Cannot get changeset: $!" if $?;
 674    
 675    # apply patches
 676    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
 677        # this can be sped up considerably by doing
 678        #    (find | xargs cat) | patch
 679        # but that can get mucked up by patches
 680        # with missing trailing newlines or the standard 
 681        # 'missing newline' flag in the patch - possibly
 682        # produced with an old/buggy diff.
 683        # slow and safe, we invoke patch once per patchfile
 684        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
 685        die "Problem applying patches! $!" if $?;
 686    }
 687
 688    # apply changed binary files
 689    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
 690        foreach my $mod (@modified) {
 691            chomp $mod;
 692            my $orig = $mod;
 693            $orig =~ s/\.modified$//; # lazy
 694            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
 695            #print "rsync -p '$mod' '$orig'";
 696            system('rsync','-p',$mod,"./$orig");
 697            die "Problem applying binary changes! $!" if $?;
 698        }
 699    }
 700
 701    # bring in new files
 702    system('rsync','-aI','--exclude',$git_dir,
 703                '--exclude','.arch-ids',
 704                '--exclude', '{arch}',
 705                "$tmp/changeset/new-files-archive/",'./');
 706
 707    # deleted files are hinted from the commitlog processing
 708
 709    rmtree("$tmp/changeset");
 710}
 711
 712
 713# =for reference
 714# notes: *-files/-directories keys cannot have spaces, they're always
 715# pika-escaped.  Everything after the first newline
 716# A log entry looks like:
 717# Revision: moodle-org--moodle--1.3.3--patch-15
 718# Archive: arch-eduforge@catalyst.net.nz--2004
 719# Creator: Penny Leach <penny@catalyst.net.nz>
 720# Date: Wed May 25 14:15:34 NZST 2005
 721# Standard-date: 2005-05-25 02:15:34 GMT
 722# New-files: lang/de/.arch-ids/block_glossary_random.php.id
 723#     lang/de/.arch-ids/block_html.php.id
 724# New-directories: lang/de/help/questionnaire
 725#     lang/de/help/questionnaire/.arch-ids
 726# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
 727#    db_sears.sql db/db_sears.sql
 728# Removed-files: lang/be/docs/.arch-ids/release.html.id
 729#     lang/be/docs/.arch-ids/releaseold.html.id
 730# Modified-files: admin/cron.php admin/delete.php
 731#     admin/editor.html backup/lib.php backup/restore.php
 732# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
 733# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
 734#   summary can be multiline with a leading space just like the above fields
 735# Keywords:
 736#
 737# Updating yadda tadda tadda madda
 738sub parselog {
 739    my ($ps, $log) = @_;
 740    my $key = undef;
 741
 742    # headers we want that contain filenames:
 743    my %want_headers = (
 744        new_files => 1,
 745        modified_files => 1,
 746        renamed_files => 1,
 747        renamed_directories => 1,
 748        removed_files => 1,
 749        removed_directories => 1,
 750    );
 751    
 752    chomp (@$log);
 753    while ($_ = shift @$log) {
 754        if (/^Continuation-of:\s*(.*)/) {
 755            $ps->{tag} = $1;
 756            $key = undef;
 757        } elsif (/^Summary:\s*(.*)$/ ) {
 758            # summary can be multiline as long as it has a leading space.
 759            # we squeeze it onto a single line, though.
 760            $ps->{summary} = [ $1 ];
 761            $key = 'summary';
 762        } elsif (/^Creator: (.*)\s*<([^\>]+)>/) {
 763            $ps->{author} = $1;
 764            $ps->{email} = $2;
 765            $key = undef;
 766        # any *-files or *-directories can be read here:
 767        } elsif (/^([A-Z][a-z\-]+):\s*(.*)$/) {
 768            my $val = $2;
 769            $key = lc $1;
 770            $key =~ tr/-/_/; # too lazy to quote :P
 771            if ($want_headers{$key}) {
 772                push @{$ps->{$key}}, split(/\s+/, $val);
 773            } else {
 774                $key = undef;
 775            }
 776        } elsif (/^$/) {
 777            last; # remainder of @$log that didn't get shifted off is message
 778        } elsif ($key) {
 779            if (/^\s+(.*)$/) {
 780                if ($key eq 'summary') {
 781                    push @{$ps->{$key}}, $1;
 782                } else { # files/directories:
 783                    push @{$ps->{$key}}, split(/\s+/, $1);
 784                }
 785            } else {
 786                $key = undef;
 787            }
 788        }
 789    }
 790   
 791    # drop leading empty lines from the log message
 792    while (@$log && $log->[0] eq '') {
 793        shift @$log;
 794    }
 795    if (exists $ps->{summary} && @{$ps->{summary}}) {
 796        $ps->{summary} = join(' ', @{$ps->{summary}});
 797    }
 798    elsif (@$log == 0) {
 799        $ps->{summary} = 'empty commit message';
 800    } else {
 801        $ps->{summary} = $log->[0] . '...';
 802    }
 803    $ps->{message} = join("\n",@$log);
 804    
 805    # skip Arch control files, unescape pika-escaped files
 806    foreach my $k (keys %want_headers) {
 807        next unless (defined $ps->{$k});
 808        my @tmp = ();
 809        foreach my $t (@{$ps->{$k}}) {
 810           next unless length ($t);
 811           next if $t =~ m!\{arch\}/!;
 812           next if $t =~ m!\.arch-ids/!;
 813           # should we skip this?
 814           next if $t =~ m!\.arch-inventory$!;
 815           # tla cat-archive-log will give us filenames with spaces as file\(sp)name - why?
 816           # we can assume that any filename with \ indicates some pika escaping that we want to get rid of.
 817           if ($t =~ /\\/ ){
 818               $t = (safe_pipe_capture($TLA,'escape','--unescaped',$t))[0];
 819           }
 820           push @tmp, $t;
 821        }
 822        $ps->{$k} = \@tmp;
 823    }
 824}
 825
 826# write/read a tag
 827sub tag {
 828    my ($tag, $commit) = @_;
 829 
 830    if ($opt_o) {
 831        $tag =~ s|/|--|g;
 832    } else {
 833        # don't use subdirs for tags yet, it could screw up other porcelains
 834        $tag =~ s|/|,|g;
 835    }
 836    
 837    if ($commit) {
 838        open(C,">","$git_dir/refs/tags/$tag")
 839            or die "Cannot create tag $tag: $!\n";
 840        print C "$commit\n"
 841            or die "Cannot write tag $tag: $!\n";
 842        close(C)
 843            or die "Cannot write tag $tag: $!\n";
 844        print " * Created tag '$tag' on '$commit'\n" if $opt_v;
 845    } else {                    # read
 846        open(C,"<","$git_dir/refs/tags/$tag")
 847            or die "Cannot read tag $tag: $!\n";
 848        $commit = <C>;
 849        chomp $commit;
 850        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 851        close(C)
 852            or die "Cannot read tag $tag: $!\n";
 853        return $commit;
 854    }
 855}
 856
 857# write/read a private tag
 858# reads fail softly if the tag isn't there
 859sub ptag {
 860    my ($tag, $commit) = @_;
 861
 862    # don't use subdirs for tags yet, it could screw up other porcelains
 863    $tag =~ s|/|,|g; 
 864    
 865    my $tag_file = "$ptag_dir/$tag";
 866    my $tag_branch_dir = dirname($tag_file);
 867    mkpath($tag_branch_dir) unless (-d $tag_branch_dir);
 868
 869    if ($commit) {              # write
 870        open(C,">",$tag_file)
 871            or die "Cannot create tag $tag: $!\n";
 872        print C "$commit\n"
 873            or die "Cannot write tag $tag: $!\n";
 874        close(C)
 875            or die "Cannot write tag $tag: $!\n";
 876        $rptags{$commit} = $tag 
 877            unless $tag =~ m/--base-0$/;
 878    } else {                    # read
 879        # if the tag isn't there, return 0
 880        unless ( -s $tag_file) {
 881            return 0;
 882        }
 883        open(C,"<",$tag_file)
 884            or die "Cannot read tag $tag: $!\n";
 885        $commit = <C>;
 886        chomp $commit;
 887        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 888        close(C)
 889            or die "Cannot read tag $tag: $!\n";
 890        unless (defined $rptags{$commit}) {
 891            $rptags{$commit} = $tag;
 892        }
 893        return $commit;
 894    }
 895}
 896
 897sub find_parents {
 898    #
 899    # Identify what branches are merging into me
 900    # and whether we are fully merged
 901    # git-merge-base <headsha> <headsha> should tell
 902    # me what the base of the merge should be 
 903    #
 904    my $ps = shift;
 905
 906    my %branches; # holds an arrayref per branch
 907                  # the arrayref contains a list of
 908                  # merged patches between the base
 909                  # of the merge and the current head
 910
 911    my @parents;  # parents found for this commit
 912
 913    # simple loop to split the merges
 914    # per branch
 915    foreach my $merge (@{$ps->{merges}}) {
 916        my $branch = git_branchname($merge);
 917        unless (defined $branches{$branch} ){
 918            $branches{$branch} = [];
 919        }
 920        push @{$branches{$branch}}, $merge;
 921    }
 922
 923    #
 924    # foreach branch find a merge base and walk it to the 
 925    # head where we are, collecting the merged patchsets that
 926    # Arch has recorded. Keep that in @have
 927    # Compare that with the commits on the other branch
 928    # between merge-base and the tip of the branch (@need)
 929    # and see if we have a series of consecutive patches
 930    # starting from the merge base. The tip of the series
 931    # of consecutive patches merged is our new parent for 
 932    # that branch.
 933    #
 934    foreach my $branch (keys %branches) {
 935
 936        # check that we actually know about the branch
 937        next unless -e "$git_dir/refs/heads/$branch";
 938
 939        my $mergebase = `git-merge-base $branch $ps->{branch}`;
 940        if ($?) { 
 941            # Don't die here, Arch supports one-way cherry-picking
 942            # between branches with no common base (or any relationship
 943            # at all beforehand)
 944            warn "Cannot find merge base for $branch and $ps->{branch}";
 945            next;
 946        }
 947        chomp $mergebase;
 948
 949        # now walk up to the mergepoint collecting what patches we have
 950        my $branchtip = git_rev_parse($ps->{branch});
 951        my @ancestors = `git-rev-list --topo-order $branchtip ^$mergebase`;
 952        my %have; # collected merges this branch has
 953        foreach my $merge (@{$ps->{merges}}) {
 954            $have{$merge} = 1;
 955        }
 956        my %ancestorshave;
 957        foreach my $par (@ancestors) {
 958            $par = commitid2pset($par);
 959            if (defined $par->{merges}) {
 960                foreach my $merge (@{$par->{merges}}) {
 961                    $ancestorshave{$merge}=1;
 962                }
 963            }
 964        }
 965        # print "++++ Merges in $ps->{id} are....\n";
 966        # my @have = sort keys %have;   print Dumper(\@have);
 967
 968        # merge what we have with what ancestors have
 969        %have = (%have, %ancestorshave);
 970
 971        # see what the remote branch has - these are the merges we 
 972        # will want to have in a consecutive series from the mergebase
 973        my $otherbranchtip = git_rev_parse($branch);
 974        my @needraw = `git-rev-list --topo-order $otherbranchtip ^$mergebase`;
 975        my @need;
 976        foreach my $needps (@needraw) {         # get the psets
 977            $needps = commitid2pset($needps);
 978            # git-rev-list will also
 979            # list commits merged in via earlier 
 980            # merges. we are only interested in commits
 981            # from the branch we're looking at
 982            if ($branch eq $needps->{branch}) {
 983                push @need, $needps->{id};
 984            }
 985        }
 986
 987        # print "++++ Merges from $branch we want are....\n";
 988        # print Dumper(\@need);
 989
 990        my $newparent;
 991        while (my $needed_commit = pop @need) {
 992            if ($have{$needed_commit}) {
 993                $newparent = $needed_commit;
 994            } else {
 995                last; # break out of the while
 996            }
 997        }
 998        if ($newparent) {
 999            push @parents, $newparent;
1000        }
1001
1002
1003    } # end foreach branch
1004
1005    # prune redundant parents
1006    my %parents;
1007    foreach my $p (@parents) {
1008        $parents{$p} = 1;
1009    }
1010    foreach my $p (@parents) {
1011        next unless exists $psets{$p}{merges};
1012        next unless ref    $psets{$p}{merges};
1013        my @merges = @{$psets{$p}{merges}};
1014        foreach my $merge (@merges) {
1015            if ($parents{$merge}) { 
1016                delete $parents{$merge};
1017            }
1018        }
1019    }
1020
1021    @parents = ();
1022    foreach (keys %parents) {
1023        push @parents, '-p', ptag($_);
1024    }
1025    return @parents;
1026}
1027
1028sub git_rev_parse {
1029    my $name = shift;
1030    my $val  = `git-rev-parse $name`;
1031    die "Error: git-rev-parse $name" if $?;
1032    chomp $val;
1033    return $val;
1034}
1035
1036# resolve a SHA1 to a known patchset
1037sub commitid2pset {
1038    my $commitid = shift;
1039    chomp $commitid;
1040    my $name = $rptags{$commitid} 
1041        || die "Cannot find reverse tag mapping for $commitid";
1042    $name =~ s|,|/|;
1043    my $ps   = $psets{$name} 
1044        || (print Dumper(sort keys %psets)) && die "Cannot find patchset for $name";
1045    return $ps;
1046}
1047
1048
1049# an alternative to `command` that allows input to be passed as an array
1050# to work around shell problems with weird characters in arguments
1051sub safe_pipe_capture {
1052    my @output;
1053    if (my $pid = open my $child, '-|') {
1054        @output = (<$child>);
1055        close $child or die join(' ',@_).": $! $?";
1056    } else {
1057        exec(@_) or die "$! $?"; # exec() can fail the executable can't be found
1058    }
1059    return wantarray ? @output : join('',@output);
1060}
1061
1062# `tla logs -rf -d <dir> | head -n1` or `baz tree-id <dir>`
1063sub arch_tree_id {
1064    my $dir = shift;
1065    chomp( my $ret = (safe_pipe_capture($TLA,'logs','-rf','-d',$dir))[0] );
1066    return $ret;
1067}
1068
1069sub archive_reachable {
1070    my $archive = shift;
1071    return 1 if $reachable{$archive};
1072    return 0 if $unreachable{$archive};
1073    
1074    if (system "$TLA whereis-archive $archive >/dev/null") {
1075        if ($opt_a && (system($TLA,'register-archive',
1076                      "http://mirrors.sourcecontrol.net/$archive") == 0)) {
1077            $reachable{$archive} = 1;
1078            return 1;
1079        }
1080        print STDERR "Archive is unreachable: $archive\n";
1081        $unreachable{$archive} = 1;
1082        return 0;
1083    } else {
1084        $reachable{$archive} = 1;
1085        return 1;
1086    }
1087}
1088