git-archimport-scripton commit [PATCH] archimport autodetects import status, supports incremental imports (3292ae4)
   1#!/usr/bin/perl -w
   2#
   3# This tool is copyright (c) 2005, Martin Langhoff.
   4# It is released under the Gnu Public License, version 2.
   5#
   6# The basic idea is to walk the output of tla abrowse, 
   7# fetch the changesets and apply them. 
   8#
   9=head1 Invocation
  10
  11    git-archimport-script -i <archive>/<branch> [<archive>/<branch>]
  12    [ <archive>/<branch> ]
  13
  14    The script expects you to provide the key roots where it can start the
  15    import from an 'initial import' or 'tag' type of Arch commit. It will
  16    then follow all the branching and tagging within the provided roots.
  17
  18    It will die if it sees branches that have different roots. 
  19
  20=head2 TODO
  21
  22 - keep track of merged patches, and mark a git merge when it happens
  23 - smarter rules to parse the archive history "up" and "down"
  24 - be able to continue an import where we left off
  25 - audit shell-escaping of filenames
  26
  27=head1 Devel tricks
  28
  29Add print in front of the shell commands invoked via backticks. 
  30
  31=cut
  32
  33use strict;
  34use warnings;
  35use Getopt::Std;
  36use File::Spec;
  37use File::Temp qw(tempfile);
  38use File::Path qw(mkpath);
  39use File::Basename qw(basename dirname);
  40use String::ShellQuote;
  41use Time::Local;
  42use IO::Socket;
  43use IO::Pipe;
  44use POSIX qw(strftime dup2);
  45use Data::Dumper qw/ Dumper /;
  46use IPC::Open2;
  47
  48$SIG{'PIPE'}="IGNORE";
  49$ENV{'TZ'}="UTC";
  50
  51our($opt_h,$opt_v, $opt_T,
  52    $opt_C,$opt_t);
  53
  54sub usage() {
  55    print STDERR <<END;
  56Usage: ${\basename $0}     # fetch/update GIT from Arch
  57       [ -h ] [ -v ] [ -T ] 
  58       [ -C GIT_repository ] [ -t tempdir ] 
  59       repository/arch-branch [ repository/arch-branch] ...
  60END
  61    exit(1);
  62}
  63
  64getopts("hviC:t:") or usage();
  65usage if $opt_h;
  66
  67@ARGV >= 1 or usage();
  68my @arch_roots = @ARGV;
  69
  70my $tmp = $opt_t;
  71$tmp ||= '/tmp';
  72$tmp .= '/git-archimport/';
  73
  74my $git_tree = $opt_C;
  75$git_tree ||= ".";
  76
  77
  78my @psets  = ();                # the collection
  79
  80foreach my $root (@arch_roots) {
  81    my ($arepo, $abranch) = split(m!/!, $root);
  82    open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |" 
  83        or die "Problems with tla abrowse: $!";
  84    
  85    my %ps        = ();         # the current one
  86    my $mode      = '';
  87    my $lastseen  = '';
  88    
  89    while (<ABROWSE>) {
  90        chomp;
  91        
  92        # first record padded w 8 spaces
  93        if (s/^\s{8}\b//) {
  94            
  95            # store the record we just captured
  96            if (%ps) {
  97                my %temp = %ps; # break references
  98                push (@psets, \%temp);
  99                %ps = ();
 100            }
 101            
 102            my ($id, $type) = split(m/\s{3}/, $_);
 103            $ps{id}   = $id;
 104            $ps{repo} = $arepo;
 105
 106            # deal with types
 107            if ($type =~ m/^\(simple changeset\)/) {
 108                $ps{type} = 's';
 109            } elsif ($type eq '(initial import)') {
 110                $ps{type} = 'i';
 111            } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
 112                $ps{type} = 't';
 113                $ps{tag}  = $1;
 114            } else { 
 115                warn "Unknown type $type";
 116            }
 117            $lastseen = 'id';
 118        }
 119        
 120        if (s/^\s{10}//) { 
 121            # 10 leading spaces or more 
 122            # indicate commit metadata
 123            
 124            # date & author 
 125            if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
 126                
 127                my ($date, $authoremail) = split(m/\s{2,}/, $_);
 128                $ps{date}   = $date;
 129                $ps{date}   =~ s/\bGMT$//; # strip off trailign GMT
 130                if ($ps{date} =~ m/\b\w+$/) {
 131                    warn 'Arch dates not in GMT?! - imported dates will be wrong';
 132                }
 133            
 134                $authoremail =~ m/^(.+)\s(\S+)$/;
 135                $ps{author} = $1;
 136                $ps{email}  = $2;
 137            
 138                $lastseen = 'date';
 139            
 140            } elsif ($lastseen eq 'date') {
 141                # the only hint is position
 142                # subject is after date
 143                $ps{subj} = $_;
 144                $lastseen = 'subj';
 145            
 146            } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
 147                $ps{merges} = [];
 148                $lastseen = 'merges';
 149            
 150            } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
 151                push (@{$ps{merges}}, $_);
 152            } else {
 153                warn 'more metadata after merges!?';
 154            }
 155            
 156        }
 157    }
 158
 159    if (%ps) {
 160        my %temp = %ps;         # break references
 161        push (@psets, \%temp);
 162        %ps = ();
 163    }    
 164    close ABROWSE;
 165}                               # end foreach $root
 166
 167## Order patches by time
 168@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
 169
 170#print Dumper \@psets;
 171
 172##
 173## TODO cleanup irrelevant patches
 174##      and put an initial import
 175##      or a full tag
 176my $import = 0;
 177unless (-d '.git') { # initial import
 178    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
 179        print "Starting import from $psets[0]{id}\n";
 180        `git-init-db`;
 181        die $! if $?;
 182        $import = 1;
 183    } else {
 184        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
 185    }
 186}
 187
 188# process patchsets
 189foreach my $ps (@psets) {
 190
 191    $ps->{branch} =  branchname($ps->{id});
 192
 193    #
 194    # ensure we have a clean state 
 195    # 
 196    if (`git diff-files`) {
 197        die "Unclean tree when about to process $ps->{id} " .
 198            " - did we fail to commit cleanly before?";
 199    }
 200    die $! if $?;
 201
 202    #
 203    # skip commits already in repo
 204    #
 205    if (ptag($ps->{id})) {
 206      $opt_v && print "Skipping already imported: $ps->{id}\n";
 207      next;
 208    }
 209
 210    # 
 211    # create the branch if needed
 212    #
 213    if ($ps->{type} eq 'i' && !$import) {
 214        die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
 215    }
 216
 217    unless ($import) { # skip for import
 218        if ( -e ".git/refs/heads/$ps->{branch}") {
 219            # we know about this branch
 220            `git checkout    $ps->{branch}`;
 221        } else {
 222            # new branch! we need to verify a few things
 223            die "Branch on a non-tag!" unless $ps->{type} eq 't';
 224            my $branchpoint = ptag($ps->{tag});
 225            die "Tagging from unknown id unsupported: $ps->{tag}" 
 226                unless $branchpoint;
 227            
 228            # find where we are supposed to branch from
 229            `git checkout -b $ps->{branch} $branchpoint`;
 230        } 
 231        die $! if $?;
 232    } 
 233
 234    #
 235    # Apply the import/changeset/merge into the working tree
 236    # 
 237    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 238        apply_import($ps) or die $!;
 239        $import=0;
 240    } elsif ($ps->{type} eq 's') {
 241        apply_cset($ps);
 242    }
 243
 244    #
 245    # prepare update git's index, based on what arch knows
 246    # about the pset, resolve parents, etc
 247    #
 248    my $tree;
 249    
 250    my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`; 
 251    die "Error in cat-archive-log: $!" if $?;
 252        
 253    # parselog will git-add/rm files
 254    # and generally prepare things for the commit
 255    # NOTE: parselog will shell-quote filenames! 
 256    my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
 257    my $logmessage = "$sum\n$msg";
 258
 259
 260    # imports don't give us good info
 261    # on added files. Shame on them
 262    if ($ps->{type} eq 'i' || $ps->{type} eq 't') { 
 263        `find . -type f -print0 | grep -zv '^./.git' | xargs -0 -l100 git-update-cache --add`;
 264        `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-cache --remove`; 
 265    }
 266
 267    if (@$add) {
 268        while (@$add) {
 269            my @slice = splice(@$add, 0, 100);
 270            my $slice = join(' ', @slice);          
 271            `git-update-cache --add $slice`;
 272            die "Error in git-update-cache --add: $!" if $?;
 273        }
 274    }
 275    if (@$del) {
 276        foreach my $file (@$del) {
 277            unlink $file or die "Problems deleting $file : $!";
 278        }
 279        while (@$del) {
 280            my @slice = splice(@$del, 0, 100);
 281            my $slice = join(' ', @slice);
 282            `git-update-cache --remove $slice`;
 283            die "Error in git-update-cache --remove: $!" if $?;
 284        }
 285    }
 286    if (@$ren) {                # renamed
 287        if (@$ren % 2) {
 288            die "Odd number of entries in rename!?";
 289        }
 290        ;
 291        while (@$ren) {
 292            my $from = pop @$ren;
 293            my $to   = pop @$ren;           
 294
 295            unless (-d dirname($to)) {
 296                mkpath(dirname($to)); # will die on err
 297            }
 298            #print "moving $from $to";
 299            `mv $from $to`;
 300            die "Error renaming $from $to : $!" if $?;
 301            `git-update-cache --remove $from`;
 302            die "Error in git-update-cache --remove: $!" if $?;
 303            `git-update-cache --add $to`;
 304            die "Error in git-update-cache --add: $!" if $?;
 305        }
 306
 307    }
 308    if (@$mod) {                # must be _after_ renames
 309        while (@$mod) {
 310            my @slice = splice(@$mod, 0, 100);
 311            my $slice = join(' ', @slice);
 312            `git-update-cache $slice`;
 313            die "Error in git-update-cache: $!" if $?;
 314        }
 315    }
 316
 317    # warn "errors when running git-update-cache! $!";
 318    $tree = `git-write-tree`;
 319    die "cannot write tree $!" if $?;
 320    chomp $tree;
 321        
 322    
 323    #
 324    # Who's your daddy?
 325    #
 326    my @par;
 327    if ( -e ".git/refs/heads/$ps->{branch}") {
 328        if (open HEAD, "<.git/refs/heads/$ps->{branch}") {
 329            my $p = <HEAD>;
 330            close HEAD;
 331            chomp $p;
 332            push @par, '-p', $p;
 333        } else { 
 334            if ($ps->{type} eq 's') {
 335                warn "Could not find the right head for the branch $ps->{branch}";
 336            }
 337        }
 338    }
 339    
 340    my $par = join (' ', @par);
 341
 342    #    
 343    # Commit, tag and clean state
 344    #
 345    $ENV{TZ}                  = 'GMT';
 346    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
 347    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
 348    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
 349    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
 350    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
 351    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
 352
 353    my ($pid, $commit_rh, $commit_wh);
 354    $commit_rh = 'commit_rh';
 355    $commit_wh = 'commit_wh';
 356    
 357    $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par") 
 358        or die $!;
 359    print WRITER $logmessage;   # write
 360    close WRITER;
 361    my $commitid = <READER>;    # read
 362    chomp $commitid;
 363    close READER;
 364    waitpid $pid,0;             # close;
 365
 366    if (length $commitid != 40) {
 367        die "Something went wrong with the commit! $! $commitid";
 368    }
 369    #
 370    # Update the branch
 371    # 
 372    open  HEAD, ">.git/refs/heads/$ps->{branch}";
 373    print HEAD $commitid;
 374    close HEAD;
 375    unlink ('.git/HEAD');
 376    symlink("refs/heads/$ps->{branch}",".git/HEAD");
 377
 378    # tag accordingly
 379    ptag($ps->{id}, $commitid); # private tag
 380    if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
 381        tag($ps->{id}, $commitid);
 382    }
 383    print " * Committed $ps->{id}\n";
 384    print "   + tree   $tree\n";
 385    print "   + commit $commitid\n";
 386    # print "   + commit date is  $ps->{date} \n";
 387}
 388
 389sub branchname {
 390    my $id = shift;
 391    $id =~ s#^.+?/##;
 392    my @parts = split(m/--/, $id);
 393    return join('--', @parts[0..1]);
 394}
 395
 396sub apply_import {
 397    my $ps = shift;
 398    my $bname = branchname($ps->{id});
 399
 400    `mkdir -p $tmp`;
 401
 402    `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
 403    die "Cannot get import: $!" if $?;    
 404    `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
 405    die "Cannot rsync import:$!" if $?;
 406    
 407    `rm -fr $tmp/import`;
 408    die "Cannot remove tempdir: $!" if $?;
 409    
 410
 411    return 1;
 412}
 413
 414sub apply_cset {
 415    my $ps = shift;
 416
 417    `mkdir -p $tmp`;
 418
 419    # get the changeset
 420    `tla get-changeset  -A $ps->{repo} $ps->{id} $tmp/changeset`;
 421    die "Cannot get changeset: $!" if $?;
 422    
 423    # apply patches
 424    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
 425        # this can be sped up considerably by doing
 426        #    (find | xargs cat) | patch
 427        # but that cna get mucked up by patches
 428        # with missing trailing newlines or the standard 
 429        # 'missing newline' flag in the patch - possibly
 430        # produced with an old/buggy diff.
 431        # slow and safe, we invoke patch once per patchfile
 432        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
 433        die "Problem applying patches! $!" if $?;
 434    }
 435
 436    # apply changed binary files
 437    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
 438        foreach my $mod (@modified) {
 439            chomp $mod;
 440            my $orig = $mod;
 441            $orig =~ s/\.modified$//; # lazy
 442            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
 443            #print "rsync -p '$mod' '$orig'";
 444            `rsync -p $mod ./$orig`;
 445            die "Problem applying binary changes! $!" if $?;
 446        }
 447    }
 448
 449    # bring in new files
 450    `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
 451
 452    # deleted files are hinted from the commitlog processing
 453
 454    `rm -fr $tmp/changeset`;
 455}
 456
 457
 458# =for reference
 459# A log entry looks like 
 460# Revision: moodle-org--moodle--1.3.3--patch-15
 461# Archive: arch-eduforge@catalyst.net.nz--2004
 462# Creator: Penny Leach <penny@catalyst.net.nz>
 463# Date: Wed May 25 14:15:34 NZST 2005
 464# Standard-date: 2005-05-25 02:15:34 GMT
 465# New-files: lang/de/.arch-ids/block_glossary_random.php.id
 466#     lang/de/.arch-ids/block_html.php.id
 467# New-directories: lang/de/help/questionnaire
 468#     lang/de/help/questionnaire/.arch-ids
 469# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
 470#    db_sears.sql db/db_sears.sql
 471# Removed-files: lang/be/docs/.arch-ids/release.html.id
 472#     lang/be/docs/.arch-ids/releaseold.html.id
 473# Modified-files: admin/cron.php admin/delete.php
 474#     admin/editor.html backup/lib.php backup/restore.php
 475# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
 476# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
 477# Keywords:
 478#
 479# Updating yadda tadda tadda madda
 480sub parselog {
 481    my $log = shift;
 482    #print $log;
 483
 484    my (@add, @del, @mod, @ren, @kw, $sum, $msg );
 485
 486    if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
 487        my $files = $1;
 488        @add = split(m/\s+/s, $files);
 489    }
 490       
 491    if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
 492        my $files = $1;
 493        @del = split(m/\s+/s, $files);
 494    }
 495    
 496    if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
 497        my $files = $1;
 498        @mod = split(m/\s+/s, $files);
 499    }
 500    
 501    if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
 502        my $files = $1;
 503        @ren = split(m/\s+/s, $files);
 504    }
 505
 506    $sum ='';
 507    if ($log =~ m/^Summary:(.+?)$/m ) {
 508        $sum = $1;
 509        $sum =~ s/^\s+//;
 510        $sum =~ s/\s+$//;
 511    }
 512
 513    $msg = '';
 514    if ($log =~ m/\n\n(.+)$/s) {
 515        $msg = $1;
 516        $msg =~ s/^\s+//;
 517        $msg =~ s/\s+$//;
 518    }
 519
 520
 521    # cleanup the arrays
 522    foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
 523        my @tmp = ();
 524        while (my $t = pop @$ref) {
 525            next unless length ($t);
 526            next if $t =~ m!\{arch\}/!;
 527            next if $t =~ m!\.arch-ids/!;
 528            next if $t =~ m!\.arch-inventory$!;
 529            push (@tmp, shell_quote($t));
 530        }
 531        @$ref = @tmp;
 532    }
 533    
 534    #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren]; 
 535    return       ($sum, $msg, \@add, \@del, \@mod, \@ren); 
 536}
 537
 538# write/read a tag
 539sub tag {
 540    my ($tag, $commit) = @_;
 541    $tag =~ s|/|--|g; 
 542    $tag = shell_quote($tag);
 543    
 544    if ($commit) {
 545        open(C,">.git/refs/tags/$tag")
 546            or die "Cannot create tag $tag: $!\n";
 547        print C "$commit\n"
 548            or die "Cannot write tag $tag: $!\n";
 549        close(C)
 550            or die "Cannot write tag $tag: $!\n";
 551        print "Created tag '$tag' on '$commit'\n" if $opt_v;
 552    } else {                    # read
 553        open(C,"<.git/refs/tags/$tag")
 554            or die "Cannot read tag $tag: $!\n";
 555        $commit = <C>;
 556        chomp $commit;
 557        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 558        close(C)
 559            or die "Cannot read tag $tag: $!\n";
 560        return $commit;
 561    }
 562}
 563
 564# write/read a private tag
 565# reads fail softly if the tag isn't there
 566sub ptag {
 567    my ($tag, $commit) = @_;
 568    $tag =~ s|/|--|g; 
 569    $tag = shell_quote($tag);
 570    
 571    unless (-d '.git/archimport/tags') {
 572        mkpath('.git/archimport/tags');
 573    }
 574
 575    if ($commit) {              # write
 576        open(C,">.git/archimport/tags/$tag")
 577            or die "Cannot create tag $tag: $!\n";
 578        print C "$commit\n"
 579            or die "Cannot write tag $tag: $!\n";
 580        close(C)
 581            or die "Cannot write tag $tag: $!\n";
 582    } else {                    # read
 583        # if the tag isn't there, return 0
 584        unless ( -s ".git/archimport/tags/$tag") {
 585            return 0;
 586        }
 587        open(C,"<.git/archimport/tags/$tag")
 588            or die "Cannot read tag $tag: $!\n";
 589        $commit = <C>;
 590        chomp $commit;
 591        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 592        close(C)
 593            or die "Cannot read tag $tag: $!\n";
 594        return $commit;
 595    }
 596}