git-archimport-scripton commit [PATCH] Doc: replace read-cache with git-read-tree. (baeda3a)
   1#!/usr/bin/perl -w
   2#
   3# This tool is copyright (c) 2005, Martin Langhoff.
   4# It is released under the Gnu Public License, version 2.
   5#
   6# The basic idea is to walk the output of tla abrowse, 
   7# fetch the changesets and apply them. 
   8#
   9=head1 Invocation
  10
  11    git-archimport-script -i <archive>/<branch> [<archive>/<branch>]
  12    [ <archive>/<branch> ]
  13
  14    The script expects you to provide the key roots where it can start the
  15    import from an 'initial import' or 'tag' type of Arch commit. It will
  16    then follow all the branching and tagging within the provided roots.
  17
  18    It will die if it sees branches that have different roots. 
  19
  20=head2 TODO
  21
  22 - keep track of merged patches, and mark a git merge when it happens
  23 - smarter rules to parse the archive history "up" and "down"
  24 - be able to continue an import where we left off
  25 - audit shell-escaping of filenames
  26
  27=head1 Devel tricks
  28
  29Add print in front of the shell commands invoked via backticks. 
  30
  31=cut
  32
  33use strict;
  34use warnings;
  35use Getopt::Std;
  36use File::Spec;
  37use File::Temp qw(tempfile);
  38use File::Path qw(mkpath);
  39use File::Basename qw(basename dirname);
  40use String::ShellQuote;
  41use Time::Local;
  42use IO::Socket;
  43use IO::Pipe;
  44use POSIX qw(strftime dup2);
  45use Data::Dumper qw/ Dumper /;
  46use IPC::Open2;
  47
  48$SIG{'PIPE'}="IGNORE";
  49$ENV{'TZ'}="UTC";
  50
  51our($opt_h,$opt_v, $opt_T,
  52    $opt_C,$opt_t, $opt_i);
  53
  54sub usage() {
  55    print STDERR <<END;
  56Usage: ${\basename $0}     # fetch/update GIT from Arch
  57       [ -h ] [ -v ] [ -i ] [ -T ] 
  58       [ -C GIT_repository ] [ -t tempdir ] 
  59       repository/arch-branch [ repository/arch-branch] ...
  60END
  61    exit(1);
  62}
  63
  64getopts("hviC:t:") or usage();
  65usage if $opt_h;
  66
  67@ARGV >= 1 or usage();
  68my @arch_roots = @ARGV;
  69
  70my $tmp = $opt_t;
  71$tmp ||= '/tmp';
  72$tmp .= '/git-archimport/';
  73
  74my $git_tree = $opt_C;
  75$git_tree ||= ".";
  76
  77
  78my @psets  = ();                # the collection
  79
  80foreach my $root (@arch_roots) {
  81    my ($arepo, $abranch) = split(m!/!, $root);
  82    open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |" 
  83        or die "Problems with tla abrowse: $!";
  84    
  85    my %ps        = ();         # the current one
  86    my $mode      = '';
  87    my $lastseen  = '';
  88    
  89    while (<ABROWSE>) {
  90        chomp;
  91        
  92        # first record padded w 8 spaces
  93        if (s/^\s{8}\b//) {
  94            
  95            # store the record we just captured
  96            if (%ps) {
  97                my %temp = %ps; # break references
  98                push (@psets, \%temp);
  99                %ps = ();
 100            }
 101            
 102            my ($id, $type) = split(m/\s{3}/, $_);
 103            $ps{id}   = $id;
 104            $ps{repo} = $arepo;
 105
 106            # deal with types
 107            if ($type =~ m/^\(simple changeset\)/) {
 108                $ps{type} = 's';
 109            } elsif ($type eq '(initial import)') {
 110                $ps{type} = 'i';
 111            } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
 112                $ps{type} = 't';
 113                $ps{tag}  = $1;
 114            } else { 
 115                warn "Unknown type $type";
 116            }
 117            $lastseen = 'id';
 118        }
 119        
 120        if (s/^\s{10}//) { 
 121            # 10 leading spaces or more 
 122            # indicate commit metadata
 123            
 124            # date & author 
 125            if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
 126                
 127                my ($date, $authoremail) = split(m/\s{2,}/, $_);
 128                $ps{date}   = $date;
 129                $ps{date}   =~ s/\bGMT$//; # strip off trailign GMT
 130                if ($ps{date} =~ m/\b\w+$/) {
 131                    warn 'Arch dates not in GMT?! - imported dates will be wrong';
 132                }
 133            
 134                $authoremail =~ m/^(.+)\s(\S+)$/;
 135                $ps{author} = $1;
 136                $ps{email}  = $2;
 137            
 138                $lastseen = 'date';
 139            
 140            } elsif ($lastseen eq 'date') {
 141                # the only hint is position
 142                # subject is after date
 143                $ps{subj} = $_;
 144                $lastseen = 'subj';
 145            
 146            } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
 147                $ps{merges} = [];
 148                $lastseen = 'merges';
 149            
 150            } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
 151                push (@{$ps{merges}}, $_);
 152            } else {
 153                warn 'more metadata after merges!?';
 154            }
 155            
 156        }
 157    }
 158
 159    if (%ps) {
 160        my %temp = %ps;         # break references
 161        push (@psets, \%temp);
 162        %ps = ();
 163    }    
 164    close ABROWSE;
 165}                               # end foreach $root
 166
 167## Order patches by time
 168@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
 169
 170#print Dumper \@psets;
 171
 172##
 173## TODO cleanup irrelevant patches
 174##      and put an initial import
 175##      or a full tag
 176
 177if ($opt_i) {                   # initial import 
 178    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
 179        print "Starting import from $psets[0]{id}\n";
 180    } else {
 181        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
 182    }
 183    `git-init-db`;
 184    die $! if $?;
 185}
 186
 187# process
 188my $lastbranch = branchname($psets[0]{id}); # only good for initial import
 189my $importseen = $opt_i ? 0 : 1; # start at 1 if opt_i
 190
 191foreach my $ps (@psets) {
 192
 193    $ps->{branch} =  branchname($ps->{id});
 194
 195    #
 196    # ensure we have a clean state 
 197    # 
 198    if (`git diff-files`) {
 199        die "Unclean tree when about to process $ps->{id} " .
 200            " - did we fail to commit cleanly before?";
 201    }
 202    die $! if $?;
 203
 204    # 
 205    # create the branch if needed
 206    #
 207    if ($ps->{type} eq 'i' && $importseen) {
 208        die "Should not have more than one 'Initial import' per GIT import";
 209    }
 210
 211    unless ($opt_i && !$importseen) { # skip for first commit
 212        if ( -e ".git/refs/heads/$ps->{branch}") {
 213            # we know about this branch
 214            `git checkout    $ps->{branch}`;
 215        } else {
 216            # new branch! we need to verify a few things
 217            die "Branch on a non-tag!" unless $ps->{type} eq 't';
 218            my $branchpoint = ptag($ps->{tag});
 219            die "Tagging from unknown id unsupported: $ps->{tag}" 
 220                unless $branchpoint;
 221            
 222            # find where we are supposed to branch from
 223            `git checkout -b $ps->{branch} $branchpoint`;
 224        } 
 225        die $! if $?;
 226    } 
 227
 228        
 229    #
 230    # Apply the import/changeset/merge into the working tree
 231    # 
 232    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 233        $importseen = 1;
 234        apply_import($ps) or die $!;
 235    } elsif ($ps->{type} eq 's') {
 236        apply_cset($ps);
 237    }
 238
 239    #
 240    # prepare update git's index, based on what arch knows
 241    # about the pset, resolve parents, etc
 242    #
 243    my $tree;
 244    
 245    my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`; 
 246    die "Error in cat-archive-log: $!" if $?;
 247        
 248    # parselog will git-add/rm files
 249    # and generally prepare things for the commit
 250    # NOTE: parselog will shell-quote filenames! 
 251    my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
 252    my $logmessage = "$sum\n$msg";
 253
 254
 255    # imports don't give us good info
 256    # on added files. Shame on them
 257    if ($ps->{type} eq 'i' || $ps->{type} eq 't') { 
 258        `find . -type f -print0 | grep -zv '^./.git' | xargs -0 -l100 git-update-cache --add`;
 259        `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-cache --remove`; 
 260    }
 261
 262    if (@$add) {
 263        while (@$add) {
 264            my @slice = splice(@$add, 0, 100);
 265            my $slice = join(' ', @slice);          
 266            `git-update-cache --add $slice`;
 267            die "Error in git-update-cache --add: $!" if $?;
 268        }
 269    }
 270    if (@$del) {
 271        foreach my $file (@$del) {
 272            unlink $file or die "Problems deleting $file : $!";
 273        }
 274        while (@$del) {
 275            my @slice = splice(@$del, 0, 100);
 276            my $slice = join(' ', @slice);
 277            `git-update-cache --remove $slice`;
 278            die "Error in git-update-cache --remove: $!" if $?;
 279        }
 280    }
 281    if (@$ren) {                # renamed
 282        if (@$ren % 2) {
 283            die "Odd number of entries in rename!?";
 284        }
 285        ;
 286        while (@$ren) {
 287            my $from = pop @$ren;
 288            my $to   = pop @$ren;           
 289
 290            unless (-d dirname($to)) {
 291                mkpath(dirname($to)); # will die on err
 292            }
 293            #print "moving $from $to";
 294            `mv $from $to`;
 295            die "Error renaming $from $to : $!" if $?;
 296            `git-update-cache --remove $from`;
 297            die "Error in git-update-cache --remove: $!" if $?;
 298            `git-update-cache --add $to`;
 299            die "Error in git-update-cache --add: $!" if $?;
 300        }
 301
 302    }
 303    if (@$mod) {                # must be _after_ renames
 304        while (@$mod) {
 305            my @slice = splice(@$mod, 0, 100);
 306            my $slice = join(' ', @slice);
 307            `git-update-cache $slice`;
 308            die "Error in git-update-cache: $!" if $?;
 309        }
 310    }
 311
 312    # warn "errors when running git-update-cache! $!";
 313    $tree = `git-write-tree`;
 314    die "cannot write tree $!" if $?;
 315    chomp $tree;
 316        
 317    
 318    #
 319    # Who's your daddy?
 320    #
 321    my @par;
 322    if ( -e ".git/refs/heads/$ps->{branch}") {
 323        if (open HEAD, "<.git/refs/heads/$ps->{branch}") {
 324            my $p = <HEAD>;
 325            close HEAD;
 326            chomp $p;
 327            push @par, '-p', $p;
 328        } else { 
 329            if ($ps->{type} eq 's') {
 330                warn "Could not find the right head for the branch $ps->{branch}";
 331            }
 332        }
 333    }
 334    
 335    my $par = join (' ', @par);
 336
 337    #    
 338    # Commit, tag and clean state
 339    #
 340    $ENV{TZ}                  = 'GMT';
 341    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
 342    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
 343    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
 344    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
 345    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
 346    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
 347
 348    my ($pid, $commit_rh, $commit_wh);
 349    $commit_rh = 'commit_rh';
 350    $commit_wh = 'commit_wh';
 351    
 352    $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par") 
 353        or die $!;
 354    print WRITER $logmessage;   # write
 355    close WRITER;
 356    my $commitid = <READER>;    # read
 357    chomp $commitid;
 358    close READER;
 359    waitpid $pid,0;             # close;
 360
 361    if (length $commitid != 40) {
 362        die "Something went wrong with the commit! $! $commitid";
 363    }
 364    #
 365    # Update the branch
 366    # 
 367    open  HEAD, ">.git/refs/heads/$ps->{branch}";
 368    print HEAD $commitid;
 369    close HEAD;
 370    unlink ('.git/HEAD');
 371    symlink("refs/heads/$ps->{branch}",".git/HEAD");
 372
 373    # tag accordingly
 374    ptag($ps->{id}, $commitid); # private tag
 375    if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
 376        tag($ps->{id}, $commitid);
 377    }
 378    print " * Committed $ps->{id}\n";
 379    print "   + tree   $tree\n";
 380    print "   + commit $commitid\n";
 381    # print "   + commit date is  $ps->{date} \n";
 382}
 383
 384sub branchname {
 385    my $id = shift;
 386    $id =~ s#^.+?/##;
 387    my @parts = split(m/--/, $id);
 388    return join('--', @parts[0..1]);
 389}
 390
 391sub apply_import {
 392    my $ps = shift;
 393    my $bname = branchname($ps->{id});
 394
 395    `mkdir -p $tmp`;
 396
 397    `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
 398    die "Cannot get import: $!" if $?;    
 399    `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
 400    die "Cannot rsync import:$!" if $?;
 401    
 402    `rm -fr $tmp/import`;
 403    die "Cannot remove tempdir: $!" if $?;
 404    
 405
 406    return 1;
 407}
 408
 409sub apply_cset {
 410    my $ps = shift;
 411
 412    `mkdir -p $tmp`;
 413
 414    # get the changeset
 415    `tla get-changeset  -A $ps->{repo} $ps->{id} $tmp/changeset`;
 416    die "Cannot get changeset: $!" if $?;
 417    
 418    # apply patches
 419    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
 420        # this can be sped up considerably by doing
 421        #    (find | xargs cat) | patch
 422        # but that cna get mucked up by patches
 423        # with missing trailing newlines or the standard 
 424        # 'missing newline' flag in the patch - possibly
 425        # produced with an old/buggy diff.
 426        # slow and safe, we invoke patch once per patchfile
 427        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
 428        die "Problem applying patches! $!" if $?;
 429    }
 430
 431    # apply changed binary files
 432    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
 433        foreach my $mod (@modified) {
 434            chomp $mod;
 435            my $orig = $mod;
 436            $orig =~ s/\.modified$//; # lazy
 437            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
 438            #print "rsync -p '$mod' '$orig'";
 439            `rsync -p $mod ./$orig`;
 440            die "Problem applying binary changes! $!" if $?;
 441        }
 442    }
 443
 444    # bring in new files
 445    `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
 446
 447    # deleted files are hinted from the commitlog processing
 448
 449    `rm -fr $tmp/changeset`;
 450}
 451
 452
 453# =for reference
 454# A log entry looks like 
 455# Revision: moodle-org--moodle--1.3.3--patch-15
 456# Archive: arch-eduforge@catalyst.net.nz--2004
 457# Creator: Penny Leach <penny@catalyst.net.nz>
 458# Date: Wed May 25 14:15:34 NZST 2005
 459# Standard-date: 2005-05-25 02:15:34 GMT
 460# New-files: lang/de/.arch-ids/block_glossary_random.php.id
 461#     lang/de/.arch-ids/block_html.php.id
 462# New-directories: lang/de/help/questionnaire
 463#     lang/de/help/questionnaire/.arch-ids
 464# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
 465#    db_sears.sql db/db_sears.sql
 466# Removed-files: lang/be/docs/.arch-ids/release.html.id
 467#     lang/be/docs/.arch-ids/releaseold.html.id
 468# Modified-files: admin/cron.php admin/delete.php
 469#     admin/editor.html backup/lib.php backup/restore.php
 470# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
 471# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
 472# Keywords:
 473#
 474# Updating yadda tadda tadda madda
 475sub parselog {
 476    my $log = shift;
 477    #print $log;
 478
 479    my (@add, @del, @mod, @ren, @kw, $sum, $msg );
 480
 481    if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
 482        my $files = $1;
 483        @add = split(m/\s+/s, $files);
 484    }
 485       
 486    if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
 487        my $files = $1;
 488        @del = split(m/\s+/s, $files);
 489    }
 490    
 491    if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
 492        my $files = $1;
 493        @mod = split(m/\s+/s, $files);
 494    }
 495    
 496    if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
 497        my $files = $1;
 498        @ren = split(m/\s+/s, $files);
 499    }
 500
 501    $sum ='';
 502    if ($log =~ m/^Summary:(.+?)$/m ) {
 503        $sum = $1;
 504        $sum =~ s/^\s+//;
 505        $sum =~ s/\s+$//;
 506    }
 507
 508    $msg = '';
 509    if ($log =~ m/\n\n(.+)$/s) {
 510        $msg = $1;
 511        $msg =~ s/^\s+//;
 512        $msg =~ s/\s+$//;
 513    }
 514
 515
 516    # cleanup the arrays
 517    foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
 518        my @tmp = ();
 519        while (my $t = pop @$ref) {
 520            next unless length ($t);
 521            next if $t =~ m!\{arch\}/!;
 522            next if $t =~ m!\.arch-ids/!;
 523            next if $t =~ m!\.arch-inventory$!;
 524            push (@tmp, shell_quote($t));
 525        }
 526        @$ref = @tmp;
 527    }
 528    
 529    #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren]; 
 530    return       ($sum, $msg, \@add, \@del, \@mod, \@ren); 
 531}
 532
 533# write/read a tag
 534sub tag {
 535    my ($tag, $commit) = @_;
 536    $tag =~ s|/|--|g; 
 537    $tag = shell_quote($tag);
 538    
 539    if ($commit) {
 540        open(C,">.git/refs/tags/$tag")
 541            or die "Cannot create tag $tag: $!\n";
 542        print C "$commit\n"
 543            or die "Cannot write tag $tag: $!\n";
 544        close(C)
 545            or die "Cannot write tag $tag: $!\n";
 546        print "Created tag '$tag' on '$commit'\n" if $opt_v;
 547    } else {                    # read
 548        open(C,"<.git/refs/tags/$tag")
 549            or die "Cannot read tag $tag: $!\n";
 550        $commit = <C>;
 551        chomp $commit;
 552        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 553        close(C)
 554            or die "Cannot read tag $tag: $!\n";
 555        return $commit;
 556    }
 557}
 558
 559# write/read a private tag
 560# reads fail softly if the tag isn't there
 561sub ptag {
 562    my ($tag, $commit) = @_;
 563    $tag =~ s|/|--|g; 
 564    $tag = shell_quote($tag);
 565    
 566    unless (-d '.git/archimport/tags') {
 567        mkpath('.git/archimport/tags');
 568    }
 569
 570    if ($commit) {              # write
 571        open(C,">.git/archimport/tags/$tag")
 572            or die "Cannot create tag $tag: $!\n";
 573        print C "$commit\n"
 574            or die "Cannot write tag $tag: $!\n";
 575        close(C)
 576            or die "Cannot write tag $tag: $!\n";
 577    } else {                    # read
 578        # if the tag isn't there, return 0
 579        unless ( -s ".git/archimport/tags/$tag") {
 580            warn "Could not find tag $tag -- perhaps it isn't in the repos we have?\n" 
 581                if $opt_v;
 582            return 0;
 583        }
 584        open(C,"<.git/archimport/tags/$tag")
 585            or die "Cannot read tag $tag: $!\n";
 586        $commit = <C>;
 587        chomp $commit;
 588        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 589        close(C)
 590            or die "Cannot read tag $tag: $!\n";
 591        return $commit;
 592    }
 593}