git-archimport-scripton commit Merge branch 'master' of . (712d865)
   1#!/usr/bin/perl -w
   2#
   3# This tool is copyright (c) 2005, Martin Langhoff.
   4# It is released under the Gnu Public License, version 2.
   5#
   6# The basic idea is to walk the output of tla abrowse, 
   7# fetch the changesets and apply them. 
   8#
   9=head1 Invocation
  10
  11    git-archimport-script -i <archive>/<branch> [<archive>/<branch>]
  12    [ <archive>/<branch> ]
  13
  14    The script expects you to provide the key roots where it can start the
  15    import from an 'initial import' or 'tag' type of Arch commit. It will
  16    then follow all the branching and tagging within the provided roots.
  17
  18    It will die if it sees branches that have different roots. 
  19
  20=head2 TODO
  21
  22 - keep track of merged patches, and mark a git merge when it happens
  23 - smarter rules to parse the archive history "up" and "down"
  24 - be able to continue an import where we left off
  25 - audit shell-escaping of filenames
  26
  27=head1 Devel tricks
  28
  29Add print in front of the shell commands invoked via backticks. 
  30
  31=cut
  32
  33use strict;
  34use warnings;
  35use Getopt::Std;
  36use File::Spec;
  37use File::Temp qw(tempfile);
  38use File::Path qw(mkpath);
  39use File::Basename qw(basename dirname);
  40use String::ShellQuote;
  41use Time::Local;
  42use IO::Socket;
  43use IO::Pipe;
  44use POSIX qw(strftime dup2);
  45use Data::Dumper qw/ Dumper /;
  46use IPC::Open2;
  47
  48$SIG{'PIPE'}="IGNORE";
  49$ENV{'TZ'}="UTC";
  50
  51our($opt_h,$opt_v, $opt_T,
  52    $opt_C,$opt_t);
  53
  54sub usage() {
  55    print STDERR <<END;
  56Usage: ${\basename $0}     # fetch/update GIT from Arch
  57       [ -h ] [ -v ] [ -T ] 
  58       [ -C GIT_repository ] [ -t tempdir ] 
  59       repository/arch-branch [ repository/arch-branch] ...
  60END
  61    exit(1);
  62}
  63
  64getopts("hviC:t:") or usage();
  65usage if $opt_h;
  66
  67@ARGV >= 1 or usage();
  68my @arch_roots = @ARGV;
  69
  70my $tmp = $opt_t;
  71$tmp ||= '/tmp';
  72$tmp .= '/git-archimport/';
  73
  74my $git_tree = $opt_C;
  75$git_tree ||= ".";
  76
  77
  78my @psets  = ();                # the collection
  79
  80foreach my $root (@arch_roots) {
  81    my ($arepo, $abranch) = split(m!/!, $root);
  82    open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |" 
  83        or die "Problems with tla abrowse: $!";
  84    
  85    my %ps        = ();         # the current one
  86    my $mode      = '';
  87    my $lastseen  = '';
  88    
  89    while (<ABROWSE>) {
  90        chomp;
  91        
  92        # first record padded w 8 spaces
  93        if (s/^\s{8}\b//) {
  94            
  95            # store the record we just captured
  96            if (%ps) {
  97                my %temp = %ps; # break references
  98                push (@psets, \%temp);
  99                %ps = ();
 100            }
 101            
 102            my ($id, $type) = split(m/\s{3}/, $_);
 103            $ps{id}   = $id;
 104            $ps{repo} = $arepo;
 105
 106            # deal with types
 107            if ($type =~ m/^\(simple changeset\)/) {
 108                $ps{type} = 's';
 109            } elsif ($type eq '(initial import)') {
 110                $ps{type} = 'i';
 111            } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
 112                $ps{type} = 't';
 113                $ps{tag}  = $1;
 114            } else { 
 115                warn "Unknown type $type";
 116            }
 117            $lastseen = 'id';
 118        }
 119        
 120        if (s/^\s{10}//) { 
 121            # 10 leading spaces or more 
 122            # indicate commit metadata
 123            
 124            # date & author 
 125            if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
 126                
 127                my ($date, $authoremail) = split(m/\s{2,}/, $_);
 128                $ps{date}   = $date;
 129                $ps{date}   =~ s/\bGMT$//; # strip off trailign GMT
 130                if ($ps{date} =~ m/\b\w+$/) {
 131                    warn 'Arch dates not in GMT?! - imported dates will be wrong';
 132                }
 133            
 134                $authoremail =~ m/^(.+)\s(\S+)$/;
 135                $ps{author} = $1;
 136                $ps{email}  = $2;
 137            
 138                $lastseen = 'date';
 139            
 140            } elsif ($lastseen eq 'date') {
 141                # the only hint is position
 142                # subject is after date
 143                $ps{subj} = $_;
 144                $lastseen = 'subj';
 145            
 146            } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
 147                $ps{merges} = [];
 148                $lastseen = 'merges';
 149            
 150            } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
 151                push (@{$ps{merges}}, $_);
 152            } else {
 153                warn 'more metadata after merges!?';
 154            }
 155            
 156        }
 157    }
 158
 159    if (%ps) {
 160        my %temp = %ps;         # break references
 161        push (@psets, \%temp);
 162        %ps = ();
 163    }    
 164    close ABROWSE;
 165}                               # end foreach $root
 166
 167## Order patches by time
 168@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
 169
 170#print Dumper \@psets;
 171
 172##
 173## TODO cleanup irrelevant patches
 174##      and put an initial import
 175##      or a full tag
 176my $import = 0;
 177unless (-d '.git') { # initial import
 178    if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
 179        print "Starting import from $psets[0]{id}\n";
 180        `git-init-db`;
 181        die $! if $?;
 182        $import = 1;
 183    } else {
 184        die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
 185    }
 186}
 187
 188# process patchsets
 189foreach my $ps (@psets) {
 190
 191    $ps->{branch} =  branchname($ps->{id});
 192
 193    #
 194    # ensure we have a clean state 
 195    # 
 196    if (`git diff-files`) {
 197        die "Unclean tree when about to process $ps->{id} " .
 198            " - did we fail to commit cleanly before?";
 199    }
 200    die $! if $?;
 201
 202    #
 203    # skip commits already in repo
 204    #
 205    if (ptag($ps->{id})) {
 206      $opt_v && print "Skipping already imported: $ps->{id}\n";
 207      next;
 208    }
 209
 210    # 
 211    # create the branch if needed
 212    #
 213    if ($ps->{type} eq 'i' && !$import) {
 214        die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
 215    }
 216
 217    unless ($import) { # skip for import
 218        if ( -e ".git/refs/heads/$ps->{branch}") {
 219            # we know about this branch
 220            `git checkout    $ps->{branch}`;
 221        } else {
 222            # new branch! we need to verify a few things
 223            die "Branch on a non-tag!" unless $ps->{type} eq 't';
 224            my $branchpoint = ptag($ps->{tag});
 225            die "Tagging from unknown id unsupported: $ps->{tag}" 
 226                unless $branchpoint;
 227            
 228            # find where we are supposed to branch from
 229            `git checkout -b $ps->{branch} $branchpoint`;
 230
 231            # If we trust Arch with the fact that this is just 
 232            # a tag, and it does not affect the state of the tree
 233            # then we just tag and move on
 234            tag($ps->{id}, $branchpoint);
 235            ptag($ps->{id}, $branchpoint);
 236            print " * Tagged $ps->{id} at $branchpoint\n";
 237            next;
 238        } 
 239        die $! if $?;
 240    } 
 241
 242    #
 243    # Apply the import/changeset/merge into the working tree
 244    # 
 245    if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
 246        apply_import($ps) or die $!;
 247        $import=0;
 248    } elsif ($ps->{type} eq 's') {
 249        apply_cset($ps);
 250    }
 251
 252    #
 253    # prepare update git's index, based on what arch knows
 254    # about the pset, resolve parents, etc
 255    #
 256    my $tree;
 257    
 258    my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`; 
 259    die "Error in cat-archive-log: $!" if $?;
 260        
 261    # parselog will git-add/rm files
 262    # and generally prepare things for the commit
 263    # NOTE: parselog will shell-quote filenames! 
 264    my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
 265    my $logmessage = "$sum\n$msg";
 266
 267
 268    # imports don't give us good info
 269    # on added files. Shame on them
 270    if ($ps->{type} eq 'i' || $ps->{type} eq 't') { 
 271        `find . -type f -print0 | grep -zv '^./.git' | xargs -0 -l100 git-update-cache --add`;
 272        `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-cache --remove`; 
 273    }
 274
 275    if (@$add) {
 276        while (@$add) {
 277            my @slice = splice(@$add, 0, 100);
 278            my $slice = join(' ', @slice);          
 279            `git-update-cache --add $slice`;
 280            die "Error in git-update-cache --add: $!" if $?;
 281        }
 282    }
 283    if (@$del) {
 284        foreach my $file (@$del) {
 285            unlink $file or die "Problems deleting $file : $!";
 286        }
 287        while (@$del) {
 288            my @slice = splice(@$del, 0, 100);
 289            my $slice = join(' ', @slice);
 290            `git-update-cache --remove $slice`;
 291            die "Error in git-update-cache --remove: $!" if $?;
 292        }
 293    }
 294    if (@$ren) {                # renamed
 295        if (@$ren % 2) {
 296            die "Odd number of entries in rename!?";
 297        }
 298        ;
 299        while (@$ren) {
 300            my $from = pop @$ren;
 301            my $to   = pop @$ren;           
 302
 303            unless (-d dirname($to)) {
 304                mkpath(dirname($to)); # will die on err
 305            }
 306            #print "moving $from $to";
 307            `mv $from $to`;
 308            die "Error renaming $from $to : $!" if $?;
 309            `git-update-cache --remove $from`;
 310            die "Error in git-update-cache --remove: $!" if $?;
 311            `git-update-cache --add $to`;
 312            die "Error in git-update-cache --add: $!" if $?;
 313        }
 314
 315    }
 316    if (@$mod) {                # must be _after_ renames
 317        while (@$mod) {
 318            my @slice = splice(@$mod, 0, 100);
 319            my $slice = join(' ', @slice);
 320            `git-update-cache $slice`;
 321            die "Error in git-update-cache: $!" if $?;
 322        }
 323    }
 324
 325    # warn "errors when running git-update-cache! $!";
 326    $tree = `git-write-tree`;
 327    die "cannot write tree $!" if $?;
 328    chomp $tree;
 329        
 330    
 331    #
 332    # Who's your daddy?
 333    #
 334    my @par;
 335    if ( -e ".git/refs/heads/$ps->{branch}") {
 336        if (open HEAD, "<.git/refs/heads/$ps->{branch}") {
 337            my $p = <HEAD>;
 338            close HEAD;
 339            chomp $p;
 340            push @par, '-p', $p;
 341        } else { 
 342            if ($ps->{type} eq 's') {
 343                warn "Could not find the right head for the branch $ps->{branch}";
 344            }
 345        }
 346    }
 347    
 348    my $par = join (' ', @par);
 349
 350    #    
 351    # Commit, tag and clean state
 352    #
 353    $ENV{TZ}                  = 'GMT';
 354    $ENV{GIT_AUTHOR_NAME}     = $ps->{author};
 355    $ENV{GIT_AUTHOR_EMAIL}    = $ps->{email};
 356    $ENV{GIT_AUTHOR_DATE}     = $ps->{date};
 357    $ENV{GIT_COMMITTER_NAME}  = $ps->{author};
 358    $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
 359    $ENV{GIT_COMMITTER_DATE}  = $ps->{date};
 360
 361    my ($pid, $commit_rh, $commit_wh);
 362    $commit_rh = 'commit_rh';
 363    $commit_wh = 'commit_wh';
 364    
 365    $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par") 
 366        or die $!;
 367    print WRITER $logmessage;   # write
 368    close WRITER;
 369    my $commitid = <READER>;    # read
 370    chomp $commitid;
 371    close READER;
 372    waitpid $pid,0;             # close;
 373
 374    if (length $commitid != 40) {
 375        die "Something went wrong with the commit! $! $commitid";
 376    }
 377    #
 378    # Update the branch
 379    # 
 380    open  HEAD, ">.git/refs/heads/$ps->{branch}";
 381    print HEAD $commitid;
 382    close HEAD;
 383    unlink ('.git/HEAD');
 384    symlink("refs/heads/$ps->{branch}",".git/HEAD");
 385
 386    # tag accordingly
 387    ptag($ps->{id}, $commitid); # private tag
 388    if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
 389        tag($ps->{id}, $commitid);
 390    }
 391    print " * Committed $ps->{id}\n";
 392    print "   + tree   $tree\n";
 393    print "   + commit $commitid\n";
 394    # print "   + commit date is  $ps->{date} \n";
 395}
 396
 397sub branchname {
 398    my $id = shift;
 399    $id =~ s#^.+?/##;
 400    my @parts = split(m/--/, $id);
 401    return join('--', @parts[0..1]);
 402}
 403
 404sub apply_import {
 405    my $ps = shift;
 406    my $bname = branchname($ps->{id});
 407
 408    `mkdir -p $tmp`;
 409
 410    `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
 411    die "Cannot get import: $!" if $?;    
 412    `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
 413    die "Cannot rsync import:$!" if $?;
 414    
 415    `rm -fr $tmp/import`;
 416    die "Cannot remove tempdir: $!" if $?;
 417    
 418
 419    return 1;
 420}
 421
 422sub apply_cset {
 423    my $ps = shift;
 424
 425    `mkdir -p $tmp`;
 426
 427    # get the changeset
 428    `tla get-changeset  -A $ps->{repo} $ps->{id} $tmp/changeset`;
 429    die "Cannot get changeset: $!" if $?;
 430    
 431    # apply patches
 432    if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
 433        # this can be sped up considerably by doing
 434        #    (find | xargs cat) | patch
 435        # but that cna get mucked up by patches
 436        # with missing trailing newlines or the standard 
 437        # 'missing newline' flag in the patch - possibly
 438        # produced with an old/buggy diff.
 439        # slow and safe, we invoke patch once per patchfile
 440        `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
 441        die "Problem applying patches! $!" if $?;
 442    }
 443
 444    # apply changed binary files
 445    if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
 446        foreach my $mod (@modified) {
 447            chomp $mod;
 448            my $orig = $mod;
 449            $orig =~ s/\.modified$//; # lazy
 450            $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
 451            #print "rsync -p '$mod' '$orig'";
 452            `rsync -p $mod ./$orig`;
 453            die "Problem applying binary changes! $!" if $?;
 454        }
 455    }
 456
 457    # bring in new files
 458    `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
 459
 460    # deleted files are hinted from the commitlog processing
 461
 462    `rm -fr $tmp/changeset`;
 463}
 464
 465
 466# =for reference
 467# A log entry looks like 
 468# Revision: moodle-org--moodle--1.3.3--patch-15
 469# Archive: arch-eduforge@catalyst.net.nz--2004
 470# Creator: Penny Leach <penny@catalyst.net.nz>
 471# Date: Wed May 25 14:15:34 NZST 2005
 472# Standard-date: 2005-05-25 02:15:34 GMT
 473# New-files: lang/de/.arch-ids/block_glossary_random.php.id
 474#     lang/de/.arch-ids/block_html.php.id
 475# New-directories: lang/de/help/questionnaire
 476#     lang/de/help/questionnaire/.arch-ids
 477# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
 478#    db_sears.sql db/db_sears.sql
 479# Removed-files: lang/be/docs/.arch-ids/release.html.id
 480#     lang/be/docs/.arch-ids/releaseold.html.id
 481# Modified-files: admin/cron.php admin/delete.php
 482#     admin/editor.html backup/lib.php backup/restore.php
 483# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
 484# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
 485# Keywords:
 486#
 487# Updating yadda tadda tadda madda
 488sub parselog {
 489    my $log = shift;
 490    #print $log;
 491
 492    my (@add, @del, @mod, @ren, @kw, $sum, $msg );
 493
 494    if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
 495        my $files = $1;
 496        @add = split(m/\s+/s, $files);
 497    }
 498       
 499    if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
 500        my $files = $1;
 501        @del = split(m/\s+/s, $files);
 502    }
 503    
 504    if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
 505        my $files = $1;
 506        @mod = split(m/\s+/s, $files);
 507    }
 508    
 509    if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
 510        my $files = $1;
 511        @ren = split(m/\s+/s, $files);
 512    }
 513
 514    $sum ='';
 515    if ($log =~ m/^Summary:(.+?)$/m ) {
 516        $sum = $1;
 517        $sum =~ s/^\s+//;
 518        $sum =~ s/\s+$//;
 519    }
 520
 521    $msg = '';
 522    if ($log =~ m/\n\n(.+)$/s) {
 523        $msg = $1;
 524        $msg =~ s/^\s+//;
 525        $msg =~ s/\s+$//;
 526    }
 527
 528
 529    # cleanup the arrays
 530    foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
 531        my @tmp = ();
 532        while (my $t = pop @$ref) {
 533            next unless length ($t);
 534            next if $t =~ m!\{arch\}/!;
 535            next if $t =~ m!\.arch-ids/!;
 536            next if $t =~ m!\.arch-inventory$!;
 537            push (@tmp, shell_quote($t));
 538        }
 539        @$ref = @tmp;
 540    }
 541    
 542    #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren]; 
 543    return       ($sum, $msg, \@add, \@del, \@mod, \@ren); 
 544}
 545
 546# write/read a tag
 547sub tag {
 548    my ($tag, $commit) = @_;
 549    $tag =~ s|/|--|g; 
 550    $tag = shell_quote($tag);
 551    
 552    if ($commit) {
 553        open(C,">.git/refs/tags/$tag")
 554            or die "Cannot create tag $tag: $!\n";
 555        print C "$commit\n"
 556            or die "Cannot write tag $tag: $!\n";
 557        close(C)
 558            or die "Cannot write tag $tag: $!\n";
 559        print "Created tag '$tag' on '$commit'\n" if $opt_v;
 560    } else {                    # read
 561        open(C,"<.git/refs/tags/$tag")
 562            or die "Cannot read tag $tag: $!\n";
 563        $commit = <C>;
 564        chomp $commit;
 565        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 566        close(C)
 567            or die "Cannot read tag $tag: $!\n";
 568        return $commit;
 569    }
 570}
 571
 572# write/read a private tag
 573# reads fail softly if the tag isn't there
 574sub ptag {
 575    my ($tag, $commit) = @_;
 576    $tag =~ s|/|--|g; 
 577    $tag = shell_quote($tag);
 578    
 579    unless (-d '.git/archimport/tags') {
 580        mkpath('.git/archimport/tags');
 581    }
 582
 583    if ($commit) {              # write
 584        open(C,">.git/archimport/tags/$tag")
 585            or die "Cannot create tag $tag: $!\n";
 586        print C "$commit\n"
 587            or die "Cannot write tag $tag: $!\n";
 588        close(C)
 589            or die "Cannot write tag $tag: $!\n";
 590    } else {                    # read
 591        # if the tag isn't there, return 0
 592        unless ( -s ".git/archimport/tags/$tag") {
 593            return 0;
 594        }
 595        open(C,"<.git/archimport/tags/$tag")
 596            or die "Cannot read tag $tag: $!\n";
 597        $commit = <C>;
 598        chomp $commit;
 599        die "Error reading tag $tag: $!\n" unless length $commit == 40;
 600        close(C)
 601            or die "Cannot read tag $tag: $!\n";
 602        return $commit;
 603    }
 604}