1#!/usr/bin/perl -w
2#
3# This tool is copyright (c) 2005, Martin Langhoff.
4# It is released under the Gnu Public License, version 2.
5#
6# The basic idea is to walk the output of tla abrowse,
7# fetch the changesets and apply them.
8#
9=head1 Invocation
10
11 git-archimport-script -i <archive>/<branch> [<archive>/<branch>]
12 [ <archive>/<branch> ]
13
14 The script expects you to provide the key roots where it can start the
15 import from an 'initial import' or 'tag' type of Arch commit. It will
16 then follow all the branching and tagging within the provided roots.
17
18 It will die if it sees branches that have different roots.
19
20=head2 TODO
21
22 - keep track of merged patches, and mark a git merge when it happens
23 - smarter rules to parse the archive history "up" and "down"
24 - be able to continue an import where we left off
25 - audit shell-escaping of filenames
26
27=head1 Devel tricks
28
29Add print in front of the shell commands invoked via backticks.
30
31=cut
32
33use strict;
34use warnings;
35use Getopt::Std;
36use File::Spec;
37use File::Temp qw(tempfile);
38use File::Path qw(mkpath);
39use File::Basename qw(basename dirname);
40use String::ShellQuote;
41use Time::Local;
42use IO::Socket;
43use IO::Pipe;
44use POSIX qw(strftime dup2);
45use Data::Dumper qw/ Dumper /;
46use IPC::Open2;
47
48$SIG{'PIPE'}="IGNORE";
49$ENV{'TZ'}="UTC";
50
51our($opt_h,$opt_v, $opt_T,
52 $opt_C,$opt_t, $opt_i);
53
54sub usage() {
55 print STDERR <<END;
56Usage: ${\basename $0} # fetch/update GIT from Arch
57 [ -h ] [ -v ] [ -i ] [ -T ]
58 [ -C GIT_repository ] [ -t tempdir ]
59 repository/arch-branch [ repository/arch-branch] ...
60END
61 exit(1);
62}
63
64getopts("hviC:t:") or usage();
65usage if $opt_h;
66
67@ARGV >= 1 or usage();
68my @arch_roots = @ARGV;
69
70my $tmp = $opt_t;
71$tmp ||= '/tmp';
72$tmp .= '/git-archimport/';
73
74my $git_tree = $opt_C;
75$git_tree ||= ".";
76
77
78my @psets = (); # the collection
79
80foreach my $root (@arch_roots) {
81 my ($arepo, $abranch) = split(m!/!, $root);
82 open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |"
83 or die "Problems with tla abrowse: $!";
84
85 my %ps = (); # the current one
86 my $mode = '';
87 my $lastseen = '';
88
89 while (<ABROWSE>) {
90 chomp;
91
92 # first record padded w 8 spaces
93 if (s/^\s{8}\b//) {
94
95 # store the record we just captured
96 if (%ps) {
97 my %temp = %ps; # break references
98 push (@psets, \%temp);
99 %ps = ();
100 }
101
102 my ($id, $type) = split(m/\s{3}/, $_);
103 $ps{id} = $id;
104 $ps{repo} = $arepo;
105
106 # deal with types
107 if ($type =~ m/^\(simple changeset\)/) {
108 $ps{type} = 's';
109 } elsif ($type eq '(initial import)') {
110 $ps{type} = 'i';
111 } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
112 $ps{type} = 't';
113 $ps{tag} = $1;
114 } else {
115 warn "Unknown type $type";
116 }
117 $lastseen = 'id';
118 }
119
120 if (s/^\s{10}//) {
121 # 10 leading spaces or more
122 # indicate commit metadata
123
124 # date & author
125 if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
126
127 my ($date, $authoremail) = split(m/\s{2,}/, $_);
128 $ps{date} = $date;
129 $ps{date} =~ s/\bGMT$//; # strip off trailign GMT
130 if ($ps{date} =~ m/\b\w+$/) {
131 warn 'Arch dates not in GMT?! - imported dates will be wrong';
132 }
133
134 $authoremail =~ m/^(.+)\s(\S+)$/;
135 $ps{author} = $1;
136 $ps{email} = $2;
137
138 $lastseen = 'date';
139
140 } elsif ($lastseen eq 'date') {
141 # the only hint is position
142 # subject is after date
143 $ps{subj} = $_;
144 $lastseen = 'subj';
145
146 } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
147 $ps{merges} = [];
148 $lastseen = 'merges';
149
150 } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
151 push (@{$ps{merges}}, $_);
152 } else {
153 warn 'more metadata after merges!?';
154 }
155
156 }
157 }
158
159 if (%ps) {
160 my %temp = %ps; # break references
161 push (@psets, \%temp);
162 %ps = ();
163 }
164 close ABROWSE;
165} # end foreach $root
166
167## Order patches by time
168@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
169
170#print Dumper \@psets;
171
172##
173## TODO cleanup irrelevant patches
174## and put an initial import
175## or a full tag
176
177if ($opt_i) { # initial import
178 if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
179 print "Starting import from $psets[0]{id}\n";
180 } else {
181 die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
182 }
183 `git-init-db`;
184 die $! if $?;
185}
186
187# process
188my $lastbranch = branchname($psets[0]{id}); # only good for initial import
189my $importseen = $opt_i ? 0 : 1; # start at 1 if opt_i
190
191foreach my $ps (@psets) {
192
193 $ps->{branch} = branchname($ps->{id});
194
195 #
196 # ensure we have a clean state
197 #
198 if (`git diff-files`) {
199 die "Unclean tree when about to process $ps->{id} " .
200 " - did we fail to commit cleanly before?";
201 }
202 die $! if $?;
203
204 #
205 # create the branch if needed
206 #
207 if ($ps->{type} eq 'i' && $importseen) {
208 die "Should not have more than one 'Initial import' per GIT import";
209 }
210
211 unless ($opt_i && !$importseen) { # skip for first commit
212 if ( -e ".git/refs/heads/$ps->{branch}") {
213 # we know about this branch
214 `git checkout $ps->{branch}`;
215 } else {
216 # new branch! we need to verify a few things
217 die "Branch on a non-tag!" unless $ps->{type} eq 't';
218 my $branchpoint = ptag($ps->{tag});
219 die "Tagging from unknown id unsupported: $ps->{tag}"
220 unless $branchpoint;
221
222 # find where we are supposed to branch from
223 `git checkout -b $ps->{branch} $branchpoint`;
224 }
225 die $! if $?;
226 }
227
228
229 #
230 # Apply the import/changeset/merge into the working tree
231 #
232 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
233 $importseen = 1;
234 apply_import($ps) or die $!;
235 } elsif ($ps->{type} eq 's') {
236 apply_cset($ps);
237 }
238
239 #
240 # prepare update git's index, based on what arch knows
241 # about the pset, resolve parents, etc
242 #
243 my $tree;
244
245 my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`;
246 die "Error in cat-archive-log: $!" if $?;
247
248 # parselog will git-add/rm files
249 # and generally prepare things for the commit
250 # NOTE: parselog will shell-quote filenames!
251 my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
252 my $logmessage = "$sum\n$msg";
253
254
255 # imports don't give us good info
256 # on added files. Shame on them
257 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
258 `find . -type f -print0 | grep -zv '^./.git' | xargs -0 -l100 git-update-cache --add`;
259 `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-cache --remove`;
260 }
261
262 if (@$add) {
263 while (@$add) {
264 my @slice = splice(@$add, 0, 100);
265 my $slice = join(' ', @slice);
266 `git-update-cache --add $slice`;
267 die "Error in git-update-cache --add: $!" if $?;
268 }
269 }
270 if (@$del) {
271 foreach my $file (@$del) {
272 unlink $file or die "Problems deleting $file : $!";
273 }
274 while (@$del) {
275 my @slice = splice(@$del, 0, 100);
276 my $slice = join(' ', @slice);
277 `git-update-cache --remove $slice`;
278 die "Error in git-update-cache --remove: $!" if $?;
279 }
280 }
281 if (@$ren) { # renamed
282 if (@$ren % 2) {
283 die "Odd number of entries in rename!?";
284 }
285 ;
286 while (@$ren) {
287 my $from = pop @$ren;
288 my $to = pop @$ren;
289
290 unless (-d dirname($to)) {
291 mkpath(dirname($to)); # will die on err
292 }
293 #print "moving $from $to";
294 `mv $from $to`;
295 die "Error renaming $from $to : $!" if $?;
296 `git-update-cache --remove $from`;
297 die "Error in git-update-cache --remove: $!" if $?;
298 `git-update-cache --add $to`;
299 die "Error in git-update-cache --add: $!" if $?;
300 }
301
302 }
303 if (@$mod) { # must be _after_ renames
304 while (@$mod) {
305 my @slice = splice(@$mod, 0, 100);
306 my $slice = join(' ', @slice);
307 `git-update-cache $slice`;
308 die "Error in git-update-cache: $!" if $?;
309 }
310 }
311
312 # warn "errors when running git-update-cache! $!";
313 $tree = `git-write-tree`;
314 die "cannot write tree $!" if $?;
315 chomp $tree;
316
317
318 #
319 # Who's your daddy?
320 #
321 my @par;
322 if ( -e ".git/refs/heads/$ps->{branch}") {
323 if (open HEAD, "<.git/refs/heads/$ps->{branch}") {
324 my $p = <HEAD>;
325 close HEAD;
326 chomp $p;
327 push @par, '-p', $p;
328 } else {
329 if ($ps->{type} eq 's') {
330 warn "Could not find the right head for the branch $ps->{branch}";
331 }
332 }
333 }
334
335 my $par = join (' ', @par);
336
337 #
338 # Commit, tag and clean state
339 #
340 $ENV{TZ} = 'GMT';
341 $ENV{GIT_AUTHOR_NAME} = $ps->{author};
342 $ENV{GIT_AUTHOR_EMAIL} = $ps->{email};
343 $ENV{GIT_AUTHOR_DATE} = $ps->{date};
344 $ENV{GIT_COMMITTER_NAME} = $ps->{author};
345 $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
346 $ENV{GIT_COMMITTER_DATE} = $ps->{date};
347
348 my ($pid, $commit_rh, $commit_wh);
349 $commit_rh = 'commit_rh';
350 $commit_wh = 'commit_wh';
351
352 $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par")
353 or die $!;
354 print WRITER $logmessage; # write
355 close WRITER;
356 my $commitid = <READER>; # read
357 chomp $commitid;
358 close READER;
359 waitpid $pid,0; # close;
360
361 if (length $commitid != 40) {
362 die "Something went wrong with the commit! $! $commitid";
363 }
364 #
365 # Update the branch
366 #
367 open HEAD, ">.git/refs/heads/$ps->{branch}";
368 print HEAD $commitid;
369 close HEAD;
370 unlink ('.git/HEAD');
371 symlink("refs/heads/$ps->{branch}",".git/HEAD");
372
373 # tag accordingly
374 ptag($ps->{id}, $commitid); # private tag
375 if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
376 tag($ps->{id}, $commitid);
377 }
378 print " * Committed $ps->{id}\n";
379 print " + tree $tree\n";
380 print " + commit $commitid\n";
381 # print " + commit date is $ps->{date} \n";
382}
383
384sub branchname {
385 my $id = shift;
386 $id =~ s#^.+?/##;
387 my @parts = split(m/--/, $id);
388 return join('--', @parts[0..1]);
389}
390
391sub apply_import {
392 my $ps = shift;
393 my $bname = branchname($ps->{id});
394
395 `mkdir -p $tmp`;
396
397 `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
398 die "Cannot get import: $!" if $?;
399 `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
400 die "Cannot rsync import:$!" if $?;
401
402 `rm -fr $tmp/import`;
403 die "Cannot remove tempdir: $!" if $?;
404
405
406 return 1;
407}
408
409sub apply_cset {
410 my $ps = shift;
411
412 `mkdir -p $tmp`;
413
414 # get the changeset
415 `tla get-changeset -A $ps->{repo} $ps->{id} $tmp/changeset`;
416 die "Cannot get changeset: $!" if $?;
417
418 # apply patches
419 if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
420 # this can be sped up considerably by doing
421 # (find | xargs cat) | patch
422 # but that cna get mucked up by patches
423 # with missing trailing newlines or the standard
424 # 'missing newline' flag in the patch - possibly
425 # produced with an old/buggy diff.
426 # slow and safe, we invoke patch once per patchfile
427 `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
428 die "Problem applying patches! $!" if $?;
429 }
430
431 # apply changed binary files
432 if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
433 foreach my $mod (@modified) {
434 chomp $mod;
435 my $orig = $mod;
436 $orig =~ s/\.modified$//; # lazy
437 $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
438 #print "rsync -p '$mod' '$orig'";
439 `rsync -p $mod ./$orig`;
440 die "Problem applying binary changes! $!" if $?;
441 }
442 }
443
444 # bring in new files
445 `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
446
447 # deleted files are hinted from the commitlog processing
448
449 `rm -fr $tmp/changeset`;
450}
451
452
453# =for reference
454# A log entry looks like
455# Revision: moodle-org--moodle--1.3.3--patch-15
456# Archive: arch-eduforge@catalyst.net.nz--2004
457# Creator: Penny Leach <penny@catalyst.net.nz>
458# Date: Wed May 25 14:15:34 NZST 2005
459# Standard-date: 2005-05-25 02:15:34 GMT
460# New-files: lang/de/.arch-ids/block_glossary_random.php.id
461# lang/de/.arch-ids/block_html.php.id
462# New-directories: lang/de/help/questionnaire
463# lang/de/help/questionnaire/.arch-ids
464# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
465# db_sears.sql db/db_sears.sql
466# Removed-files: lang/be/docs/.arch-ids/release.html.id
467# lang/be/docs/.arch-ids/releaseold.html.id
468# Modified-files: admin/cron.php admin/delete.php
469# admin/editor.html backup/lib.php backup/restore.php
470# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
471# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
472# Keywords:
473#
474# Updating yadda tadda tadda madda
475sub parselog {
476 my $log = shift;
477 #print $log;
478
479 my (@add, @del, @mod, @ren, @kw, $sum, $msg );
480
481 if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
482 my $files = $1;
483 @add = split(m/\s+/s, $files);
484 }
485
486 if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
487 my $files = $1;
488 @del = split(m/\s+/s, $files);
489 }
490
491 if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
492 my $files = $1;
493 @mod = split(m/\s+/s, $files);
494 }
495
496 if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
497 my $files = $1;
498 @ren = split(m/\s+/s, $files);
499 }
500
501 $sum ='';
502 if ($log =~ m/^Summary:(.+?)$/m ) {
503 $sum = $1;
504 $sum =~ s/^\s+//;
505 $sum =~ s/\s+$//;
506 }
507
508 $msg = '';
509 if ($log =~ m/\n\n(.+)$/s) {
510 $msg = $1;
511 $msg =~ s/^\s+//;
512 $msg =~ s/\s+$//;
513 }
514
515
516 # cleanup the arrays
517 foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
518 my @tmp = ();
519 while (my $t = pop @$ref) {
520 next unless length ($t);
521 next if $t =~ m!\{arch\}/!;
522 next if $t =~ m!\.arch-ids/!;
523 next if $t =~ m!\.arch-inventory$!;
524 push (@tmp, shell_quote($t));
525 }
526 @$ref = @tmp;
527 }
528
529 #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren];
530 return ($sum, $msg, \@add, \@del, \@mod, \@ren);
531}
532
533# write/read a tag
534sub tag {
535 my ($tag, $commit) = @_;
536 $tag =~ s|/|--|g;
537 $tag = shell_quote($tag);
538
539 if ($commit) {
540 open(C,">.git/refs/tags/$tag")
541 or die "Cannot create tag $tag: $!\n";
542 print C "$commit\n"
543 or die "Cannot write tag $tag: $!\n";
544 close(C)
545 or die "Cannot write tag $tag: $!\n";
546 print "Created tag '$tag' on '$commit'\n" if $opt_v;
547 } else { # read
548 open(C,"<.git/refs/tags/$tag")
549 or die "Cannot read tag $tag: $!\n";
550 $commit = <C>;
551 chomp $commit;
552 die "Error reading tag $tag: $!\n" unless length $commit == 40;
553 close(C)
554 or die "Cannot read tag $tag: $!\n";
555 return $commit;
556 }
557}
558
559# write/read a private tag
560# reads fail softly if the tag isn't there
561sub ptag {
562 my ($tag, $commit) = @_;
563 $tag =~ s|/|--|g;
564 $tag = shell_quote($tag);
565
566 unless (-d '.git/archimport/tags') {
567 mkpath('.git/archimport/tags');
568 }
569
570 if ($commit) { # write
571 open(C,">.git/archimport/tags/$tag")
572 or die "Cannot create tag $tag: $!\n";
573 print C "$commit\n"
574 or die "Cannot write tag $tag: $!\n";
575 close(C)
576 or die "Cannot write tag $tag: $!\n";
577 } else { # read
578 # if the tag isn't there, return 0
579 unless ( -s ".git/archimport/tags/$tag") {
580 warn "Could not find tag $tag -- perhaps it isn't in the repos we have?\n"
581 if $opt_v;
582 return 0;
583 }
584 open(C,"<.git/archimport/tags/$tag")
585 or die "Cannot read tag $tag: $!\n";
586 $commit = <C>;
587 chomp $commit;
588 die "Error reading tag $tag: $!\n" unless length $commit == 40;
589 close(C)
590 or die "Cannot read tag $tag: $!\n";
591 return $commit;
592 }
593}