1#!/usr/bin/perl -w
2#
3# This tool is copyright (c) 2005, Martin Langhoff.
4# It is released under the Gnu Public License, version 2.
5#
6# The basic idea is to walk the output of tla abrowse,
7# fetch the changesets and apply them.
8#
9=head1 Invocation
10
11 git-archimport-script -i <archive>/<branch> [<archive>/<branch>]
12 [ <archive>/<branch> ]
13
14 The script expects you to provide the key roots where it can start the
15 import from an 'initial import' or 'tag' type of Arch commit. It will
16 then follow all the branching and tagging within the provided roots.
17
18 It will die if it sees branches that have different roots.
19
20=head2 TODO
21
22 - keep track of merged patches, and mark a git merge when it happens
23 - smarter rules to parse the archive history "up" and "down"
24 - be able to continue an import where we left off
25 - audit shell-escaping of filenames
26
27=head1 Devel tricks
28
29Add print in front of the shell commands invoked via backticks.
30
31=cut
32
33use strict;
34use warnings;
35use Getopt::Std;
36use File::Spec;
37use File::Temp qw(tempfile);
38use File::Path qw(mkpath);
39use File::Basename qw(basename dirname);
40use String::ShellQuote;
41use Time::Local;
42use IO::Socket;
43use IO::Pipe;
44use POSIX qw(strftime dup2);
45use Data::Dumper qw/ Dumper /;
46use IPC::Open2;
47
48$SIG{'PIPE'}="IGNORE";
49$ENV{'TZ'}="UTC";
50
51our($opt_h,$opt_v, $opt_T,
52 $opt_C,$opt_t);
53
54sub usage() {
55 print STDERR <<END;
56Usage: ${\basename $0} # fetch/update GIT from Arch
57 [ -h ] [ -v ] [ -T ]
58 [ -C GIT_repository ] [ -t tempdir ]
59 repository/arch-branch [ repository/arch-branch] ...
60END
61 exit(1);
62}
63
64getopts("hviC:t:") or usage();
65usage if $opt_h;
66
67@ARGV >= 1 or usage();
68my @arch_roots = @ARGV;
69
70my $tmp = $opt_t;
71$tmp ||= '/tmp';
72$tmp .= '/git-archimport/';
73
74my $git_tree = $opt_C;
75$git_tree ||= ".";
76
77
78my @psets = (); # the collection
79
80foreach my $root (@arch_roots) {
81 my ($arepo, $abranch) = split(m!/!, $root);
82 open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |"
83 or die "Problems with tla abrowse: $!";
84
85 my %ps = (); # the current one
86 my $mode = '';
87 my $lastseen = '';
88
89 while (<ABROWSE>) {
90 chomp;
91
92 # first record padded w 8 spaces
93 if (s/^\s{8}\b//) {
94
95 # store the record we just captured
96 if (%ps) {
97 my %temp = %ps; # break references
98 push (@psets, \%temp);
99 %ps = ();
100 }
101
102 my ($id, $type) = split(m/\s{3}/, $_);
103 $ps{id} = $id;
104 $ps{repo} = $arepo;
105
106 # deal with types
107 if ($type =~ m/^\(simple changeset\)/) {
108 $ps{type} = 's';
109 } elsif ($type eq '(initial import)') {
110 $ps{type} = 'i';
111 } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
112 $ps{type} = 't';
113 $ps{tag} = $1;
114 } else {
115 warn "Unknown type $type";
116 }
117 $lastseen = 'id';
118 }
119
120 if (s/^\s{10}//) {
121 # 10 leading spaces or more
122 # indicate commit metadata
123
124 # date & author
125 if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
126
127 my ($date, $authoremail) = split(m/\s{2,}/, $_);
128 $ps{date} = $date;
129 $ps{date} =~ s/\bGMT$//; # strip off trailign GMT
130 if ($ps{date} =~ m/\b\w+$/) {
131 warn 'Arch dates not in GMT?! - imported dates will be wrong';
132 }
133
134 $authoremail =~ m/^(.+)\s(\S+)$/;
135 $ps{author} = $1;
136 $ps{email} = $2;
137
138 $lastseen = 'date';
139
140 } elsif ($lastseen eq 'date') {
141 # the only hint is position
142 # subject is after date
143 $ps{subj} = $_;
144 $lastseen = 'subj';
145
146 } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
147 $ps{merges} = [];
148 $lastseen = 'merges';
149
150 } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
151 push (@{$ps{merges}}, $_);
152 } else {
153 warn 'more metadata after merges!?';
154 }
155
156 }
157 }
158
159 if (%ps) {
160 my %temp = %ps; # break references
161 push (@psets, \%temp);
162 %ps = ();
163 }
164 close ABROWSE;
165} # end foreach $root
166
167## Order patches by time
168@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
169
170#print Dumper \@psets;
171
172##
173## TODO cleanup irrelevant patches
174## and put an initial import
175## or a full tag
176my $import = 0;
177unless (-d '.git') { # initial import
178 if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
179 print "Starting import from $psets[0]{id}\n";
180 `git-init-db`;
181 die $! if $?;
182 $import = 1;
183 } else {
184 die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
185 }
186}
187
188# process patchsets
189foreach my $ps (@psets) {
190
191 $ps->{branch} = branchname($ps->{id});
192
193 #
194 # ensure we have a clean state
195 #
196 if (`git diff-files`) {
197 die "Unclean tree when about to process $ps->{id} " .
198 " - did we fail to commit cleanly before?";
199 }
200 die $! if $?;
201
202 #
203 # skip commits already in repo
204 #
205 if (ptag($ps->{id})) {
206 $opt_v && print "Skipping already imported: $ps->{id}\n";
207 next;
208 }
209
210 #
211 # create the branch if needed
212 #
213 if ($ps->{type} eq 'i' && !$import) {
214 die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
215 }
216
217 unless ($import) { # skip for import
218 if ( -e ".git/refs/heads/$ps->{branch}") {
219 # we know about this branch
220 `git checkout $ps->{branch}`;
221 } else {
222 # new branch! we need to verify a few things
223 die "Branch on a non-tag!" unless $ps->{type} eq 't';
224 my $branchpoint = ptag($ps->{tag});
225 die "Tagging from unknown id unsupported: $ps->{tag}"
226 unless $branchpoint;
227
228 # find where we are supposed to branch from
229 `git checkout -b $ps->{branch} $branchpoint`;
230
231 # If we trust Arch with the fact that this is just
232 # a tag, and it does not affect the state of the tree
233 # then we just tag and move on
234 tag($ps->{id}, $branchpoint);
235 ptag($ps->{id}, $branchpoint);
236 print " * Tagged $ps->{id} at $branchpoint\n";
237 next;
238 }
239 die $! if $?;
240 }
241
242 #
243 # Apply the import/changeset/merge into the working tree
244 #
245 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
246 apply_import($ps) or die $!;
247 $import=0;
248 } elsif ($ps->{type} eq 's') {
249 apply_cset($ps);
250 }
251
252 #
253 # prepare update git's index, based on what arch knows
254 # about the pset, resolve parents, etc
255 #
256 my $tree;
257
258 my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`;
259 die "Error in cat-archive-log: $!" if $?;
260
261 # parselog will git-add/rm files
262 # and generally prepare things for the commit
263 # NOTE: parselog will shell-quote filenames!
264 my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
265 my $logmessage = "$sum\n$msg";
266
267
268 # imports don't give us good info
269 # on added files. Shame on them
270 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
271 `find . -type f -print0 | grep -zv '^./.git' | xargs -0 -l100 git-update-cache --add`;
272 `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-cache --remove`;
273 }
274
275 if (@$add) {
276 while (@$add) {
277 my @slice = splice(@$add, 0, 100);
278 my $slice = join(' ', @slice);
279 `git-update-cache --add $slice`;
280 die "Error in git-update-cache --add: $!" if $?;
281 }
282 }
283 if (@$del) {
284 foreach my $file (@$del) {
285 unlink $file or die "Problems deleting $file : $!";
286 }
287 while (@$del) {
288 my @slice = splice(@$del, 0, 100);
289 my $slice = join(' ', @slice);
290 `git-update-cache --remove $slice`;
291 die "Error in git-update-cache --remove: $!" if $?;
292 }
293 }
294 if (@$ren) { # renamed
295 if (@$ren % 2) {
296 die "Odd number of entries in rename!?";
297 }
298 ;
299 while (@$ren) {
300 my $from = pop @$ren;
301 my $to = pop @$ren;
302
303 unless (-d dirname($to)) {
304 mkpath(dirname($to)); # will die on err
305 }
306 #print "moving $from $to";
307 `mv $from $to`;
308 die "Error renaming $from $to : $!" if $?;
309 `git-update-cache --remove $from`;
310 die "Error in git-update-cache --remove: $!" if $?;
311 `git-update-cache --add $to`;
312 die "Error in git-update-cache --add: $!" if $?;
313 }
314
315 }
316 if (@$mod) { # must be _after_ renames
317 while (@$mod) {
318 my @slice = splice(@$mod, 0, 100);
319 my $slice = join(' ', @slice);
320 `git-update-cache $slice`;
321 die "Error in git-update-cache: $!" if $?;
322 }
323 }
324
325 # warn "errors when running git-update-cache! $!";
326 $tree = `git-write-tree`;
327 die "cannot write tree $!" if $?;
328 chomp $tree;
329
330
331 #
332 # Who's your daddy?
333 #
334 my @par;
335 if ( -e ".git/refs/heads/$ps->{branch}") {
336 if (open HEAD, "<.git/refs/heads/$ps->{branch}") {
337 my $p = <HEAD>;
338 close HEAD;
339 chomp $p;
340 push @par, '-p', $p;
341 } else {
342 if ($ps->{type} eq 's') {
343 warn "Could not find the right head for the branch $ps->{branch}";
344 }
345 }
346 }
347
348 my $par = join (' ', @par);
349
350 #
351 # Commit, tag and clean state
352 #
353 $ENV{TZ} = 'GMT';
354 $ENV{GIT_AUTHOR_NAME} = $ps->{author};
355 $ENV{GIT_AUTHOR_EMAIL} = $ps->{email};
356 $ENV{GIT_AUTHOR_DATE} = $ps->{date};
357 $ENV{GIT_COMMITTER_NAME} = $ps->{author};
358 $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
359 $ENV{GIT_COMMITTER_DATE} = $ps->{date};
360
361 my ($pid, $commit_rh, $commit_wh);
362 $commit_rh = 'commit_rh';
363 $commit_wh = 'commit_wh';
364
365 $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par")
366 or die $!;
367 print WRITER $logmessage; # write
368 close WRITER;
369 my $commitid = <READER>; # read
370 chomp $commitid;
371 close READER;
372 waitpid $pid,0; # close;
373
374 if (length $commitid != 40) {
375 die "Something went wrong with the commit! $! $commitid";
376 }
377 #
378 # Update the branch
379 #
380 open HEAD, ">.git/refs/heads/$ps->{branch}";
381 print HEAD $commitid;
382 close HEAD;
383 unlink ('.git/HEAD');
384 symlink("refs/heads/$ps->{branch}",".git/HEAD");
385
386 # tag accordingly
387 ptag($ps->{id}, $commitid); # private tag
388 if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
389 tag($ps->{id}, $commitid);
390 }
391 print " * Committed $ps->{id}\n";
392 print " + tree $tree\n";
393 print " + commit $commitid\n";
394 # print " + commit date is $ps->{date} \n";
395}
396
397sub branchname {
398 my $id = shift;
399 $id =~ s#^.+?/##;
400 my @parts = split(m/--/, $id);
401 return join('--', @parts[0..1]);
402}
403
404sub apply_import {
405 my $ps = shift;
406 my $bname = branchname($ps->{id});
407
408 `mkdir -p $tmp`;
409
410 `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
411 die "Cannot get import: $!" if $?;
412 `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
413 die "Cannot rsync import:$!" if $?;
414
415 `rm -fr $tmp/import`;
416 die "Cannot remove tempdir: $!" if $?;
417
418
419 return 1;
420}
421
422sub apply_cset {
423 my $ps = shift;
424
425 `mkdir -p $tmp`;
426
427 # get the changeset
428 `tla get-changeset -A $ps->{repo} $ps->{id} $tmp/changeset`;
429 die "Cannot get changeset: $!" if $?;
430
431 # apply patches
432 if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
433 # this can be sped up considerably by doing
434 # (find | xargs cat) | patch
435 # but that cna get mucked up by patches
436 # with missing trailing newlines or the standard
437 # 'missing newline' flag in the patch - possibly
438 # produced with an old/buggy diff.
439 # slow and safe, we invoke patch once per patchfile
440 `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
441 die "Problem applying patches! $!" if $?;
442 }
443
444 # apply changed binary files
445 if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
446 foreach my $mod (@modified) {
447 chomp $mod;
448 my $orig = $mod;
449 $orig =~ s/\.modified$//; # lazy
450 $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
451 #print "rsync -p '$mod' '$orig'";
452 `rsync -p $mod ./$orig`;
453 die "Problem applying binary changes! $!" if $?;
454 }
455 }
456
457 # bring in new files
458 `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
459
460 # deleted files are hinted from the commitlog processing
461
462 `rm -fr $tmp/changeset`;
463}
464
465
466# =for reference
467# A log entry looks like
468# Revision: moodle-org--moodle--1.3.3--patch-15
469# Archive: arch-eduforge@catalyst.net.nz--2004
470# Creator: Penny Leach <penny@catalyst.net.nz>
471# Date: Wed May 25 14:15:34 NZST 2005
472# Standard-date: 2005-05-25 02:15:34 GMT
473# New-files: lang/de/.arch-ids/block_glossary_random.php.id
474# lang/de/.arch-ids/block_html.php.id
475# New-directories: lang/de/help/questionnaire
476# lang/de/help/questionnaire/.arch-ids
477# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
478# db_sears.sql db/db_sears.sql
479# Removed-files: lang/be/docs/.arch-ids/release.html.id
480# lang/be/docs/.arch-ids/releaseold.html.id
481# Modified-files: admin/cron.php admin/delete.php
482# admin/editor.html backup/lib.php backup/restore.php
483# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
484# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
485# Keywords:
486#
487# Updating yadda tadda tadda madda
488sub parselog {
489 my $log = shift;
490 #print $log;
491
492 my (@add, @del, @mod, @ren, @kw, $sum, $msg );
493
494 if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
495 my $files = $1;
496 @add = split(m/\s+/s, $files);
497 }
498
499 if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
500 my $files = $1;
501 @del = split(m/\s+/s, $files);
502 }
503
504 if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
505 my $files = $1;
506 @mod = split(m/\s+/s, $files);
507 }
508
509 if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
510 my $files = $1;
511 @ren = split(m/\s+/s, $files);
512 }
513
514 $sum ='';
515 if ($log =~ m/^Summary:(.+?)$/m ) {
516 $sum = $1;
517 $sum =~ s/^\s+//;
518 $sum =~ s/\s+$//;
519 }
520
521 $msg = '';
522 if ($log =~ m/\n\n(.+)$/s) {
523 $msg = $1;
524 $msg =~ s/^\s+//;
525 $msg =~ s/\s+$//;
526 }
527
528
529 # cleanup the arrays
530 foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
531 my @tmp = ();
532 while (my $t = pop @$ref) {
533 next unless length ($t);
534 next if $t =~ m!\{arch\}/!;
535 next if $t =~ m!\.arch-ids/!;
536 next if $t =~ m!\.arch-inventory$!;
537 push (@tmp, shell_quote($t));
538 }
539 @$ref = @tmp;
540 }
541
542 #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren];
543 return ($sum, $msg, \@add, \@del, \@mod, \@ren);
544}
545
546# write/read a tag
547sub tag {
548 my ($tag, $commit) = @_;
549 $tag =~ s|/|--|g;
550 $tag = shell_quote($tag);
551
552 if ($commit) {
553 open(C,">.git/refs/tags/$tag")
554 or die "Cannot create tag $tag: $!\n";
555 print C "$commit\n"
556 or die "Cannot write tag $tag: $!\n";
557 close(C)
558 or die "Cannot write tag $tag: $!\n";
559 print "Created tag '$tag' on '$commit'\n" if $opt_v;
560 } else { # read
561 open(C,"<.git/refs/tags/$tag")
562 or die "Cannot read tag $tag: $!\n";
563 $commit = <C>;
564 chomp $commit;
565 die "Error reading tag $tag: $!\n" unless length $commit == 40;
566 close(C)
567 or die "Cannot read tag $tag: $!\n";
568 return $commit;
569 }
570}
571
572# write/read a private tag
573# reads fail softly if the tag isn't there
574sub ptag {
575 my ($tag, $commit) = @_;
576 $tag =~ s|/|--|g;
577 $tag = shell_quote($tag);
578
579 unless (-d '.git/archimport/tags') {
580 mkpath('.git/archimport/tags');
581 }
582
583 if ($commit) { # write
584 open(C,">.git/archimport/tags/$tag")
585 or die "Cannot create tag $tag: $!\n";
586 print C "$commit\n"
587 or die "Cannot write tag $tag: $!\n";
588 close(C)
589 or die "Cannot write tag $tag: $!\n";
590 } else { # read
591 # if the tag isn't there, return 0
592 unless ( -s ".git/archimport/tags/$tag") {
593 return 0;
594 }
595 open(C,"<.git/archimport/tags/$tag")
596 or die "Cannot read tag $tag: $!\n";
597 $commit = <C>;
598 chomp $commit;
599 die "Error reading tag $tag: $!\n" unless length $commit == 40;
600 close(C)
601 or die "Cannot read tag $tag: $!\n";
602 return $commit;
603 }
604}