#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# https://github.com/Bibzball/Git-Mediawiki/wiki
#
# Known limitations:
#
# - Several strategies are provided to fetch modifications from the
#   wiki, but no automatic heuristic is provided; the user has to
#   understand and choose the strategy appropriate for them.
#
# - Git renames could be turned into MediaWiki renames (see TODO
#   below)
#
# - No way to import "one page, and all pages included in it"
#
# - Multiple remote MediaWikis have not been very well tested.

use strict;
use MediaWiki::API;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# Mediawiki filenames can contain forward slashes. This variable decides by which pattern they should be replaced
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote." . $remotename . ".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;
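
# For example, with a hypothetical remote named "origin", the options
# read above can be set like this (page, category and strategy values
# are purely illustrative):
#
#   git config remote.origin.pages "Main_Page Another_Page"
#   git config remote.origin.categories "Some_Category"
#   git config remote.origin.mediaimport true
#   git config remote.origin.mediaexport true
#   git config remote.origin.shallow true
#   git config remote.origin.fetchStrategy by_rev
#   git config remote.origin.dumbPush false
#
# Several pages or categories can be given either space-separated in
# one value or as multiple config entries.
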
# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}
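
# A rough sketch of the exchange this loop implements (the exact
# sequence is driven by Git's remote-helper machinery, so this is
# only illustrative):
#
#   capabilities              -> we answer "refspec ...", "import", "list", "push"
#   list                      -> we answer "? refs/heads/master" and "@refs/heads/master HEAD"
#   import refs/heads/master  -> mw_import() writes a fast-import stream on STDOUT
#   <blank line>              -> we terminate
#
# "push" commands carry a refspec and are batched the same way as
# "import" (see get_more_refs() below).
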
########################## Functions ##############################

## credential API management (generic functions)

sub credential_read {
	my %credential;
	my $reader = shift;
	my $op = shift;
	while (<$reader>) {
		my ($key, $value) = /([^=]*)=(.*)/;
		if (not defined $key) {
			die "ERROR receiving response from git credential $op:\n$_\n";
		}
		$credential{$key} = $value;
	}
	return %credential;
}

sub credential_write {
	my $credential = shift;
	my $writer = shift;
	# url overwrites other fields, so it must come first
	print $writer "url=$credential->{url}\n" if exists $credential->{url};
	while (my ($key, $value) = each(%$credential)) {
		if (length $value && $key ne 'url') {
			print $writer "$key=$value\n";
		}
	}
}

sub credential_run {
	my $op = shift;
	my $credential = shift;
	my $pid = open2(my $reader, my $writer, "git credential $op");
	credential_write($credential, $writer);
	print $writer "\n";
	close($writer);

	if ($op eq "fill") {
		%$credential = credential_read($reader, $op);
	} else {
		if (<$reader>) {
			die "ERROR while running git credential $op:\n$_";
		}
	}
	close($reader);
	waitpid($pid, 0);
	my $child_exit_status = $? >> 8;
	if ($child_exit_status != 0) {
		die "'git credential $op' failed with code $child_exit_status.";
	}
}
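
# The conversation with "git credential" above uses the credential
# helper format: one "key=value" pair per line, terminated by a blank
# line. A hypothetical "fill" round trip could look like (host name
# and values made up for illustration):
#
#   we send:      url=https://wiki.example.com
#                 username=alice
#   git answers:  username=alice
#                 password=secret
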
# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = (url => $url);
		$credential{username} = $wiki_login;
		$credential{password} = $wiki_passwd;
		credential_run("fill", \%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			credential_run("approve", \%credential);
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR " (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			credential_run("reject", \%credential);
			exit 1;
		}
	}
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max'} )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of a page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not query the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print STDERR "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print STDERR (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Attach the list of all media-file pages from the API; they live
	# in a different namespace, and only one namespace can be queried
	# at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit of
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Check whether a media file with the given timestamp exists on
	# MediaWiki. In that case download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# Mediawiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}
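
# The note read above is written by import_file_revision() further
# down. With a hypothetical remote named "origin", it can be inspected
# manually with:
#
#   git notes --ref=origin/mediawiki show refs/mediawiki/origin/master
#
# which should print a single line such as "mediawiki_revision: 42"
# (the revision number is of course illustrative).
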
# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki
# and thus avoids a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print STDERR "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page, and a
	# page must end with a single \n. This function right-trims the
	# string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}
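
# A quick illustration of the mapping implemented by the two functions
# above (the page title is made up): the wiki page "Foo/Bar baz" is
# stored in Git as "Foo%2FBar_baz.mw" (the ".mw" suffix is added at
# import time), while a forbidden title character such as '|' in a Git
# file name is sent to the wiki as "_%_7c" and decoded back to '|' on
# import.
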
sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches, so we consider one branch
	# arbitrarily called master, with HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR " Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR " Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline $title.mw\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline $mediafile{title}\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D $title.mw\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}
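
# For reference, the stream emitted above for one imported revision
# looks roughly like the following fast-import commands (all values
# are illustrative):
#
#   commit refs/mediawiki/origin/master
#   mark :1
#   committer SomeUser <SomeUser@wiki.example.com> 1234567890 +0000
#   data <length of the edit summary>
#   <edit summary>
#   M 644 inline Some_Page.mw
#   data <length of the page text>
#   <page text>
#
# followed by a commit on refs/notes/<remotename>/mediawiki attaching
# the "mediawiki_revision: <id>" note to the mark.
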
# parse a sequence of
#     <cmd> <arg1>
#     <cmd> <arg2>
#     \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $line);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# following the output of the "list" command we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiate classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id("File")) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Otherwise, commit only that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
			print STDERR "Ignoring media file related page: $complete_file_name\n";
			return ($oldrevid, "ok");
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print STDERR "Ignoring media file $title\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}
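
# The "push" lines handed to mw_push() below each carry a refspec,
# typically something like (illustrative):
#
#   push refs/heads/master:refs/heads/master
#
# possibly prefixed with '+' for a forced push; only pushes to
# refs/heads/master are accepted here.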
sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print STDERR "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(' ', $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif (!$line =~ m/^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: $line";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				printf STDERR "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect rename, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map {$_ => 1} @file_extensions;

	return %hashFile;
}

# In memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/[\n]/, run_git("config --get-all remote."
						. $remotename . ".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. French Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	unless (defined $ns) {
		print STDERR "No such namespace $name on MediaWiki.\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as special value for nonexistent namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git("config --add remote." . $remotename
			. ".namespaceCache \"" . $name . ":" . $store_id . "\"");
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	if (my ($namespace) = $_[0] =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}