#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use Git::Mediawiki qw(clean_filename smudge_filename connect_maybe
                      EMPTY HTTP_CODE_OK);
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

use URI::Escape;

use constant {
    # It's not always possible to delete pages (may require some
    # privileges). Deleted pages are replaced with this content.
    DELETED_CONTENT => "[[Category:Deleted]]\n",

    # It's not possible to create empty pages. New empty files in Git are
    # sent with this content instead.
    EMPTY_CONTENT => "<!-- empty page -->\n",

    # Used to reflect file creation or deletion in diff.
    NULL_SHA1 => '0000000000000000000000000000000000000000',

    # Used on Git's side to reflect empty edit messages on the wiki.
    EMPTY_MESSAGE => '*Empty MediaWiki Message*',

    # Number of pages taken into account at once in get_mw_page_list.
    SLICE_SIZE => 50,

    # Number of linked mediafiles to get at once in get_linked_mediafiles.
    # The query is split in small batches because of the MW API limit of
    # the number of links to be returned (500 links max).
    BATCH_SIZE => 10,
};

# The helper is invoked by Git as: <helper> <remote name> <url>.
exit_error_usage() if @ARGV != 2;

my ($remotename, $url) = @ARGV;

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(
    /[ \n]/,
    run_git("config --get-all remote.${remotename}.pages")
);
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(
    /[ \n]/,
    run_git("config --get-all remote.${remotename}.categories")
);
chomp(@tracked_categories);

# Import media files on pull (off unless explicitly set to true).
chomp(my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport"));
$import_media = ($import_media eq 'true');

# Export media files on push (on unless explicitly set to false).
chomp(my $export_media = run_git("config --get --bool remote.${remotename}.mediaexport"));
$export_media = ($export_media ne 'false');

chomp(my $wiki_login = run_git("config --get remote.${remotename}.mwLogin"));
# Note: mwPassword is discouraged. Use the credential system instead.
chomp(my $wiki_passwd = run_git("config --get remote.${remotename}.mwPassword"));
chomp(my $wiki_domain = run_git("config --get remote.${remotename}.mwDomain"));

# Import only last revisions (both for clone and fetch)
chomp(my $shallow_import = run_git("config --get --bool remote.${remotename}.shallow"));
$shallow_import = ($shallow_import eq 'true');

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
# The per-remote key wins over the global mediawiki.fetchStrategy key,
# and 'by_page' is used when neither is set.
my $fetch_strategy = run_git("config --get remote.${remotename}.fetchStrategy");
$fetch_strategy ||= run_git('config --get mediawiki.fetchStrategy');
chomp($fetch_strategy);
$fetch_strategy ||= 'by_page';

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.${remotename}.dumbPush");
if (!$dumb_push) {
    $dumb_push = run_git('config --get --bool mediawiki.dumbPush');
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq 'true');

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser: read one remote-helper command per line from Git and
# stop at the first line parse_command() rejects (or at end of input).
while (<STDIN>) {
    chomp;

    if (!parse_command($_)) {
        last;
    }

    # BEGIN runs at compile time, so autoflush is already enabled before
    # the first command is read.
    BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
                     # command is fully processed.
}

########################## Functions ##############################

## error handling

# Abort with a usage message; called when the helper is not started with
# exactly two arguments.
sub exit_error_usage {
    die "ERROR: git-remote-mediawiki module was not called with a correct number of\n" .
        "parameters\n" .
        "You may obtain this error because you attempted to run the git-remote-mediawiki\n" .
        "module directly.\n" .
        "This module can be used the following way:\n" .
        "\tgit clone mediawiki://<address of a mediawiki>\n" .
        "Then, use git commit, push and pull as with every normal git repository.\n";
}

# Dispatch one remote-helper command line.
# Returns 1 when the command was handled and the main loop should go on,
# 0 on an empty line or an unknown command. Dies on malformed arguments.
sub parse_command {
    my ($line) = @_;
    my @cmd = split(/ /, $line);
    if (!defined $cmd[0]) {
        return 0;
    }
    if ($cmd[0] eq 'capabilities') {
        die("Too many arguments for capabilities\n")
            if (defined($cmd[1]));
        mw_capabilities();
    } elsif ($cmd[0] eq 'list') {
        die("Too many arguments for list\n") if (defined($cmd[2]));
        mw_list($cmd[1]);
    } elsif ($cmd[0] eq 'import') {
        # A missing argument is undef (split drops trailing empty
        # fields); test definedness first so that "import" alone is
        # rejected without an "uninitialized value" warning.
        die("Invalid argument for import\n")
            if (!defined($cmd[1]) || $cmd[1] eq EMPTY);
        die("Too many arguments for import\n")
            if (defined($cmd[2]));
        mw_import($cmd[1]);
    } elsif ($cmd[0] eq 'option') {
        die("Invalid arguments for option\n")
            if (!defined($cmd[1]) || $cmd[1] eq EMPTY
                || !defined($cmd[2]) || $cmd[2] eq EMPTY);
        die("Too many arguments for option\n")
            if (defined($cmd[3]));
        mw_option($cmd[1], $cmd[2]);
    } elsif ($cmd[0] eq 'push') {
        mw_push($cmd[1]);
    } else {
        print {*STDERR} "Unknown command. Aborting...\n";
        return 0;
    }
    return 1;
}

# MediaWiki API instance, created lazily.
my $mediawiki;

# Report a failed API call on STDERR, including the error code and
# details stored in $mediawiki, then exit with status 1.
# $action completes the sentence "could not ...".
sub fatal_mw_error {
    my $action = shift;
    print {*STDERR} "fatal: could not $action.\n";
    print {*STDERR} "fatal: '$url' does not appear to be a mediawiki\n";
    if ($url =~ /^https/) {
        print {*STDERR} "fatal: make sure '$url/api.php' is a valid page\n";
        print {*STDERR} "fatal: and the SSL certificate is correct.\n";
    } else {
        print {*STDERR} "fatal: make sure '$url/api.php' is a valid page.\n";
    }
    print {*STDERR} "fatal: (error " .
        $mediawiki->{error}->{code} . ': ' .
        $mediawiki->{error}->{details} . ")\n";
    exit 1;
}

## Functions for listing pages on the remote wiki

# Fill the hash referenced by $pages with the pages named in
# @tracked_pages.
sub get_mw_tracked_pages {
    my $pages = shift;
    get_mw_page_list(\@tracked_pages, $pages);
    return;
}

# Query the wiki for the titles in @{$page_list}, at most SLICE_SIZE
# titles per request, and store the pages found in %{$pages}.
#
# The previous implementation sliced 0..SLICE_SIZE, i.e. SLICE_SIZE + 1
# titles per request, one more than the constant documents and one more
# than the per-request title limit of the MediaWiki API for regular
# users.
sub get_mw_page_list {
    my $page_list = shift;
    my $pages = shift;
    my @remaining = @{$page_list};
    while (my @slice = splice(@remaining, 0, SLICE_SIZE)) {
        get_mw_first_pages(\@slice, $pages);
    }
    return;
}

# Add the members of every category in @tracked_categories to %{$pages}.
# Dies if the API call fails.
sub get_mw_tracked_categories {
    my $pages = shift;
    foreach my $category (@tracked_categories) {
        # Mediawiki requires the Category prefix, but let's not force
        # the user to specify it. Work on a copy so the configured
        # list is left untouched.
        my $cmtitle = $category;
        if (index($cmtitle, ':') < 0) {
            $cmtitle = "Category:${cmtitle}";
        }
        my $mw_pages = $mediawiki->list( {
            action => 'query',
            list => 'categorymembers',
            cmtitle => $cmtitle,
            cmlimit => 'max' } )
            || die $mediawiki->{error}->{code} . ': '
                . $mediawiki->{error}->{details} . "\n";
        foreach my $page (@{$mw_pages}) {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

# Add every page returned by the wiki's "allpages" list to %{$pages}.
# Exits through fatal_mw_error() if the list cannot be retrieved.
sub get_mw_all_pages {
    my $pages = shift;
    # No user-provided list, get the list of pages from the API.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("get the list of wiki pages");
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
    my $some_pages = shift;
    my @some_pages = @{$some_pages};

    my $pages = shift;

    # pattern 'page1|page2|...' required by the API
    my $titles = join('|', @some_pages);

    my $mw_pages = $mediawiki->api({
        action => 'query',
        titles => $titles,
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("query the list of wiki pages");
    }
    while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
        # A negative id is treated as "no such page": warn and skip.
        if ($id < 0) {
            print {*STDERR} "Warning: page $page->{title} not found on wiki\n";
        } else {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

# Get the list of pages to be fetched according to configuration.
# Returns a hash (as a list) indexed by page title.
sub get_mw_pages {
    $mediawiki = connect_maybe($mediawiki, $remotename, $url);

    print {*STDERR} "Listing pages on remote wiki...\n";

    my %pages; # hash on page titles to avoid duplicates
    my $user_defined;
    if (@tracked_pages) {
        $user_defined = 1;
        # The user provided a list of pages titles, but we
        # still need to query the API to get the page IDs.
        get_mw_tracked_pages(\%pages);
    }
    if (@tracked_categories) {
        $user_defined = 1;
        get_mw_tracked_categories(\%pages);
    }
    if (!$user_defined) {
        get_mw_all_pages(\%pages);
    }
    if ($import_media) {
        print {*STDERR} "Getting media files for selected pages...\n";
        if ($user_defined) {
            get_linked_mediafiles(\%pages);
        } else {
            get_all_mediafiles(\%pages);
        }
    }
    print {*STDERR} (scalar keys %pages) . " pages found.\n";
    return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
    my $args = shift;
    my $encoding = (shift || 'encoding(UTF-8)');
    # $args is handed to the shell as a single string; several callers
    # rely on that to append redirections such as "2>/dev/null".
    open(my $git, "-|:${encoding}", "git ${args}")
        or die "Unable to fork: $!\n";
    my $res = do {
        local $/ = undef; # slurp the whole output at once
        <$git>
    };
    # The status of close() is not checked here, so a non-zero exit
    # from git is not treated as an error.
    close($git);

    return $res;
}


# Add every page of the wiki's 'File' namespace to %{$pages}.
# Exits with status 1 if the list cannot be retrieved.
sub get_all_mediafiles {
    my $pages = shift;
    # Attach list of all pages for media files from the API,
    # they are in a different namespace, only one namespace
    # can be queried at the same moment
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        apnamespace => get_mw_namespace_id('File'),
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        print {*STDERR} "fatal: could not get the list of pages for media files.\n";
        print {*STDERR} "fatal: '$url' does not appear to be a mediawiki\n";
        print {*STDERR} "fatal: make sure '$url/api.php' is a valid page.\n";
        exit 1;
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

# For the pages already present in %{$pages}, look up the media files
# they link to or include and add those file pages to %{$pages}.
#
# Titles are queried BATCH_SIZE at a time. The previous implementation
# sliced 0..BATCH_SIZE and therefore sent BATCH_SIZE + 1 titles per
# request, one more than the constant documents.
sub get_linked_mediafiles {
    my $pages = shift;
    # Snapshot the titles first: %{$pages} grows while we iterate.
    my @titles = map { $_->{title} } values(%{$pages});

    while (my @slice = splice(@titles, 0, BATCH_SIZE)) {
        # pattern 'page1|page2|...' required by the API
        my $mw_titles = join('|', @slice);

        # Media files could be included or linked from
        # a page, get all related
        my $query = {
            action => 'query',
            prop => 'links|images',
            titles => $mw_titles,
            plnamespace => get_mw_namespace_id('File'),
            pllimit => 'max'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
            my @media_titles;
            if (defined($page->{links})) {
                my @link_titles
                    = map { $_->{title} } @{$page->{links}};
                push(@media_titles, @link_titles);
            }
            if (defined($page->{images})) {
                my @image_titles
                    = map { $_->{title} } @{$page->{images}};
                push(@media_titles, @image_titles);
            }
            if (@media_titles) {
                get_mw_page_list(\@media_titles, $pages);
            }
        }
    }
    return;
}

# Look up the revision of media file $filename uploaded at $timestamp and
# download it.
# Returns a hash (as a list) with keys title, timestamp and content, or
# an empty list when the wiki reports no imageinfo for that timestamp.
sub get_mw_mediafile_for_page_revision {
    # Name of the file on Wiki, with the prefix.
    my $filename = shift;
    my $timestamp = shift;
    my %mediafile;

    # Search if on a media file with given timestamp exists on
    # MediaWiki. In that case download the file.
    my $query = {
        action => 'query',
        prop => 'imageinfo',
        titles => "File:${filename}",
        iistart => $timestamp,
        iiend => $timestamp,
        iiprop => 'timestamp|archivename|url',
        iilimit => 1
    };
    my $result = $mediawiki->api($query);

    my ($fileid, $file) = each( %{$result->{query}->{pages}} );
    # If not defined it means there is no revision of the file for
    # given timestamp.
    if (defined($file->{imageinfo})) {
        $mediafile{title} = $filename;

        my $fileinfo = pop(@{$file->{imageinfo}});
        $mediafile{timestamp} = $fileinfo->{timestamp};
        # Mediawiki::API's download function doesn't support https URLs
        # and can't download old versions of files.
        print {*STDERR} "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
        $mediafile{content} = download_mw_mediafile($fileinfo->{url});
    }
    return %mediafile;
}

# Fetch $download_url with the user agent of the MediaWiki API object and
# return the response body. Exits with status 1 unless the HTTP status is
# HTTP_CODE_OK.
sub download_mw_mediafile {
    my $download_url = shift;

    my $response = $mediawiki->{ua}->get($download_url);
    if ($response->code == HTTP_CODE_OK) {
        # It is tempting to return
        # $response->decoded_content({charset => "none"}), but
        # when doing so, utf8::downgrade($content) fails with
        # "Wide character in subroutine entry".
        $response->decode();
        return $response->content();
    } else {
        print {*STDERR} "Error downloading mediafile from :\n";
        print {*STDERR} "URL: ${download_url}\n";
        print {*STDERR} 'Server response: ' . $response->code . q{ } . $response->message . "\n";
        exit 1;
    }
}

# Read the MediaWiki revision number recorded in the Git note attached to
# refs/mediawiki/<remote>/master. Returns 0 when there is no such note.
# The progress message printed on STDERR is deliberately left without a
# trailing newline: callers finish the sentence.
sub get_last_local_revision {
    # Get note regarding last mediawiki revision
    my $note = run_git("notes --ref=${remotename}/mediawiki show refs/mediawiki/${remotename}/master 2>/dev/null");
    my @note_info = split(/ /, $note);

    my $lastrevision_number;
    if (!(defined($note_info[0]) && $note_info[0] eq 'mediawiki_revision:')) {
        print {*STDERR} 'No previous mediawiki revision found';
        $lastrevision_number = 0;
    } else {
        # Notes are formatted : mediawiki_revision: #number
        $lastrevision_number = $note_info[1];
        chomp($lastrevision_number);
        print {*STDERR} "Last local mediawiki revision found is ${lastrevision_number}";
    }
    return $lastrevision_number;
}

# Get the last remote revision without taking in account which pages are
# tracked or not. This function makes a single request to the wiki thus
# avoid a loop onto all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
    $mediawiki = connect_maybe($mediawiki, $remotename, $url);

    my $query = {
        action => 'query',
        list => 'recentchanges',
        prop => 'revisions',
        rclimit => '1',
        rcdir => 'older',
    };
    my $result = $mediawiki->api($query);
    return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories. Also records the timestamp of each page's last revision in
# %basetimestamps. Returns the highest revision id seen (0 if none).
sub get_last_remote_revision {
    $mediawiki = connect_maybe($mediawiki, $remotename, $url);

    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my $max_rev_num = 0;

    print {*STDERR} "Getting last revision id on tracked pages...\n";

    foreach my $page (@pages) {
        my $id = $page->{pageid};

        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'ids|timestamp',
            pageids => $id,
        };

        my $result = $mediawiki->api($query);

        my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

        # Skip pages for which no revision came back, instead of
        # recording an undefined revision id and timestamp.
        if (!defined($lastrev) || !defined($lastrev->{revid})) {
            next;
        }

        $basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

        $max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
    }

    print {*STDERR} "Last remote revision found is $max_rev_num.\n";
    return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
    my $string = shift;
    my $page_created = shift;
    # Mediawiki does not allow blank space at the end of a page and ends with a single \n.
    # This function right trims a string and adds a \n at the end to follow this rule
    $string =~ s/\s+$//;
    if ($string eq EMPTY && $page_created) {
        # Creating empty pages is forbidden.
        $string = EMPTY_CONTENT;
    }
    return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
    my $string = shift;
    if ($string eq EMPTY_CONTENT) {
        $string = EMPTY;
    }
    # This \n is important. This is due to mediawiki's way to handle end of files.
    return "${string}\n";
}

# Emit $content on STDOUT as a fast-import "data" command.
sub literal_data {
    my ($content) = @_;
    # bytes::length lives in bytes.pm; load it (without enabling the
    # pragma) so the call cannot fail with an undefined subroutine.
    require bytes;
    print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
    return;
}

# Same as literal_data(), for possibly binary content: STDOUT is switched
# to raw mode for the duration of the write.
sub literal_data_raw {
    # Output possibly binary content.
    my ($content) = @_;
    # Avoid confusion between size in bytes and in characters
    utf8::downgrade($content);
    require bytes;
    binmode STDOUT, ':raw';
    print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
    binmode STDOUT, ':encoding(UTF-8)';
    return;
}

# Answer the "capabilities" command.
sub mw_capabilities {
    # Revisions are imported to the private namespace
    # refs/mediawiki/$remotename/ by the helper and fetched into
    # refs/remotes/$remotename later by fetch.
    print {*STDOUT} "refspec refs/heads/*:refs/mediawiki/${remotename}/*\n";
    print {*STDOUT} "import\n";
    print {*STDOUT} "list\n";
    print {*STDOUT} "push\n";
    if ($dumb_push) {
        print {*STDOUT} "no-private-update\n";
    }
    print {*STDOUT} "\n";
    return;
}

# Answer the "list" command.
sub mw_list {
    # MediaWiki do not have branches, we consider one branch arbitrarily
    # called master, and HEAD pointing to it.
    print {*STDOUT} "? refs/heads/master\n";
    # Symbolic ref line: "@<target> <name>" (the separating space is
    # part of the protocol).
    print {*STDOUT} "\@refs/heads/master HEAD\n";
    print {*STDOUT} "\n";
    return;
}

# Answer the "option" command: no option is supported.
sub mw_option {
    print {*STDERR} "remote-helper command 'option $_[0]' not yet implemented\n";
    print {*STDOUT} "unsupported\n";
    return;
}

# Collect the revisions of one page starting at revision id $fetch_from.
# Returns a list of { pageid, revid } hash refs; with a shallow import,
# only the most recent one.
sub fetch_mw_revisions_for_page {
    my $page = shift;
    my $id = shift;
    my $fetch_from = shift;
    my @page_revs = ();
    my $query = {
        action => 'query',
        prop => 'revisions',
        rvprop => 'ids',
        rvdir => 'newer',
        rvstartid => $fetch_from,
        rvlimit => 500,
        pageids => $id,

        # Let MediaWiki know that we support the latest API.
        continue => '',
    };

    my $revnum = 0;
    # Get 500 revisions at a time due to the mediawiki api limit
    while (1) {
        my $result = $mediawiki->api($query);

        # Parse each of those 500 revisions
        foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
            my $page_rev_ids;
            $page_rev_ids->{pageid} = $page->{pageid};
            $page_rev_ids->{revid} = $revision->{revid};
            push(@page_revs, $page_rev_ids);
            $revnum++;
        }

        if ($result->{'query-continue'}) { # For legacy APIs
            $query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
        } elsif ($result->{continue}) { # For newer APIs
            $query->{rvstartid} = $result->{continue}->{rvcontinue};
            $query->{continue} = $result->{continue}->{continue};
        } else {
            last;
        }
    }
    if ($shallow_import && @page_revs) {
        print {*STDERR} "  Found 1 revision (shallow import).\n";
        @page_revs = sort { $b->{revid} <=> $a->{revid} } (@page_revs);
        return $page_revs[0];
    }
    print {*STDERR} "  Found ${revnum} revision(s).\n";
    return @page_revs;
}

# Collect the revisions of every page in @{$pages} starting at revision
# id $fetch_from. Returns ($n, @revisions), where $n is the page counter
# after the loop (number of pages + 1).
sub fetch_mw_revisions {
    my $pages = shift; my @pages = @{$pages};
    my $fetch_from = shift;

    my @revisions = ();
    my $n = 1;
    foreach my $page (@pages) {
        my $id = $page->{pageid};
        print {*STDERR} "page ${n}/", scalar(@pages), ': ', $page->{title}, "\n";
        $n++;
        my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
        @revisions = (@page_revs, @revisions);
    }

    return ($n, @revisions);
}

# Quote a path for the fast-import stream: escape backslash, double
# quote and newline, and wrap the result in double quotes.
sub fe_escape_path {
    my $path = shift;
    $path =~ s/\\/\\\\/g;
    $path =~ s/"/\\"/g;
    $path =~ s/\n/\\n/g;
    return qq("${path}");
}

# Write one wiki revision to STDOUT as fast-import commands: a commit on
# refs/mediawiki/<remote>/master followed by a notes commit recording the
# MediaWiki revision id.
#   $commit      hash ref: title, comment, content, author, date, mw_revision
#   $full_import true when importing from scratch (clone)
#   $n           mark number of this commit
#   $mediafile   optional hash ref: title, content
sub import_file_revision {
    my $commit = shift;
    my %commit = %{$commit};
    my $full_import = shift;
    my $n = shift;
    my $mediafile = shift;
    my %mediafile;
    if ($mediafile) {
        %mediafile = %{$mediafile};
    }

    my $title = $commit{title};
    my $comment = $commit{comment};
    my $content = $commit{content};
    my $author = $commit{author};
    my $date = $commit{date};

    print {*STDOUT} "commit refs/mediawiki/${remotename}/master\n";
    print {*STDOUT} "mark :${n}\n";
    print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
    literal_data($comment);

    # If it's not a clone, we need to know where to start from
    if (!$full_import && $n == 1) {
        print {*STDOUT} "from refs/mediawiki/${remotename}/master^0\n";
    }
    if ($content ne DELETED_CONTENT) {
        print {*STDOUT} 'M 644 inline ' .
            fe_escape_path("${title}.mw") . "\n";
        literal_data($content);
        if (%mediafile) {
            print {*STDOUT} 'M 644 inline '
                . fe_escape_path($mediafile{title}) . "\n";
            literal_data_raw($mediafile{content});
        }
        print {*STDOUT} "\n\n";
    } else {
        print {*STDOUT} 'D ' . fe_escape_path("${title}.mw") . "\n";
    }

    # mediawiki revision number in the git note
    if ($full_import && $n == 1) {
        print {*STDOUT} "reset refs/notes/${remotename}/mediawiki\n";
    }
    print {*STDOUT} "commit refs/notes/${remotename}/mediawiki\n";
    print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
    literal_data('Note added by git-mediawiki during import');
    if (!$full_import && $n == 1) {
        print {*STDOUT} "from refs/notes/${remotename}/mediawiki^0\n";
    }
    print {*STDOUT} "N inline :${n}\n";
    # get_last_local_revision() splits the note on a space, so the
    # space after the colon is significant.
    literal_data("mediawiki_revision: $commit{mw_revision}");
    print {*STDOUT} "\n\n";
    return;
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
# Returns the list of arguments. Dies on any other line, and on end of
# input before the terminating blank line.
sub get_more_refs {
    my $cmd = shift;
    my @refs;
    while (1) {
        my $line = <STDIN>;
        if (!defined($line)) {
            die("Unexpected end of input in a '$cmd' batch\n");
        }
        if ($line =~ /^$cmd (.*)$/) {
            push(@refs, $1);
        } elsif ($line eq "\n") {
            return @refs;
        } else {
            # Report the offending line itself ($_ is not set by
            # this loop).
            chomp(my $shown = $line);
            die("Invalid command in a '$cmd' batch: $shown\n");
        }
    }
    return;
}

# Handle an "import" command and the batch of import lines following it.
sub mw_import {
    # multiple import commands can follow each other.
    my @refs = (shift, get_more_refs('import'));
    foreach my $ref (@refs) {
        mw_import_ref($ref);
    }
    print {*STDOUT} "done\n";
    return;
}

# Import the wiki revisions newer than the last locally known one, using
# the configured fetch strategy. Exits with status 1 on an unknown
# strategy.
sub mw_import_ref {
    my $ref = shift;
    # The remote helper will call "import HEAD" and
    # "import refs/heads/master".
    # Since HEAD is a symbolic ref to master (by convention,
    # followed by the output of the command "list" that we gave),
    # we don't need to do anything in this case.
    if ($ref eq 'HEAD') {
        return;
    }

    $mediawiki = connect_maybe($mediawiki, $remotename, $url);

    print {*STDERR} "Searching revisions...\n";
    my $last_local = get_last_local_revision();
    my $fetch_from = $last_local + 1;
    if ($fetch_from == 1) {
        print {*STDERR} ", fetching from beginning.\n";
    } else {
        print {*STDERR} ", fetching from here.\n";
    }

    my $n = 0;
    if ($fetch_strategy eq 'by_rev') {
        print {*STDERR} "Fetching & writing export data by revs...\n";
        $n = mw_import_ref_by_revs($fetch_from);
    } elsif ($fetch_strategy eq 'by_page') {
        print {*STDERR} "Fetching & writing export data by pages...\n";
        $n = mw_import_ref_by_pages($fetch_from);
    } else {
        print {*STDERR} qq(fatal: invalid fetch strategy "${fetch_strategy}".\n);
        print {*STDERR} "Check your configuration variables remote.${remotename}.fetchStrategy and mediawiki.fetchStrategy\n";
        exit 1;
    }

    if ($fetch_from == 1 && $n == 0) {
        print {*STDERR} "You appear to have cloned an empty MediaWiki.\n";
        # Something has to be done remote-helper side. If nothing is done, an error is
        # thrown saying that HEAD is referring to unknown object 0000000000000000000
        # and the clone fails.
    }
    return;
}

# "by_page" strategy: list the revisions of every selected page, then
# import them in increasing revision id order.
# Returns the number of revisions imported.
sub mw_import_ref_by_pages {

    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

    @revisions = sort { $a->{revid} <=> $b->{revid} } @revisions;
    my @revision_ids = map { $_->{revid} } @revisions;

    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# "by_rev" strategy: try every revision id from $fetch_from up to the
# last revision of the wiki.
# Returns the number of revisions imported.
sub mw_import_ref_by_revs {

    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();

    my $last_remote = get_last_global_remote_rev();
    my @revision_ids = $fetch_from..$last_remote;
    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
# Returns the number of revisions actually imported.
sub mw_import_revids {
    my $fetch_from = shift;
    my $revision_ids = shift;
    my $pages = shift;

    my $n = 0;
    my $n_actual = 0;
    my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

    foreach my $pagerevid (@{$revision_ids}) {
        # Count page even if we skip it, since we display
        # $n/$total and $total includes skipped pages.
        $n++;

        # fetch the content of the pages
        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'content|timestamp|comment|user|ids',
            revids => $pagerevid,
        };

        my $result = $mediawiki->api($query);

        if (!$result) {
            die "Failed to retrieve modified page for revision $pagerevid\n";
        }

        if (defined($result->{query}->{badrevids}->{$pagerevid})) {
            # The revision id does not exist on the remote wiki.
            next;
        }

        if (!defined($result->{query}->{pages})) {
            die "Invalid revision ${pagerevid}.\n";
        }

        my @result_pages = values(%{$result->{query}->{pages}});
        my $result_page = $result_pages[0];
        my $rev = $result_pages[0]->{revisions}->[0];

        my $page_title = $result_page->{title};

        if (!exists($pages->{$page_title})) {
            print {*STDERR} "${n}/", scalar(@{$revision_ids}),
                ": Skipping revision #$rev->{revid} of ${page_title}\n";
            next;
        }

        $n_actual++;

        my %commit;
        $commit{author} = $rev->{user} || 'Anonymous';
        $commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
        $commit{title} = smudge_filename($page_title);
        $commit{mw_revision} = $rev->{revid};
        $commit{content} = mediawiki_smudge($rev->{'*'});

        if (!defined($rev->{timestamp})) {
            $last_timestamp++;
        } else {
            $last_timestamp = $rev->{timestamp};
        }
        $commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

        # Differentiates classic pages and media files.
        my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
        my %mediafile;
        if ($namespace) {
            my $id = get_mw_namespace_id($namespace);
            if ($id && $id == get_mw_namespace_id('File')) {
                %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
            }
        }
        # If this is a revision of the media page for new version
        # of a file do one common commit for both file and media page.
        # Else do commit only for that page.
        print {*STDERR} "${n}/", scalar(@{$revision_ids}), ": Revision #$rev->{revid} of $commit{title}\n";
        import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
    }

    return $n_actual;
}

# Tell Git that the push of ref $_[0] was rejected as non-fast-forward,
# printing the usual advice first unless advice.pushNonFastForward is
# false. Always returns 0.
sub error_non_fast_forward {
    my $advice = run_git('config --bool advice.pushNonFastForward');
    chomp($advice);
    if ($advice ne 'false') {
        # Native git-push would show this after the summary.
        # We can't ask it to display it cleanly, so print it
        # ourselves before.
        print {*STDERR} "To prevent you from losing history, non-fast-forward updates were rejected\n";
        print {*STDERR} "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
        print {*STDERR} "'Note about fast-forwards' section of 'git push --help' for details.\n";
    }
    print {*STDOUT} qq(error $_[0] "non-fast-forward"\n);
    return 0;
}

# Upload (or delete, when $file_deleted is true) a media file on the wiki.
# Returns the revision id of the file page after the upload, or undef when
# nothing was uploaded (extension not allowed, deletion, empty file).
sub mw_upload_file {
    my $complete_file_name = shift;
    my $new_sha1 = shift;
    my $extension = shift;
    my $file_deleted = shift;
    my $summary = shift;
    my $newrevid;
    my $path = "File:${complete_file_name}";
    my %hashFiles = get_allowed_file_extensions();
    if (!exists($hashFiles{$extension})) {
        print {*STDERR} "${complete_file_name} is not a permitted file on this wiki.\n";
        print {*STDERR} "Check the configuration of file uploads in your mediawiki.\n";
        return $newrevid;
    }
    # Deleting and uploading a file requires a privileged user
    if ($file_deleted) {
        $mediawiki = connect_maybe($mediawiki, $remotename, $url);
        my $query = {
            action => 'delete',
            title => $path,
            reason => $summary
        };
        if (!$mediawiki->edit($query)) {
            print {*STDERR} "Failed to delete file on remote wiki\n";
            print {*STDERR} "Check your permissions on the remote site. Error code:\n";
            print {*STDERR} $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
            exit 1;
        }
    } else {
        # Don't let perl try to interpret file content as UTF-8 => use "raw"
        my $content = run_git("cat-file blob ${new_sha1}", 'raw');
        if ($content ne EMPTY) {
            $mediawiki = connect_maybe($mediawiki, $remotename, $url);
            $mediawiki->{config}->{upload_url} =
                "${url}/index.php/Special:Upload";
            $mediawiki->edit({
                action => 'upload',
                filename => $complete_file_name,
                comment => $summary,
                file => [undef,
                         $complete_file_name,
                         Content => $content],
                ignorewarnings => 1,
            }, {
                skip_encoding => 1
            } ) || die $mediawiki->{error}->{code} . ':'
                . $mediawiki->{error}->{details} . "\n";
            my $last_file_page = $mediawiki->get_page({title => $path});
            $newrevid = $last_file_page->{revid};
            print {*STDERR} "Pushed file: ${new_sha1} - ${complete_file_name}.\n";
        } else {
            print {*STDERR} "Empty file ${complete_file_name} not pushed.\n";
        }
    }
    return $newrevid;
}

# Push one changed file of a commit to the wiki.
# Returns ($revid, $status): $status is 'ok' or 'non-fast-forward', and
# $revid is the new revision id, or $oldrevid when no edit was made.
sub mw_push_file {
    my $diff_info = shift;
    # $diff_info contains a string in this format:
    # 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
    my @diff_info_split = split(/[ \t]/, $diff_info);

    # Filename, including .mw extension
    my $complete_file_name = shift;
    # Commit message
    my $summary = shift;
    # MediaWiki revision number. Keep the previous one by default,
    # in case there's no edit to perform.
    my $oldrevid = shift;
    my $newrevid;

    if ($summary eq EMPTY_MESSAGE) {
        $summary = EMPTY;
    }

    my $new_sha1 = $diff_info_split[3];
    my $old_sha1 = $diff_info_split[2];
    my $page_created = ($old_sha1 eq NULL_SHA1);
    my $page_deleted = ($new_sha1 eq NULL_SHA1);
    $complete_file_name = clean_filename($complete_file_name);

    my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
    if (!defined($extension)) {
        $extension = EMPTY;
    }
    if ($extension eq 'mw') {
        my $ns = get_mw_namespace_id_for_page($complete_file_name);
        if ($ns && $ns == get_mw_namespace_id('File') && (!$export_media)) {
            print {*STDERR} "Ignoring media file related page: ${complete_file_name}\n";
            return ($oldrevid, 'ok');
        }
        my $file_content;
        if ($page_deleted) {
            # Deleting a page usually requires
            # special privileges. A common
            # convention is to replace the page
            # with this content instead:
            $file_content = DELETED_CONTENT;
        } else {
            $file_content = run_git("cat-file blob ${new_sha1}");
        }

        $mediawiki = connect_maybe($mediawiki, $remotename, $url);

        my $result = $mediawiki->edit( {
            action => 'edit',
            summary => $summary,
            title => $title,
            basetimestamp => $basetimestamps{$oldrevid},
            text => mediawiki_clean($file_content, $page_created),
        }, {
            skip_encoding => 1 # Helps with names with accentuated characters
        });
        if (!$result) {
            if ($mediawiki->{error}->{code} == 3) {
                # edit conflicts, considered as non-fast-forward
                print {*STDERR} 'Warning: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} .
                    ".\n";
                return ($oldrevid, 'non-fast-forward');
            } else {
                # Other errors. Shouldn't happen => just die()
                die 'Fatal: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} . "\n";
            }
        }
        $newrevid = $result->{edit}->{newrevid};
        print {*STDERR} "Pushed file: ${new_sha1} - ${title}\n";
    } elsif ($export_media) {
        $newrevid = mw_upload_file($complete_file_name, $new_sha1,
                                   $extension, $page_deleted,
                                   $summary);
    } else {
        print {*STDERR} "Ignoring media file ${title}\n";
    }
    $newrevid = ($newrevid or $oldrevid);
    return ($newrevid, 'ok');
}

# Handle a "push" command and the batch of push lines following it.
# Only pushes to refs/heads/master are accepted; branch deletion is
# refused, and a forced push only triggers a warning.
sub mw_push {
    # multiple push statements can follow each other
    my @refsspecs = (shift, get_more_refs('push'));
    my $pushed;
    for my $refspec (@refsspecs) {
        my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
            or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");
        if ($force) {
            print {*STDERR} "Warning: forced push not allowed on a MediaWiki.\n";
        }
        if ($local eq EMPTY) {
            print {*STDERR} "Cannot delete remote branch on a MediaWiki\n";
            print {*STDOUT} "error ${remote} cannot delete\n";
            next;
        }
        if ($remote ne 'refs/heads/master') {
            print {*STDERR} "Only push to the branch 'master' is supported on a MediaWiki\n";
            print {*STDOUT} "error ${remote} only master allowed\n";
            next;
        }
        if (mw_push_revision($local, $remote)) {
            $pushed = 1;
        }
    }

    # Notify Git that the push is done
    print {*STDOUT} "\n";

    if ($pushed && $dumb_push) {
        print {*STDERR} "Just pushed some revisions to MediaWiki.\n";
        print {*STDERR} "The pushed revisions now have to be re-imported, and your current branch\n";
        print {*STDERR} "needs to be updated with these re-imported commits. You can do this with\n";
        print {*STDERR} "\n";
        print {*STDERR} "  git pull --rebase\n";
        print {*STDERR} "\n";
    }
    return;
}

sub mw_push_revision {
    my $local = shift;
    my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
    my $last_local_revid = get_last_local_revision();
    print {*STDERR} ".\n"; # Finish sentence started by get_last_local_revision()
    my $last_remote_revid = get_last_remote_revision();
    my $mw_revision = $last_remote_revid;

    # Get sha1 of commit pointed by local HEAD
    my $HEAD_sha1 = run_git("rev-parse ${local} 2>/dev/null");
    chomp($HEAD_sha1);
    # Get sha1 of commit pointed by remotes/$remotename/master
    my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/${remotename}/master 2>/dev/null");
    chomp($remoteorigin_sha1);

    if ($last_local_revid > 0 &&
        $last_local_revid < $last_remote_revid) {
        return error_non_fast_forward($remote);
    }

    if ($HEAD_sha1 eq $remoteorigin_sha1) {
        # nothing to push
        return 0;
    }

    # Get every commit in between HEAD and refs/remotes/origin/master,
    # including HEAD and refs/remotes/origin/master
    my @commit_pairs = ();
    if ($last_local_revid > 0) {
        my $parsed_sha1 = $remoteorigin_sha1;
        # Find a path from last MediaWiki commit to pushed commit
        print {*STDERR} "Computing path from local to remote ...\n";
        my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents ${local} ^${parsed_sha1}"));
        my %local_ancestry;
        foreach my $line (@local_ancestry) {
            if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
                foreach my $parent (split(/ /, $parents)) {
                    $local_ancestry{$parent} = $child;
                }
            # NOTE(review): "!" binds tighter than "=~", so this tests
            # (!$line) against the pattern; "$line !~ /.../" looks
            # intended. Left unchanged: the rest of this sub is not
            # visible here.
            } elsif (!$line =~ /^([a-f0-9]+)/) {
                die "Unexpected output from git rev-list: ${line}\n";
            }
        }
        while ($parsed_sha1 ne $HEAD_sha1) {
            my $child = $local_ancestry{$parsed_sha1};
            if (!$child) {
                print {*STDERR} "Cannot find a path in history from remote commit to last commit\n";
                return
error_non_fast_forward($remote);1181}1182push(@commit_pairs, [$parsed_sha1,$child]);1183$parsed_sha1=$child;1184}1185}else{1186# No remote mediawiki revision. Export the whole1187# history (linearized with --first-parent)1188print{*STDERR}"Warning: no common ancestor, pushing complete history\n";1189my$history= run_git("rev-list --first-parent --children ${local}");1190my@history=split(/\n/,$history);1191@history=@history[1..$#history];1192foreachmy$line(reverse@history) {1193my@commit_info_split=split(/[ \n]/,$line);1194push(@commit_pairs, \@commit_info_split);1195}1196}11971198foreachmy$commit_info_split(@commit_pairs) {1199my$sha1_child= @{$commit_info_split}[0];1200my$sha1_commit= @{$commit_info_split}[1];1201my$diff_infos= run_git("diff-tree -r --raw -z ${sha1_child} ${sha1_commit}");1202# TODO: we could detect rename, and encode them with a #redirect on the wiki.1203# TODO: for now, it's just a delete+add1204my@diff_info_list=split(/\0/,$diff_infos);1205# Keep the subject line of the commit message as mediawiki comment for the revision1206my$commit_msg= run_git(qq(log --no-walk --format="%s" ${sha1_commit}));1207chomp($commit_msg);1208# Push every blob1209while(@diff_info_list) {1210my$status;1211# git diff-tree -z gives an output like1212# <metadata>\0<filename1>\01213# <metadata>\0<filename2>\01214# and we've split on \0.1215my$info=shift(@diff_info_list);1216my$file=shift(@diff_info_list);1217($mw_revision,$status) = mw_push_file($info,$file,$commit_msg,$mw_revision);1218if($statuseq'non-fast-forward') {1219# we may already have sent part of the1220# commit to MediaWiki, but it's too1221# late to cancel it. 
Stop the push in1222# the middle, but still give an1223# accurate error message.1224return error_non_fast_forward($remote);1225}1226if($statusne'ok') {1227die("Unknown error from mw_push_file()\n");1228}1229}1230if(!$dumb_push) {1231 run_git(qq(notes --ref=${remotename}/mediawiki add -f -m "mediawiki_revision: ${mw_revision}" ${sha1_commit}));1232}1233}12341235print{*STDOUT}"ok ${remote}\n";1236return1;1237}12381239sub get_allowed_file_extensions {1240$mediawiki= connect_maybe($mediawiki,$remotename,$url);12411242my$query= {1243 action =>'query',1244 meta =>'siteinfo',1245 siprop =>'fileextensions'1246};1247my$result=$mediawiki->api($query);1248my@file_extensions=map{$_->{ext}} @{$result->{query}->{fileextensions}};1249my%hashFile=map{$_=>1}@file_extensions;12501251return%hashFile;1252}12531254# In memory cache for MediaWiki namespace ids.1255my%namespace_id;12561257# Namespaces whose id is cached in the configuration file1258# (to avoid duplicates)1259my%cached_mw_namespace_id;12601261# Return MediaWiki id for a canonical namespace name.1262# Ex.: "File", "Project".1263sub get_mw_namespace_id {1264$mediawiki= connect_maybe($mediawiki,$remotename,$url);1265my$name=shift;12661267if(!exists$namespace_id{$name}) {1268# Look at configuration file, if the record for that namespace is1269# already cached. 
Namespaces are stored in form:1270# "Name_of_namespace:Id_namespace", ex.: "File:6".1271my@temp=split(/\n/,1272 run_git("config --get-all remote.${remotename}.namespaceCache"));1273chomp(@temp);1274foreachmy$ns(@temp) {1275my($n,$id) =split(/:/,$ns);1276if($ideq'notANameSpace') {1277$namespace_id{$n} = {is_namespace =>0};1278}else{1279$namespace_id{$n} = {is_namespace =>1, id =>$id};1280}1281$cached_mw_namespace_id{$n} =1;1282}1283}12841285if(!exists$namespace_id{$name}) {1286print{*STDERR}"Namespace ${name} not found in cache, querying the wiki ...\n";1287# NS not found => get namespace id from MW and store it in1288# configuration file.1289my$query= {1290 action =>'query',1291 meta =>'siteinfo',1292 siprop =>'namespaces'1293};1294my$result=$mediawiki->api($query);12951296while(my($id,$ns) =each(%{$result->{query}->{namespaces}})) {1297if(defined($ns->{id}) &&defined($ns->{canonical})) {1298$namespace_id{$ns->{canonical}} = {is_namespace =>1, id =>$ns->{id}};1299if($ns->{'*'}) {1300# alias (e.g. french Fichier: as alias for canonical File:)1301$namespace_id{$ns->{'*'}} = {is_namespace =>1, id =>$ns->{id}};1302}1303}1304}1305}13061307my$ns=$namespace_id{$name};1308my$id;13091310if(!defined$ns) {1311print{*STDERR}"No such namespace ${name} on MediaWiki.\n";1312$ns= {is_namespace =>0};1313$namespace_id{$name} =$ns;1314}13151316if($ns->{is_namespace}) {1317$id=$ns->{id};1318}13191320# Store "notANameSpace" as special value for inexisting namespaces1321my$store_id= ($id||'notANameSpace');13221323# Store explicitly requested namespaces on disk1324if(!exists$cached_mw_namespace_id{$name}) {1325 run_git(qq(config --add remote.${remotename}.namespaceCache "${name}:${store_id}"));1326$cached_mw_namespace_id{$name} =1;1327}1328return$id;1329}13301331sub get_mw_namespace_id_for_page {1332my$namespace=shift;1333if($namespace=~/^([^:]*):/) {1334return get_mw_namespace_id($namespace);1335}else{1336return;1337}1338}