#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

# Mediawiki filenames can contain forward slashes. This variable decides by which pattern they should be replaced
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote." . $remotename . ".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");
# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################
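# For illustration only: a typical session driven by Git through the
# remote-helper protocol feeds the command loop above lines roughly like
#
#	capabilities
#	list
#	import refs/heads/master
#	(blank line)
#
# Each command is dispatched to the corresponding mw_* handler defined
# below; the exact sequence is chosen by Git, not by this script.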
# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = (
			'url' => $url,
			'username' => $wiki_login,
			'password' => $wiki_passwd
		);
		Git::credential(\%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			Git::credential(\%credential, 'approve');
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR "  (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			Git::credential(\%credential, 'reject');
			exit 1;
		}
	}
}

sub fatal_mw_error {
	my $action = shift;
	print STDERR "fatal: could not $action.\n";
	print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
	if ($url =~ /^https/) {
		print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
		print STDERR "fatal: and the SSL certificate is correct.\n";
	} else {
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
	}
	print STDERR "fatal: (error " .
		$mediawiki->{error}->{code} . ': ' .
		$mediawiki->{error}->{details} . ")\n";
	exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("get the list of wiki pages");
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("query the list of wiki pages");
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}
# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print STDERR "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print STDERR (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Get the list of media-file pages from the API; they live in a
	# different namespace, and only one namespace can be queried at
	# a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit of
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch + 1)..$#titles];
	}
}
sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Check if a media file with the given timestamp exists on
	# MediaWiki. In that case, download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# Mediawiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages are
# tracked or not. This function makes a single request to the wiki, thus
# avoiding a loop over all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print STDERR "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}
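# For illustration only: the note stored on refs/notes/$remotename/mediawiki
# and attached to the tip of refs/mediawiki/$remotename/master holds a single
# line such as
#
#	mediawiki_revision: 42
#
# get_last_local_revision() above parses that line; import_file_revision()
# below is what writes it during import. The revision number 42 is only an
# example.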
# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page; pages end
	# with a single \n. This function right-trims the string and adds a
	# trailing \n to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}
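# For illustration only, derived from the substitutions above: a wiki title
# such as "Foo/Bar" is stored in Git as "Foo%2FBar" (mediawiki_smudge_filename)
# and converted back to "Foo/Bar" when pushing (mediawiki_clean_filename).
# A '|' in a Git filename, which MediaWiki refuses even URL-encoded, is sent
# to the wiki as "_%_7c" (0x7c being the code of '|') and decoded back to '|'
# on import.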
sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, and HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR "  Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR "  Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub fe_escape_path {
	my $path = shift;
	$path =~ s/\\/\\\\/g;
	$path =~ s/"/\\"/g;
	$path =~ s/\n/\\n/g;
	return '"' . $path . '"';
}
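# For illustration only: import_file_revision() below speaks the
# git fast-import stream format on STDOUT. A single imported page revision
# roughly produces (lengths, dates and content are placeholders):
#
#	commit refs/mediawiki/$remotename/master
#	mark :1
#	committer Author <Author@wiki.example.com> 1234567890 +0000
#	data <length of the edit summary>
#	<edit summary>
#	M 644 inline "Main_Page.mw"
#	data <length of the page content>
#	<page content>
#
# followed by a matching commit on refs/notes/$remotename/mediawiki carrying
# the "mediawiki_revision:" note.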
sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline " .
			fe_escape_path($title . ".mw") . "\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline "
				. fe_escape_path($mediafile{title}) . "\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D " . fe_escape_path($title . ".mw") . "\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $line);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id("File")) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Else commit only that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}
sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}
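# For illustration only: mw_push_file() below receives the metadata half of
# one "git diff-tree -r --raw -z" record, which looks roughly like
#
#	:100644 100644 <old blob sha1> <new blob sha1> M
#
# The corresponding filename arrives separately, as the next \0-separated
# field of the diff-tree output (see the loop in mw_push_revision()).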
sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
			print STDERR "Ignoring media file related page: $complete_file_name\n";
			return ($oldrevid, "ok");
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names containing accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print STDERR "Ignoring media file $title\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}
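# For illustration only: a push batch read by get_more_refs("push") in
# mw_push() below consists of lines such as
#
#	push refs/heads/master:refs/heads/master
#
# optionally with a '+' before the source ref for a forced push, and
# terminated by a blank line.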
sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print STDERR "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(' ', $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif ($line !~ m/^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: $line";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				printf STDERR "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split(/\n/, $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map {$_ => 1} @file_extensions;

	return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/[\n]/, run_git("config --get-all remote."
						 . $remotename . ".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. French Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	unless (defined $ns) {
		print STDERR "No such namespace $name on MediaWiki.\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as special value for non-existing namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git("config --add remote." . $remotename
			. ".namespaceCache \"" . $name . ":" . $store_id . "\"");
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	if (my ($namespace) = $_[0] =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}
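# For illustration only: the namespace cache written by get_mw_namespace_id()
# above ends up in the configuration as entries such as
#
#	[remote "origin"]
#		namespaceCache = File:6
#		namespaceCache = SomeName:notANameSpace
#
# ("SomeName" is a made-up example) so that later runs can resolve namespace
# ids without querying the wiki again.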