#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":encoding(UTF-8)";
binmode STDOUT, ":encoding(UTF-8)";

use URI::Escape;
use IPC::Open2;

# Mediawiki filenames can contain forward slashes. This variable decides by which pattern they should be replaced
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote." . $remotename . ".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
    $fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
    $fetch_strategy = "by_page";
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
    $dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser: read remote-helper commands from STDIN, one per line,
# and dispatch them until a blank line or an unknown command is seen.
my $entry;
my @cmd;
while (<STDIN>) {
    chomp;
    @cmd = split(/ /);
    if (defined($cmd[0])) {
        # Line not blank
        if ($cmd[0] eq "capabilities") {
            die("Too many arguments for capabilities") if (defined($cmd[1]));
            mw_capabilities();
        } elsif ($cmd[0] eq "list") {
            die("Too many arguments for list") if (defined($cmd[2]));
            mw_list($cmd[1]);
        } elsif ($cmd[0] eq "import") {
            # Test definedness first so that a missing argument is
            # rejected without an "uninitialized value" warning.
            die("Invalid arguments for import")
                if (!defined($cmd[1]) || $cmd[1] eq "" || defined($cmd[2]));
            mw_import($cmd[1]);
        } elsif ($cmd[0] eq "option") {
            die("Too many arguments for option")
                if (!defined($cmd[1]) || $cmd[1] eq ""
                    || !defined($cmd[2]) || $cmd[2] eq ""
                    || defined($cmd[3]));
            mw_option($cmd[1], $cmd[2]);
        } elsif ($cmd[0] eq "push") {
            mw_push($cmd[1]);
        } else {
            print STDERR "Unknown command. Aborting...\n";
            last;
        }
    } else {
        # blank line: we should terminate
        last;
    }

    BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
                     # command is fully processed.
}

########################## Functions ##############################

# MediaWiki API instance, created lazily.
my $mediawiki;

# Create the MediaWiki::API object on first use and, when a login is
# configured, authenticate through Git's credential system. Exits with
# status 1 if the login is rejected.
sub mw_connect_maybe {
    if ($mediawiki) {
        return;
    }
    $mediawiki = MediaWiki::API->new;
    $mediawiki->{config}->{api_url} = "$url/api.php";
    if ($wiki_login) {
        my %credential = (
            'url' => $url,
            'username' => $wiki_login,
            'password' => $wiki_passwd
        );
        Git::credential(\%credential);
        my $request = {lgname => $credential{username},
                       lgpassword => $credential{password},
                       lgdomain => $wiki_domain};
        if ($mediawiki->login($request)) {
            Git::credential(\%credential, 'approve');
            print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
        } else {
            print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
            print STDERR " (error " .
                $mediawiki->{error}->{code} . ': ' .
                $mediawiki->{error}->{details} . ")\n";
            Git::credential(\%credential, 'reject');
            exit 1;
        }
    }
    return;
}

# Report a failed API request ($action describes what was attempted)
# together with the error recorded by MediaWiki::API, then exit 1.
sub fatal_mw_error {
    my $action = shift;
    print STDERR "fatal: could not $action.\n";
    print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
    if ($url =~ /^https/) {
        print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
        print STDERR "fatal: and the SSL certificate is correct.\n";
    } else {
        print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
    }
    print STDERR "fatal: (error " .
        $mediawiki->{error}->{code} . ': ' .
        $mediawiki->{error}->{details} . ")\n";
    exit 1;
}

## Functions for listing pages on the remote wiki

# Add the pages listed in @tracked_pages to the hash referenced by $pages.
sub get_mw_tracked_pages {
    my $pages = shift;
    get_mw_page_list(\@tracked_pages, $pages);
    return;
}

# Query the wiki for every title in @$page_list, in batches, and store the
# pages found in the hash referenced by $pages (keyed by title).
sub get_mw_page_list {
    my $page_list = shift;
    my $pages = shift;
    my @some_pages = @{$page_list};
    # The MediaWiki API accepts at most 50 titles per query for ordinary
    # users, so send exactly 50 at a time. splice() removes precisely the
    # titles it returns, which keeps the batch size and the amount
    # consumed in sync.
    while (my @slice = splice(@some_pages, 0, 50)) {
        get_mw_first_pages(\@slice, $pages);
    }
    return;
}

# Add the members of every category in @tracked_categories to the hash
# referenced by $pages. Dies if the API request fails.
sub get_mw_tracked_categories {
    my $pages = shift;
    foreach my $category (@tracked_categories) {
        if (index($category, ':') < 0) {
            # Mediawiki requires the Category
            # prefix, but let's not force the user
            # to specify it.
            $category = "Category:" . $category;
        }
        my $mw_pages = $mediawiki->list( {
            action => 'query',
            list => 'categorymembers',
            cmtitle => $category,
            cmlimit => 'max' } )
            || die $mediawiki->{error}->{code} . ': '
                . $mediawiki->{error}->{details};
        foreach my $page (@{$mw_pages}) {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

# Add every page returned by the API's 'allpages' list to the hash
# referenced by $pages.
sub get_mw_all_pages {
    my $pages = shift;
    # No user-provided list, get the list of pages from the API.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("get the list of wiki pages");
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
    my $some_pages = shift;
    my @some_pages = @{$some_pages};

    my $pages = shift;

    # pattern 'page1|page2|...' required by the API
    my $titles = join('|', @some_pages);

    my $mw_pages = $mediawiki->api({
        action => 'query',
        titles => $titles,
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("query the list of wiki pages");
    }
    while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
        # A negative id is how a title that was not found is reported.
        if ($id < 0) {
            print STDERR "Warning: page $page->{title} not found on wiki\n";
        } else {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
    mw_connect_maybe();

    print STDERR "Listing pages on remote wiki...\n";

    my %pages; # hash on page titles to avoid duplicates
    my $user_defined;
    if (@tracked_pages) {
        $user_defined = 1;
        # The user provided a list of pages titles, but we
        # still need to query the API to get the page IDs.
        get_mw_tracked_pages(\%pages);
    }
    if (@tracked_categories) {
        $user_defined = 1;
        get_mw_tracked_categories(\%pages);
    }
    if (!$user_defined) {
        get_mw_all_pages(\%pages);
    }
    if ($import_media) {
        print STDERR "Getting media files for selected pages...\n";
        if ($user_defined) {
            get_linked_mediafiles(\%pages);
        } else {
            get_all_mediafiles(\%pages);
        }
    }
    print STDERR (scalar keys %pages) . " pages found.\n";
    return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
    # Run "git $args" through a pipe and return everything it printed.
    # The optional second argument is the PerlIO layer used to decode the
    # output ("encoding(UTF-8)" unless given, e.g. "raw").
    my $args = shift;
    my $encoding = (shift || "encoding(UTF-8)");
    open(my $git, "-|:$encoding", "git " . $args)
        or die "Unable to run 'git $args': $!\n";
    my $res = do { local $/; <$git> };
    # Deliberately unchecked: close() reports a non-zero exit status of
    # the command, and callers query configuration keys that may be unset.
    close($git);

    return $res;
}


# Add every page of the "File" namespace to the hash referenced by $pages.
# Exits with status 1 if the list cannot be retrieved.
sub get_all_mediafiles {
    my $pages = shift;
    # Attach list of all pages for media files from the API,
    # they are in a different namespace, only one namespace
    # can be queried at the same moment
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        apnamespace => get_mw_namespace_id("File"),
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        print STDERR "fatal: could not get the list of pages for media files.\n";
        print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
        print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
        exit 1;
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

# For every page already in the hash referenced by $pages, look up the
# media files it links to or includes, and add those to the same hash.
sub get_linked_mediafiles {
    my $pages = shift;
    my @titles = map { $_->{title} } values(%{$pages});

    # The query is split in small batches because of the MW API limit of
    # the number of links to be returned (500 links max).
    my $batch = 10;
    while (@titles) {
        if ($#titles < $batch) {
            $batch = $#titles;
        }
        my @slice = @titles[0..$batch];

        # pattern 'page1|page2|...' required by the API
        my $mw_titles = join('|', @slice);

        # Media files could be included or linked from
        # a page, get all related
        my $query = {
            action => 'query',
            prop => 'links|images',
            titles => $mw_titles,
            plnamespace => get_mw_namespace_id("File"),
            pllimit => 'max'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
            my @media_titles;
            if (defined($page->{links})) {
                my @link_titles
                    = map { $_->{title} } @{$page->{links}};
                push(@media_titles, @link_titles);
            }
            if (defined($page->{images})) {
                my @image_titles
                    = map { $_->{title} } @{$page->{images}};
                push(@media_titles, @image_titles);
            }
            if (@media_titles) {
                get_mw_page_list(\@media_titles, $pages);
            }
        }

        @titles = @titles[($batch+1)..$#titles];
    }
    return;
}

# Return a hash (title, timestamp, content) describing the version of media
# file $filename uploaded at $timestamp, or an empty hash when the wiki has
# no version of the file with that timestamp.
sub get_mw_mediafile_for_page_revision {
    # Name of the file on Wiki, with the prefix.
    my $filename = shift;
    my $timestamp = shift;
    my %mediafile;

    # Search if on a media file with given timestamp exists on
    # MediaWiki. In that case download the file.
    my $query = {
        action => 'query',
        prop => 'imageinfo',
        titles => "File:" . $filename,
        iistart => $timestamp,
        iiend => $timestamp,
        iiprop => 'timestamp|archivename|url',
        iilimit => 1
    };
    my $result = $mediawiki->api($query);

    my ($fileid, $file) = each( %{$result->{query}->{pages}} );
    # If not defined it means there is no revision of the file for
    # given timestamp.
    if (defined($file->{imageinfo})) {
        $mediafile{title} = $filename;

        my $fileinfo = pop(@{$file->{imageinfo}});
        $mediafile{timestamp} = $fileinfo->{timestamp};
        # Mediawiki::API's download function doesn't support https URLs
        # and can't download old versions of files.
        print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
        $mediafile{content} = download_mw_mediafile($fileinfo->{url});
    }
    return %mediafile;
}

# Fetch $url with the API object's user agent and return the decoded body.
# Exits with status 1 on any response other than 200.
sub download_mw_mediafile {
    my $url = shift;

    my $response = $mediawiki->{ua}->get($url);
    if ($response->code == 200) {
        return $response->decoded_content;
    } else {
        print STDERR "Error downloading mediafile from :\n";
        print STDERR "URL: $url\n";
        print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
        exit 1;
    }
}

# Return the MediaWiki revision number recorded in the Git note attached to
# refs/mediawiki/$remotename/master, or 0 when there is no such note. The
# message printed on STDERR is left without a final newline so that the
# caller can finish the sentence.
sub get_last_local_revision {
    # Get note regarding last mediawiki revision
    my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
    my @note_info = split(/ /, $note);

    my $lastrevision_number;
    if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
        print STDERR "No previous mediawiki revision found";
        $lastrevision_number = 0;
    } else {
        # Notes are formatted : mediawiki_revision: #number
        $lastrevision_number = $note_info[1];
        chomp($lastrevision_number);
        print STDERR "Last local mediawiki revision found is $lastrevision_number";
    }
    return $lastrevision_number;
}

# Get the last remote revision without taking in account which pages are
# tracked or not. This function makes a single request to the wiki thus
# avoid a loop onto all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        list => 'recentchanges',
        prop => 'revisions',
        rclimit => '1',
        rcdir => 'older',
    };
    my $result = $mediawiki->api($query);
    return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
    mw_connect_maybe();

    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my $max_rev_num = 0;

    print STDERR "Getting last revision id on tracked pages...\n";

    foreach my $page (@pages) {
        my $id = $page->{pageid};

        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'ids|timestamp',
            pageids => $id,
        };

        my $result = $mediawiki->api($query);

        my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

        # Remembered so that mw_push_file can pass it as basetimestamp.
        $basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

        $max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
    }

    print STDERR "Last remote revision found is $max_rev_num.\n";
    return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
    my $string = shift;
    my $page_created = shift;
    # Mediawiki does not allow blank space at the end of a page and ends with a single \n.
    # This function right trims a string and adds a \n at the end to follow this rule
    $string =~ s/\s+$//;
    if ($string eq "" && $page_created) {
        # Creating empty pages is forbidden.
        $string = EMPTY_CONTENT;
    }
    return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
    my $string = shift;
    if ($string eq EMPTY_CONTENT) {
        $string = "";
    }
    # This \n is important. This is due to mediawiki's way to handle end of files.
    return $string . "\n";
}

# Convert a Git file name into a MediaWiki title: SLASH_REPLACEMENT becomes
# "/" and the characters MediaWiki forbids are encoded.
sub mediawiki_clean_filename {
    my $filename = shift;
    $filename =~ s{@{[SLASH_REPLACEMENT]}}{/}g;
    # [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
    # Do a variant of URL-encoding, i.e. looks like URL-encoding,
    # but with _ added to prevent MediaWiki from thinking this is
    # an actual special character.
    $filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
    # If we use the uri escape before
    # we should unescape here, before anything

    return $filename;
}

# Convert a MediaWiki title into a Git file name: "/" becomes
# SLASH_REPLACEMENT, spaces become "_", and "_%_XX" sequences are decoded.
sub mediawiki_smudge_filename {
    my $filename = shift;
    $filename =~ s{/}{@{[SLASH_REPLACEMENT]}}g;
    $filename =~ s/ /_/g;
    # Decode forbidden characters encoded in mediawiki_clean_filename
    $filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
    return $filename;
}

# Print $content on STDOUT as a fast-import "data" command.
sub literal_data {
    my ($content) = @_;
    # bytes::length() is only available once the bytes module is loaded;
    # require (unlike use) loads it without enabling byte semantics.
    require bytes;
    print STDOUT "data ", bytes::length($content), "\n", $content;
    return;
}

sub literal_data_raw {
    # Output possibly binary content.
    my ($content) = @_;
    require bytes;
    # Avoid confusion between size in bytes and in characters
    utf8::downgrade($content);
    binmode STDOUT, ":raw";
    print STDOUT "data ", bytes::length($content), "\n", $content;
    binmode STDOUT, ":encoding(UTF-8)";
    return;
}

# Answer the remote-helper "capabilities" command.
sub mw_capabilities {
    # Revisions are imported to the private namespace
    # refs/mediawiki/$remotename/ by the helper and fetched into
    # refs/remotes/$remotename later by fetch.
    print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
    print STDOUT "import\n";
    print STDOUT "list\n";
    print STDOUT "push\n";
    print STDOUT "\n";
    return;
}

# Answer the remote-helper "list" command.
sub mw_list {
    # MediaWiki do not have branches, we consider one branch arbitrarily
    # called master, and HEAD pointing to it.
    print STDOUT "? refs/heads/master\n";
    print STDOUT "\@refs/heads/master HEAD\n";
    print STDOUT "\n";
    return;
}

# Answer the remote-helper "option" command: no option is supported.
sub mw_option {
    print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
    print STDOUT "unsupported\n";
    return;
}

# Return the list of {pageid, revid} hashes for the revisions of page $id
# starting at revision $fetch_from. With a shallow import, only the most
# recent of them is returned.
sub fetch_mw_revisions_for_page {
    my $page = shift;
    my $id = shift;
    my $fetch_from = shift;
    my @page_revs = ();
    my $query = {
        action => 'query',
        prop => 'revisions',
        rvprop => 'ids',
        rvdir => 'newer',
        rvstartid => $fetch_from,
        rvlimit => 500,
        pageids => $id,
    };

    my $revnum = 0;
    # Get 500 revisions at a time due to the mediawiki api limit
    while (1) {
        my $result = $mediawiki->api($query);

        # Parse each of those 500 revisions
        foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
            my $page_rev_ids;
            $page_rev_ids->{pageid} = $page->{pageid};
            $page_rev_ids->{revid} = $revision->{revid};
            push(@page_revs, $page_rev_ids);
            $revnum++;
        }
        last unless $result->{'query-continue'};
        $query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
    }
    if ($shallow_import && @page_revs) {
        print STDERR " Found 1 revision (shallow import).\n";
        @page_revs = sort { $b->{revid} <=> $a->{revid} } (@page_revs);
        return $page_revs[0];
    }
    print STDERR " Found ", $revnum, " revision(s).\n";
    return @page_revs;
}

# Collect the revisions of every page in @$pages starting at $fetch_from.
# Returns the page counter followed by the list of revisions.
sub fetch_mw_revisions {
    my $pages = shift; my @pages = @{$pages};
    my $fetch_from = shift;

    my @revisions = ();
    my $n = 1;
    foreach my $page (@pages) {
        my $id = $page->{pageid};

        print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
        $n++;
        my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
        @revisions = (@page_revs, @revisions);
    }

    return ($n, @revisions);
}

# Quote a path for fast-import: backslashes, double quotes and newlines are
# escaped and the result is wrapped in double quotes.
sub fe_escape_path {
    my $path = shift;
    $path =~ s/\\/\\\\/g;
    $path =~ s/"/\\"/g;
    $path =~ s/\n/\\n/g;
    return '"' . $path . '"';
}

# Write on STDOUT the fast-import commands for one wiki revision: a commit
# on refs/mediawiki/$remotename/master, followed by a note commit recording
# the MediaWiki revision number.
#   $commit      - hashref with title, comment, content, author, date and
#                  mw_revision
#   $full_import - true when importing from the beginning (clone)
#   $n           - mark number of the commit
#   $mediafile   - optional hashref with title and content of a media file
#                  to add in the same commit
sub import_file_revision {
    my $commit = shift;
    my %commit = %{$commit};
    my $full_import = shift;
    my $n = shift;
    my $mediafile = shift;
    my %mediafile;
    if ($mediafile) {
        %mediafile = %{$mediafile};
    }

    my $title = $commit{title};
    my $comment = $commit{comment};
    my $content = $commit{content};
    my $author = $commit{author};
    my $date = $commit{date};

    print STDOUT "commit refs/mediawiki/$remotename/master\n";
    print STDOUT "mark :$n\n";
    print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
    literal_data($comment);

    # If it's not a clone, we need to know where to start from
    if (!$full_import && $n == 1) {
        print STDOUT "from refs/mediawiki/$remotename/master^0\n";
    }
    if ($content ne DELETED_CONTENT) {
        print STDOUT "M 644 inline " .
            fe_escape_path($title . ".mw") . "\n";
        literal_data($content);
        if (%mediafile) {
            print STDOUT "M 644 inline "
                . fe_escape_path($mediafile{title}) . "\n";
            literal_data_raw($mediafile{content});
        }
        print STDOUT "\n\n";
    } else {
        print STDOUT "D " . fe_escape_path($title . ".mw") . "\n";
    }

    # mediawiki revision number in the git note
    if ($full_import && $n == 1) {
        print STDOUT "reset refs/notes/$remotename/mediawiki\n";
    }
    print STDOUT "commit refs/notes/$remotename/mediawiki\n";
    print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
    literal_data("Note added by git-mediawiki during import");
    if (!$full_import && $n == 1) {
        print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
    }
    print STDOUT "N inline :$n\n";
    literal_data("mediawiki_revision: " . $commit{mw_revision});
    print STDOUT "\n\n";
    return;
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
# Returns the list of arguments; dies on any other line or if the input
# ends before the terminating blank line.
sub get_more_refs {
    my $cmd = shift;
    my @refs;
    while (1) {
        my $line = <STDIN>;
        if (!defined($line)) {
            die("Unexpected end of input in a '$cmd' batch");
        }
        if ($line =~ /^\Q$cmd\E (.*)$/) {
            push(@refs, $1);
        } elsif ($line eq "\n") {
            return @refs;
        } else {
            # Report the offending line itself: $_ still holds the
            # command that opened the batch.
            die("Invalid command in a '$cmd' batch: " . $line);
        }
    }
    return;
}

# Handle an "import" command and the other import commands of its batch.
sub mw_import {
    # multiple import commands can follow each other.
    my @refs = (shift, get_more_refs("import"));
    foreach my $ref (@refs) {
        mw_import_ref($ref);
    }
    print STDOUT "done\n";
    return;
}

# Import into Git the wiki revisions that are newer than the last one
# recorded locally, using the configured fetch strategy.
sub mw_import_ref {
    my $ref = shift;
    # The remote helper will call "import HEAD" and
    # "import refs/heads/master".
    # Since HEAD is a symbolic ref to master (by convention,
    # followed by the output of the command "list" that we gave),
    # we don't need to do anything in this case.
    if ($ref eq "HEAD") {
        return;
    }

    mw_connect_maybe();

    print STDERR "Searching revisions...\n";
    my $last_local = get_last_local_revision();
    my $fetch_from = $last_local + 1;
    if ($fetch_from == 1) {
        print STDERR ", fetching from beginning.\n";
    } else {
        print STDERR ", fetching from here.\n";
    }

    my $n = 0;
    if ($fetch_strategy eq "by_rev") {
        print STDERR "Fetching & writing export data by revs...\n";
        $n = mw_import_ref_by_revs($fetch_from);
    } elsif ($fetch_strategy eq "by_page") {
        print STDERR "Fetching & writing export data by pages...\n";
        $n = mw_import_ref_by_pages($fetch_from);
    } else {
        print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
        print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
        exit 1;
    }

    if ($fetch_from == 1 && $n == 0) {
        print STDERR "You appear to have cloned an empty MediaWiki.\n";
        # Something has to be done remote-helper side. If nothing is done, an error is
        # thrown saying that HEAD is referring to unknown object 0000000000000000000
        # and the clone fails.
    }
    return;
}

# "by_page" strategy: list the new revisions of each tracked page, then
# import them in increasing revision id order.
sub mw_import_ref_by_pages {

    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

    @revisions = sort { $a->{revid} <=> $b->{revid} } @revisions;
    my @revision_ids = map { $_->{revid} } @revisions;

    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# "by_rev" strategy: try every revision id from $fetch_from up to the last
# revision of the whole wiki.
sub mw_import_ref_by_revs {

    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();

    my $last_remote = get_last_global_remote_rev();
    my @revision_ids = $fetch_from..$last_remote;
    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
# Returns the number of revisions actually imported.
sub mw_import_revids {
    my $fetch_from = shift;
    my $revision_ids = shift;
    my $pages = shift;

    my $n = 0;
    my $n_actual = 0;
    my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

    foreach my $pagerevid (@$revision_ids) {
        # Count page even if we skip it, since we display
        # $n/$total and $total includes skipped pages.
        $n++;

        # fetch the content of the pages
        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'content|timestamp|comment|user|ids',
            revids => $pagerevid,
        };

        my $result = $mediawiki->api($query);

        if (!$result) {
            die "Failed to retrieve modified page for revision $pagerevid";
        }

        if (defined($result->{query}->{badrevids}->{$pagerevid})) {
            # The revision id does not exist on the remote wiki.
            next;
        }

        if (!defined($result->{query}->{pages})) {
            die "Invalid revision $pagerevid.";
        }

        my @result_pages = values(%{$result->{query}->{pages}});
        my $result_page = $result_pages[0];
        my $rev = $result_pages[0]->{revisions}->[0];

        my $page_title = $result_page->{title};

        if (!exists($pages->{$page_title})) {
            print STDERR "$n/", scalar(@$revision_ids),
                ": Skipping revision #$rev->{revid} of $page_title\n";
            next;
        }

        $n_actual++;

        my %commit;
        $commit{author} = $rev->{user} || 'Anonymous';
        $commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
        $commit{title} = mediawiki_smudge_filename($page_title);
        $commit{mw_revision} = $rev->{revid};
        $commit{content} = mediawiki_smudge($rev->{'*'});

        if (!defined($rev->{timestamp})) {
            $last_timestamp++;
        } else {
            $last_timestamp = $rev->{timestamp};
        }
        $commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

        # Differentiates classic pages and media files.
        my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
        my %mediafile;
        if ($namespace) {
            my $id = get_mw_namespace_id($namespace);
            if ($id && $id == get_mw_namespace_id("File")) {
                %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
            }
        }
        # If this is a revision of the media page for new version
        # of a file do one common commit for both file and media page.
        # Else do commit only for that page.
        print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
        import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
    }

    return $n_actual;
}

# Tell Git, for the ref given as first argument, that the push was rejected
# as non-fast-forward. Prints the usual advice first unless
# advice.pushNonFastForward is false. Always returns 0.
sub error_non_fast_forward {
    my $advice = run_git("config --bool advice.pushNonFastForward");
    chomp($advice);
    if ($advice ne "false") {
        # Native git-push would show this after the summary.
        # We can't ask it to display it cleanly, so print it
        # ourselves before.
        print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
        print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
        print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
    }
    print STDOUT "error $_[0] \"non-fast-forward\"\n";
    return 0;
}

# Upload (or delete, when $file_deleted is true) the media file
# $complete_file_name on the wiki. Returns the revision id of the file page
# after an upload; returns undef when nothing was uploaded (extension not
# allowed, deletion, or empty file).
sub mw_upload_file {
    my $complete_file_name = shift;
    my $new_sha1 = shift;
    my $extension = shift;
    my $file_deleted = shift;
    my $summary = shift;
    my $newrevid;
    my $path = "File:" . $complete_file_name;
    my %hashFiles = get_allowed_file_extensions();
    if (!exists($hashFiles{$extension})) {
        print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
        print STDERR "Check the configuration of file uploads in your mediawiki.\n";
        return $newrevid;
    }
    # Deleting and uploading a file requires a privileged user
    if ($file_deleted) {
        mw_connect_maybe();
        my $query = {
            action => 'delete',
            title => $path,
            reason => $summary
        };
        if (!$mediawiki->edit($query)) {
            print STDERR "Failed to delete file on remote wiki\n";
            print STDERR "Check your permissions on the remote site. Error code:\n";
            print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
            exit 1;
        }
    } else {
        # Don't let perl try to interpret file content as UTF-8 => use "raw"
        my $content = run_git("cat-file blob $new_sha1", "raw");
        if ($content ne "") {
            mw_connect_maybe();
            $mediawiki->{config}->{upload_url} =
                "$url/index.php/Special:Upload";
            $mediawiki->edit({
                action => 'upload',
                filename => $complete_file_name,
                comment => $summary,
                file => [undef,
                         $complete_file_name,
                         Content => $content],
                ignorewarnings => 1,
            }, {
                skip_encoding => 1
            } ) || die $mediawiki->{error}->{code} . ':'
                 . $mediawiki->{error}->{details};
            my $last_file_page = $mediawiki->get_page({title => $path});
            $newrevid = $last_file_page->{revid};
            print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
        } else {
            print STDERR "Empty file $complete_file_name not pushed.\n";
        }
    }
    return $newrevid;
}

# Push one changed file to the wiki. Returns (revision id, status) where
# status is "ok" or "non-fast-forward"; the revision id is $oldrevid when
# no new revision was created.
sub mw_push_file {
    my $diff_info = shift;
    # $diff_info contains a string in this format:
    # 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
    my @diff_info_split = split(/[ \t]/, $diff_info);

    # Filename, including .mw extension
    my $complete_file_name = shift;
    # Commit message
    my $summary = shift;
    # MediaWiki revision number. Keep the previous one by default,
    # in case there's no edit to perform.
    my $oldrevid = shift;
    my $newrevid;

    if ($summary eq EMPTY_MESSAGE) {
        $summary = '';
    }

    my $new_sha1 = $diff_info_split[3];
    my $old_sha1 = $diff_info_split[2];
    my $page_created = ($old_sha1 eq NULL_SHA1);
    my $page_deleted = ($new_sha1 eq NULL_SHA1);
    $complete_file_name = mediawiki_clean_filename($complete_file_name);

    my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
    if (!defined($extension)) {
        $extension = "";
    }
    if ($extension eq "mw") {
        my $ns = get_mw_namespace_id_for_page($complete_file_name);
        if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
            print STDERR "Ignoring media file related page: $complete_file_name\n";
            return ($oldrevid, "ok");
        }
        my $file_content;
        if ($page_deleted) {
            # Deleting a page usually requires
            # special privileges. A common
            # convention is to replace the page
            # with this content instead:
            $file_content = DELETED_CONTENT;
        } else {
            $file_content = run_git("cat-file blob $new_sha1");
        }

        mw_connect_maybe();

        my $result = $mediawiki->edit( {
            action => 'edit',
            summary => $summary,
            title => $title,
            basetimestamp => $basetimestamps{$oldrevid},
            text => mediawiki_clean($file_content, $page_created),
        }, {
            skip_encoding => 1 # Helps with names with accented characters
        });
        if (!$result) {
            if ($mediawiki->{error}->{code} == 3) {
                # edit conflicts, considered as non-fast-forward
                print STDERR 'Warning: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediwiki: ' . $mediawiki->{error}->{details} .
                    ".\n";
                return ($oldrevid, "non-fast-forward");
            } else {
                # Other errors. Shouldn't happen => just die()
                die 'Fatal: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediwiki: ' . $mediawiki->{error}->{details};
            }
        }
        $newrevid = $result->{edit}->{newrevid};
        print STDERR "Pushed file: $new_sha1 - $title\n";
    } elsif ($export_media) {
        $newrevid = mw_upload_file($complete_file_name, $new_sha1,
                                   $extension, $page_deleted,
                                   $summary);
    } else {
        print STDERR "Ignoring media file $title\n";
    }
    $newrevid = ($newrevid or $oldrevid);
    return ($newrevid, "ok");
}

# Handle a "push" command and the other push commands of its batch. Only
# pushes to refs/heads/master are accepted; branch deletion is refused.
sub mw_push {
    # multiple push statements can follow each other
    my @refsspecs = (shift, get_more_refs("push"));
    my $pushed;
    for my $refspec (@refsspecs) {
        my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
            or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
        if ($force) {
            print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
        }
        if ($local eq "") {
            print STDERR "Cannot delete remote branch on a MediaWiki\n";
            print STDOUT "error $remote cannot delete\n";
            next;
        }
        if ($remote ne "refs/heads/master") {
            print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
            print STDOUT "error $remote only master allowed\n";
            next;
        }
        if (mw_push_revision($local, $remote)) {
            $pushed = 1;
        }
    }

    # Notify Git that the push is done
    print STDOUT "\n";

    if ($pushed && $dumb_push) {
        print STDERR "Just pushed some revisions to MediaWiki.\n";
        print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
        print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
        print STDERR "\n";
        print STDERR " git pull --rebase\n";
        print STDERR "\n";
    }
    return;
}

sub mw_push_revision {
    my $local = shift;
    my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
    my $last_local_revid = get_last_local_revision();
    print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
    my $last_remote_revid = get_last_remote_revision();
    my $mw_revision = $last_remote_revid;

    # Get sha1 of commit pointed by local HEAD
    my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
    # Get sha1 of commit pointed by remotes/$remotename/master
    my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
    chomp($remoteorigin_sha1);

    if ($last_local_revid > 0 &&
        $last_local_revid < $last_remote_revid) {
        return error_non_fast_forward($remote);
    }

    if ($HEAD_sha1 eq $remoteorigin_sha1) {
        # nothing to push
        return 0;
    }

    # Get every commit in between HEAD and refs/remotes/origin/master,
    # including HEAD and refs/remotes/origin/master
    my @commit_pairs = ();
    if ($last_local_revid > 0) {
        my $parsed_sha1 = $remoteorigin_sha1;
        # Find a path from last MediaWiki commit to pushed commit
        print STDERR "Computing path from local to remote ...\n";
        my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
        my %local_ancestry;
        foreach my $line (@local_ancestry) {
            if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
                foreach my $parent (split(/ /, $parents)) {
                    $local_ancestry{$parent} = $child;
                }
            # NOTE(review): "!$line =~ /.../" parses as "(!$line) =~ /.../",
            # so this check probably never fires; "$line !~ /.../" looks
            # like the intent. Left as is because this sub continues past
            # the part of the file visible here.
            } elsif (!$line =~ /^([a-f0-9]+)/) {
                die "Unexpected output from git rev-list: $line";
            }
        }
        while ($parsed_sha1 ne $HEAD_sha1) {
            my $child = $local_ancestry{$parsed_sha1};
            if (!$child) {
                printf STDERR "Cannot find a path in history from remote commit to last commit\n";
                return error_non_fast_forward($remote);
            }
            push(@commit_pairs,
[$parsed_sha1,$child]);1185$parsed_sha1=$child;1186}1187}else{1188# No remote mediawiki revision. Export the whole1189# history (linearized with --first-parent)1190print STDERR "Warning: no common ancestor, pushing complete history\n";1191my$history= run_git("rev-list --first-parent --children$local");1192my@history=split(/\n/,$history);1193@history=@history[1..$#history];1194foreachmy$line(reverse@history) {1195my@commit_info_split=split(/[ \n]/,$line);1196push(@commit_pairs, \@commit_info_split);1197}1198}11991200foreachmy$commit_info_split(@commit_pairs) {1201my$sha1_child= @{$commit_info_split}[0];1202my$sha1_commit= @{$commit_info_split}[1];1203my$diff_infos= run_git("diff-tree -r --raw -z$sha1_child$sha1_commit");1204# TODO: we could detect rename, and encode them with a #redirect on the wiki.1205# TODO: for now, it's just a delete+add1206my@diff_info_list=split(/\0/,$diff_infos);1207# Keep the subject line of the commit message as mediawiki comment for the revision1208my$commit_msg= run_git("log --no-walk --format=\"%s\"$sha1_commit");1209chomp($commit_msg);1210# Push every blob1211while(@diff_info_list) {1212my$status;1213# git diff-tree -z gives an output like1214# <metadata>\0<filename1>\01215# <metadata>\0<filename2>\01216# and we've split on \0.1217my$info=shift(@diff_info_list);1218my$file=shift(@diff_info_list);1219($mw_revision,$status) = mw_push_file($info,$file,$commit_msg,$mw_revision);1220if($statuseq"non-fast-forward") {1221# we may already have sent part of the1222# commit to MediaWiki, but it's too1223# late to cancel it. 
Stop the push in1224# the middle, but still give an1225# accurate error message.1226return error_non_fast_forward($remote);1227}1228if($statusne"ok") {1229die("Unknown error from mw_push_file()");1230}1231}1232unless($dumb_push) {1233 run_git("notes --ref=$remotename/mediawikiadd -f -m\"mediawiki_revision:$mw_revision\"$sha1_commit");1234 run_git("update-ref -m\"Git-MediaWiki push\"refs/mediawiki/$remotename/master$sha1_commit$sha1_child");1235}1236}12371238print STDOUT "ok$remote\n";1239return1;1240}12411242sub get_allowed_file_extensions {1243 mw_connect_maybe();12441245my$query= {1246 action =>'query',1247 meta =>'siteinfo',1248 siprop =>'fileextensions'1249};1250my$result=$mediawiki->api($query);1251my@file_extensions=map{$_->{ext}} @{$result->{query}->{fileextensions}};1252my%hashFile=map{$_=>1}@file_extensions;12531254return%hashFile;1255}12561257# In memory cache for MediaWiki namespace ids.1258my%namespace_id;12591260# Namespaces whose id is cached in the configuration file1261# (to avoid duplicates)1262my%cached_mw_namespace_id;12631264# Return MediaWiki id for a canonical namespace name.1265# Ex.: "File", "Project".1266sub get_mw_namespace_id {1267 mw_connect_maybe();1268my$name=shift;12691270if(!exists$namespace_id{$name}) {1271# Look at configuration file, if the record for that namespace is1272# already cached. 
Namespaces are stored in form:1273# "Name_of_namespace:Id_namespace", ex.: "File:6".1274my@temp=split(/[\n]/, run_git("config --get-all remote."1275.$remotename.".namespaceCache"));1276chomp(@temp);1277foreachmy$ns(@temp) {1278my($n,$id) =split(/:/,$ns);1279if($ideq'notANameSpace') {1280$namespace_id{$n} = {is_namespace =>0};1281}else{1282$namespace_id{$n} = {is_namespace =>1, id =>$id};1283}1284$cached_mw_namespace_id{$n} =1;1285}1286}12871288if(!exists$namespace_id{$name}) {1289print STDERR "Namespace$namenot found in cache, querying the wiki ...\n";1290# NS not found => get namespace id from MW and store it in1291# configuration file.1292my$query= {1293 action =>'query',1294 meta =>'siteinfo',1295 siprop =>'namespaces'1296};1297my$result=$mediawiki->api($query);12981299while(my($id,$ns) =each(%{$result->{query}->{namespaces}})) {1300if(defined($ns->{id}) &&defined($ns->{canonical})) {1301$namespace_id{$ns->{canonical}} = {is_namespace =>1, id =>$ns->{id}};1302if($ns->{'*'}) {1303# alias (e.g. french Fichier: as alias for canonical File:)1304$namespace_id{$ns->{'*'}} = {is_namespace =>1, id =>$ns->{id}};1305}1306}1307}1308}13091310my$ns=$namespace_id{$name};1311my$id;13121313unless(defined$ns) {1314print STDERR "No such namespace$nameon MediaWiki.\n";1315$ns= {is_namespace =>0};1316$namespace_id{$name} =$ns;1317}13181319if($ns->{is_namespace}) {1320$id=$ns->{id};1321}13221323# Store "notANameSpace" as special value for inexisting namespaces1324my$store_id= ($id||'notANameSpace');13251326# Store explicitely requested namespaces on disk1327if(!exists$cached_mw_namespace_id{$name}) {1328 run_git("config --add remote.".$remotename1329.".namespaceCache\"".$name.":".$store_id."\"");1330$cached_mw_namespace_id{$name} =1;1331}1332return$id;1333}13341335sub get_mw_namespace_id_for_page {1336my$namespace=shift;1337if($namespace=~/^([^:]*):/) {1338return get_mw_namespace_id($namespace);1339}else{1340return;1341}1342}