#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

use URI::Escape;

# MediaWiki filenames can contain forward slashes. This variable decides
# by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => '%2F';

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => '0000000000000000000000000000000000000000';

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

use constant EMPTY => q{};

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote.${remotename}.pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.${remotename}.categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport");
chomp($import_media);
$import_media = ($import_media eq 'true');

# Export media files on push
my $export_media = run_git("config --get --bool remote.${remotename}.mediaexport");
chomp($export_media);
$export_media = !($export_media eq 'false');

my $wiki_login = run_git("config --get remote.${remotename}.mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote.${remotename}.mwPassword");
my $wiki_domain = run_git("config --get remote.${remotename}.mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote.${remotename}.shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq 'true');
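
# For reference, a minimal setup reading the variables above might look
# like this (illustrative values only; the mediawiki:: prefix is what
# makes Git invoke this remote helper):
#
#   git clone mediawiki::http://example.com/wiki repo
#   cd repo
#   git config remote.origin.pages "Main_Page Sandbox"
#   git config remote.origin.categories "SomeCategory"
#   git config remote.origin.mediaimport true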

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when the wiki has many pages and fetches happen often
# enough that each one concerns only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.${remotename}.fetchStrategy");
if (!$fetch_strategy) {
    $fetch_strategy = run_git('config --get mediawiki.fetchStrategy');
}
chomp($fetch_strategy);
if (!$fetch_strategy) {
    $fetch_strategy = 'by_page';
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.${remotename}.dumbPush");
if (!$dumb_push) {
    $dumb_push = run_git('config --get --bool mediawiki.dumbPush');
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq 'true');

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
while (<STDIN>) {
    chomp;

    if (!parse_command($_)) {
        last;
    }

    BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
                     # command is fully processed.
}

########################## Functions ##############################

sub parse_command {
    my ($line) = @_;
    my @cmd = split(/ /, $line);
    if (!defined $cmd[0]) {
        return 0;
    }
    if ($cmd[0] eq 'capabilities') {
        die("Too many arguments for capabilities\n")
            if (defined($cmd[1]));
        mw_capabilities();
    } elsif ($cmd[0] eq 'list') {
        die("Too many arguments for list\n") if (defined($cmd[2]));
        mw_list($cmd[1]);
    } elsif ($cmd[0] eq 'import') {
        die("Invalid arguments for import\n")
            if ($cmd[1] eq EMPTY || defined($cmd[2]));
        mw_import($cmd[1]);
    } elsif ($cmd[0] eq 'option') {
        die("Too many arguments for option\n")
            if ($cmd[1] eq EMPTY || $cmd[2] eq EMPTY || defined($cmd[3]));
        mw_option($cmd[1], $cmd[2]);
    } elsif ($cmd[0] eq 'push') {
        mw_push($cmd[1]);
    } else {
        print {*STDERR} "Unknown command. Aborting...\n";
        return 0;
    }
    return 1;
}
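
# For illustration, Git drives this parser through the standard
# remote-helper protocol (see Git's gitremote-helpers documentation);
# commands arrive one per line on stdin, e.g. during a fetch:
#
#   capabilities
#   list
#   import refs/heads/master
#
# followed by a blank line.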

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
    if ($mediawiki) {
        return;
    }
    $mediawiki = MediaWiki::API->new;
    $mediawiki->{config}->{api_url} = "${url}/api.php";
    if ($wiki_login) {
        my %credential = (
            'url' => $url,
            'username' => $wiki_login,
            'password' => $wiki_passwd
        );
        Git::credential(\%credential);
        my $request = {lgname => $credential{username},
                       lgpassword => $credential{password},
                       lgdomain => $wiki_domain};
        if ($mediawiki->login($request)) {
            Git::credential(\%credential, 'approve');
            print {*STDERR} qq(Logged in mediawiki user "$credential{username}".\n);
        } else {
            print {*STDERR} qq(Failed to log in mediawiki user "$credential{username}" on ${url}\n);
            print {*STDERR} ' (error ' .
                $mediawiki->{error}->{code} . ': ' .
                $mediawiki->{error}->{details} . ")\n";
            Git::credential(\%credential, 'reject');
            exit 1;
        }
    }
    return;
}

sub fatal_mw_error {
    my $action = shift;
    print STDERR "fatal: could not $action.\n";
    print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
    if ($url =~ /^https/) {
        print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
        print STDERR "fatal: and the SSL certificate is correct.\n";
    } else {
        print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
    }
    print STDERR "fatal: (error " .
        $mediawiki->{error}->{code} . ': ' .
        $mediawiki->{error}->{details} . ")\n";
    exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
    my $pages = shift;
    get_mw_page_list(\@tracked_pages, $pages);
    return;
}

sub get_mw_page_list {
    my $page_list = shift;
    my $pages = shift;
    my @some_pages = @{$page_list};
    while (@some_pages) {
        my $last_page = 50;
        if ($#some_pages < $last_page) {
            $last_page = $#some_pages;
        }
        my @slice = @some_pages[0..$last_page];
        get_mw_first_pages(\@slice, $pages);
        @some_pages = @some_pages[51..$#some_pages];
    }
    return;
}

sub get_mw_tracked_categories {
    my $pages = shift;
    foreach my $category (@tracked_categories) {
        if (index($category, ':') < 0) {
            # MediaWiki requires the Category
            # prefix, but let's not force the user
            # to specify it.
            $category = "Category:${category}";
        }
        my $mw_pages = $mediawiki->list( {
            action => 'query',
            list => 'categorymembers',
            cmtitle => $category,
            cmlimit => 'max' } )
            || die $mediawiki->{error}->{code} . ': '
                . $mediawiki->{error}->{details} . "\n";
        foreach my $page (@{$mw_pages}) {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

sub get_mw_all_pages {
    my $pages = shift;
    # No user-provided list, get the list of pages from the API.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("get the list of wiki pages");
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}
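
# Note on batching: MediaWiki caps how many titles a single 'query'
# request may carry (50 for regular users on a stock wiki), which is why
# get_mw_page_list() above walks its input in slices and hands each slice
# to get_mw_first_pages() below, joined as 'page1|page2|...'.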

# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
    my $some_pages = shift;
    my @some_pages = @{$some_pages};

    my $pages = shift;

    # pattern 'page1|page2|...' required by the API
    my $titles = join('|', @some_pages);

    my $mw_pages = $mediawiki->api({
        action => 'query',
        titles => $titles,
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("query the list of wiki pages");
    }
    while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
        if ($id < 0) {
            print {*STDERR} "Warning: page $page->{title} not found on wiki\n";
        } else {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
    mw_connect_maybe();

    print {*STDERR} "Listing pages on remote wiki...\n";

    my %pages; # hash on page titles to avoid duplicates
    my $user_defined;
    if (@tracked_pages) {
        $user_defined = 1;
        # The user provided a list of page titles, but we
        # still need to query the API to get the page IDs.
        get_mw_tracked_pages(\%pages);
    }
    if (@tracked_categories) {
        $user_defined = 1;
        get_mw_tracked_categories(\%pages);
    }
    if (!$user_defined) {
        get_mw_all_pages(\%pages);
    }
    if ($import_media) {
        print {*STDERR} "Getting media files for selected pages...\n";
        if ($user_defined) {
            get_linked_mediafiles(\%pages);
        } else {
            get_all_mediafiles(\%pages);
        }
    }
    print {*STDERR} (scalar keys %pages) . " pages found.\n";
    return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
    my $args = shift;
    my $encoding = (shift || 'encoding(UTF-8)');
    open(my $git, "-|:${encoding}", "git ${args}")
        or die "Unable to fork: $!\n";
    my $res = do {
        local $/ = undef;
        <$git>
    };
    close($git);

    return $res;
}
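
# run_git() is used throughout this file; the two call shapes look like
# this (the sha1 below is a placeholder):
#
#   my $login = run_git("config --get remote.${remotename}.mwLogin");
#   my $blob  = run_git("cat-file blob ${sha1}", 'raw');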

sub get_all_mediafiles {
    my $pages = shift;
    # Attach the list of all pages for media files from the API.
    # They are in a different namespace, and only one namespace
    # can be queried at the same moment.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        apnamespace => get_mw_namespace_id('File'),
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        print {*STDERR} "fatal: could not get the list of pages for media files.\n";
        print {*STDERR} "fatal: '$url' does not appear to be a mediawiki\n";
        print {*STDERR} "fatal: make sure '$url/api.php' is a valid page.\n";
        exit 1;
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

sub get_linked_mediafiles {
    my $pages = shift;
    my @titles = map { $_->{title} } values(%{$pages});

    # The query is split in small batches because of the MW API limit on
    # the number of links to be returned (500 links max).
    my $batch = 10;
    while (@titles) {
        if ($#titles < $batch) {
            $batch = $#titles;
        }
        my @slice = @titles[0..$batch];

        # pattern 'page1|page2|...' required by the API
        my $mw_titles = join('|', @slice);

        # Media files could be included or linked from
        # a page, get all related
        my $query = {
            action => 'query',
            prop => 'links|images',
            titles => $mw_titles,
            plnamespace => get_mw_namespace_id('File'),
            pllimit => 'max'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
            my @media_titles;
            if (defined($page->{links})) {
                my @link_titles
                    = map { $_->{title} } @{$page->{links}};
                push(@media_titles, @link_titles);
            }
            if (defined($page->{images})) {
                my @image_titles
                    = map { $_->{title} } @{$page->{images}};
                push(@media_titles, @image_titles);
            }
            if (@media_titles) {
                get_mw_page_list(\@media_titles, $pages);
            }
        }

        @titles = @titles[($batch + 1)..$#titles];
    }
    return;
}

sub get_mw_mediafile_for_page_revision {
    # Name of the file on Wiki, with the prefix.
    my $filename = shift;
    my $timestamp = shift;
    my %mediafile;

    # Check whether a media file with the given timestamp exists on
    # MediaWiki. In that case, download the file.
    my $query = {
        action => 'query',
        prop => 'imageinfo',
        titles => "File:${filename}",
        iistart => $timestamp,
        iiend => $timestamp,
        iiprop => 'timestamp|archivename|url',
        iilimit => 1
    };
    my $result = $mediawiki->api($query);

    my ($fileid, $file) = each( %{$result->{query}->{pages}} );
    # If not defined it means there is no revision of the file for
    # the given timestamp.
    if (defined($file->{imageinfo})) {
        $mediafile{title} = $filename;

        my $fileinfo = pop(@{$file->{imageinfo}});
        $mediafile{timestamp} = $fileinfo->{timestamp};
        # MediaWiki::API's download function doesn't support https URLs
        # and can't download old versions of files, so fetch the URL
        # directly with the underlying user agent instead.
        print {*STDERR} "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
        $mediafile{content} = download_mw_mediafile($fileinfo->{url});
    }
    return %mediafile;
}

sub download_mw_mediafile {
    my $download_url = shift;

    my $response = $mediawiki->{ua}->get($download_url);
    if ($response->code == 200) {
        return $response->decoded_content;
    } else {
        print {*STDERR} "Error downloading mediafile from:\n";
        print {*STDERR} "URL: ${download_url}\n";
        print {*STDERR} 'Server response: ' . $response->code . q{ } . $response->message . "\n";
        exit 1;
    }
}

sub get_last_local_revision {
    # Get note regarding last mediawiki revision.
    my $note = run_git("notes --ref=${remotename}/mediawiki show refs/mediawiki/${remotename}/master 2>/dev/null");
    my @note_info = split(/ /, $note);

    my $lastrevision_number;
    if (!(defined($note_info[0]) && $note_info[0] eq 'mediawiki_revision:')) {
        print {*STDERR} 'No previous mediawiki revision found';
        $lastrevision_number = 0;
    } else {
        # Notes are formatted: mediawiki_revision: #number
        $lastrevision_number = $note_info[1];
        chomp($lastrevision_number);
        print {*STDERR} "Last local mediawiki revision found is ${lastrevision_number}";
    }
    return $lastrevision_number;
}
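
# For reference, the note parsed above is written during import
# (import_file_revision) and push (mw_push_revision); its content is a
# single line such as (the revision number is an example):
#
#   mediawiki_revision: 42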

# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki,
# thus avoiding a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        list => 'recentchanges',
        prop => 'revisions',
        rclimit => '1',
        rcdir => 'older',
    };
    my $result = $mediawiki->api($query);
    return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
    mw_connect_maybe();

    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my $max_rev_num = 0;

    print {*STDERR} "Getting last revision id on tracked pages...\n";

    foreach my $page (@pages) {
        my $id = $page->{pageid};

        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'ids|timestamp',
            pageids => $id,
        };

        my $result = $mediawiki->api($query);

        my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

        $basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

        $max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
    }

    print {*STDERR} "Last remote revision found is $max_rev_num.\n";
    return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
    my $string = shift;
    my $page_created = shift;
    # MediaWiki does not allow trailing blank space on a page, and a page
    # ends with a single \n. Right-trim the string and add a trailing \n
    # to follow this rule.
    $string =~ s/\s+$//;
    if ($string eq EMPTY && $page_created) {
        # Creating empty pages is forbidden.
        $string = EMPTY_CONTENT;
    }
    return $string."\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
    my $string = shift;
    if ($string eq EMPTY_CONTENT) {
        $string = EMPTY;
    }
    # This \n is important: it matches MediaWiki's handling of the end of
    # a file.
    return "${string}\n";
}

sub mediawiki_clean_filename {
    my $filename = shift;
    $filename =~ s{@{[SLASH_REPLACEMENT]}}{/}g;
    # [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
    # Do a variant of URL-encoding, i.e. looks like URL-encoding,
    # but with _ added to prevent MediaWiki from thinking this is
    # an actual special character.
    $filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
    # If we use URI escaping earlier, we should unescape here,
    # before anything else.

    return $filename;
}

sub mediawiki_smudge_filename {
    my $filename = shift;
    $filename =~ s{/}{@{[SLASH_REPLACEMENT]}}g;
    $filename =~ s/ /_/g;
    # Decode forbidden characters encoded in mediawiki_clean_filename
    $filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf('%c', hex($1))/ge;
    return $filename;
}

sub literal_data {
    my ($content) = @_;
    print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
    return;
}

sub literal_data_raw {
    # Output possibly binary content.
    my ($content) = @_;
    # Avoid confusion between size in bytes and in characters
    utf8::downgrade($content);
    binmode STDOUT, ':raw';
    print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
    binmode STDOUT, ':encoding(UTF-8)';
    return;
}
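
# literal_data() and literal_data_raw() emit git-fast-import 'data'
# commands: a 'data <byte count>' header followed by the raw bytes.
# For example, the string "foo\n" becomes:
#
#   data 4
#   foo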

sub mw_capabilities {
    # Revisions are imported to the private namespace
    # refs/mediawiki/$remotename/ by the helper and fetched into
    # refs/remotes/$remotename later by fetch.
    print {*STDOUT} "refspec refs/heads/*:refs/mediawiki/${remotename}/*\n";
    print {*STDOUT} "import\n";
    print {*STDOUT} "list\n";
    print {*STDOUT} "push\n";
    print {*STDOUT} "\n";
    return;
}

sub mw_list {
    # MediaWiki does not have branches, so we consider one branch
    # arbitrarily called master, with HEAD pointing to it.
    print {*STDOUT} "? refs/heads/master\n";
    print {*STDOUT} "\@refs/heads/master HEAD\n";
    print {*STDOUT} "\n";
    return;
}

sub mw_option {
    print {*STDERR} "remote-helper command 'option $_[0]' not yet implemented\n";
    print {*STDOUT} "unsupported\n";
    return;
}

sub fetch_mw_revisions_for_page {
    my $page = shift;
    my $id = shift;
    my $fetch_from = shift;
    my @page_revs = ();
    my $query = {
        action => 'query',
        prop => 'revisions',
        rvprop => 'ids',
        rvdir => 'newer',
        rvstartid => $fetch_from,
        rvlimit => 500,
        pageids => $id,
    };

    my $revnum = 0;
    # Get 500 revisions at a time due to the MediaWiki API limit.
    while (1) {
        my $result = $mediawiki->api($query);

        # Parse each of those 500 revisions.
        foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
            my $page_rev_ids;
            $page_rev_ids->{pageid} = $page->{pageid};
            $page_rev_ids->{revid} = $revision->{revid};
            push(@page_revs, $page_rev_ids);
            $revnum++;
        }
        last if (!$result->{'query-continue'});
        $query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
    }
    if ($shallow_import && @page_revs) {
        print {*STDERR} "  Found 1 revision (shallow import).\n";
        @page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
        return $page_revs[0];
    }
    print {*STDERR} "  Found ${revnum} revision(s).\n";
    return @page_revs;
}

sub fetch_mw_revisions {
    my $pages = shift; my @pages = @{$pages};
    my $fetch_from = shift;

    my @revisions = ();
    my $n = 1;
    foreach my $page (@pages) {
        my $id = $page->{pageid};
        print {*STDERR} "page ${n}/", scalar(@pages), ': ', $page->{title}, "\n";
        $n++;
        my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
        @revisions = (@page_revs, @revisions);
    }

    return ($n, @revisions);
}

sub fe_escape_path {
    my $path = shift;
    $path =~ s/\\/\\\\/g;
    $path =~ s/"/\\"/g;
    $path =~ s/\n/\\n/g;
    return qq("${path}");
}
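
# fe_escape_path() quotes a path for the fast-import stream: backslashes,
# double quotes and newlines are escaped, and the result is wrapped in
# double quotes. For example, the title 'a"b.mw' becomes "a\"b.mw".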

sub import_file_revision {
    my $commit = shift;
    my %commit = %{$commit};
    my $full_import = shift;
    my $n = shift;
    my $mediafile = shift;
    my %mediafile;
    if ($mediafile) {
        %mediafile = %{$mediafile};
    }

    my $title = $commit{title};
    my $comment = $commit{comment};
    my $content = $commit{content};
    my $author = $commit{author};
    my $date = $commit{date};

    print {*STDOUT} "commit refs/mediawiki/${remotename}/master\n";
    print {*STDOUT} "mark :${n}\n";
    print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
    literal_data($comment);

    # If it's not a clone, we need to know where to start from.
    if (!$full_import && $n == 1) {
        print {*STDOUT} "from refs/mediawiki/${remotename}/master^0\n";
    }
    if ($content ne DELETED_CONTENT) {
        print {*STDOUT} 'M 644 inline ' .
            fe_escape_path("${title}.mw") . "\n";
        literal_data($content);
        if (%mediafile) {
            print {*STDOUT} 'M 644 inline '
                . fe_escape_path($mediafile{title}) . "\n";
            literal_data_raw($mediafile{content});
        }
        print {*STDOUT} "\n\n";
    } else {
        print {*STDOUT} 'D ' . fe_escape_path("${title}.mw") . "\n";
    }

    # mediawiki revision number in the git note
    if ($full_import && $n == 1) {
        print {*STDOUT} "reset refs/notes/${remotename}/mediawiki\n";
    }
    print {*STDOUT} "commit refs/notes/${remotename}/mediawiki\n";
    print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
    literal_data('Note added by git-mediawiki during import');
    if (!$full_import && $n == 1) {
        print {*STDOUT} "from refs/notes/${remotename}/mediawiki^0\n";
    }
    print {*STDOUT} "N inline :${n}\n";
    literal_data("mediawiki_revision: $commit{mw_revision}");
    print {*STDOUT} "\n\n";
    return;
}

# Parse a sequence of
#   <cmd> <arg1>
#   <cmd> <arg2>
#   \n
# (like a batch sequence of import or push statements).
sub get_more_refs {
    my $cmd = shift;
    my @refs;
    while (1) {
        my $line = <STDIN>;
        if ($line =~ /^$cmd (.*)$/) {
            push(@refs, $1);
        } elsif ($line eq "\n") {
            return @refs;
        } else {
            die("Invalid command in a '$cmd' batch: $line\n");
        }
    }
    return;
}

sub mw_import {
    # Multiple import commands can follow each other.
    my @refs = (shift, get_more_refs('import'));
    foreach my $ref (@refs) {
        mw_import_ref($ref);
    }
    print {*STDOUT} "done\n";
    return;
}

sub mw_import_ref {
    my $ref = shift;
    # The remote helper will call "import HEAD" and
    # "import refs/heads/master".
    # Since HEAD is a symbolic ref to master (by convention,
    # followed by the output of the command "list" that we gave),
    # we don't need to do anything in this case.
    if ($ref eq 'HEAD') {
        return;
    }

    mw_connect_maybe();

    print {*STDERR} "Searching revisions...\n";
    my $last_local = get_last_local_revision();
    my $fetch_from = $last_local + 1;
    if ($fetch_from == 1) {
        print {*STDERR} ", fetching from beginning.\n";
    } else {
        print {*STDERR} ", fetching from here.\n";
    }

    my $n = 0;
    if ($fetch_strategy eq 'by_rev') {
        print {*STDERR} "Fetching & writing export data by revs...\n";
        $n = mw_import_ref_by_revs($fetch_from);
    } elsif ($fetch_strategy eq 'by_page') {
        print {*STDERR} "Fetching & writing export data by pages...\n";
        $n = mw_import_ref_by_pages($fetch_from);
    } else {
        print {*STDERR} qq(fatal: invalid fetch strategy "${fetch_strategy}".\n);
        print {*STDERR} "Check your configuration variables remote.${remotename}.fetchStrategy and mediawiki.fetchStrategy\n";
        exit 1;
    }

    if ($fetch_from == 1 && $n == 0) {
        print {*STDERR} "You appear to have cloned an empty MediaWiki.\n";
        # Something has to be done remote-helper side. If nothing is done, an error is
        # thrown saying that HEAD is referring to unknown object 0000000000000000000
        # and the clone fails.
    }
    return;
}

sub mw_import_ref_by_pages {
    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

    @revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
    my @revision_ids = map { $_->{revid} } @revisions;

    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {
    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();

    my $last_remote = get_last_global_remote_rev();
    my @revision_ids = $fetch_from..$last_remote;
    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}
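
# The strategy used above is picked from the configuration read at the top
# of this file; for instance, to force one-query-per-revision fetching on a
# given remote (remote name is an example):
#
#   git config remote.origin.fetchStrategy by_rev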

# Import the revisions given in the second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
    my $fetch_from = shift;
    my $revision_ids = shift;
    my $pages = shift;

    my $n = 0;
    my $n_actual = 0;
    my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

    foreach my $pagerevid (@{$revision_ids}) {
        # Count page even if we skip it, since we display
        # $n/$total and $total includes skipped pages.
        $n++;

        # Fetch the content of the pages.
        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'content|timestamp|comment|user|ids',
            revids => $pagerevid,
        };

        my $result = $mediawiki->api($query);

        if (!$result) {
            die "Failed to retrieve modified page for revision $pagerevid\n";
        }

        if (defined($result->{query}->{badrevids}->{$pagerevid})) {
            # The revision id does not exist on the remote wiki.
            next;
        }

        if (!defined($result->{query}->{pages})) {
            die "Invalid revision ${pagerevid}.\n";
        }

        my @result_pages = values(%{$result->{query}->{pages}});
        my $result_page = $result_pages[0];
        my $rev = $result_pages[0]->{revisions}->[0];

        my $page_title = $result_page->{title};

        if (!exists($pages->{$page_title})) {
            print {*STDERR} "${n}/", scalar(@{$revision_ids}),
                ": Skipping revision #$rev->{revid} of ${page_title}\n";
            next;
        }

        $n_actual++;

        my %commit;
        $commit{author} = $rev->{user} || 'Anonymous';
        $commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
        $commit{title} = mediawiki_smudge_filename($page_title);
        $commit{mw_revision} = $rev->{revid};
        $commit{content} = mediawiki_smudge($rev->{'*'});

        if (!defined($rev->{timestamp})) {
            $last_timestamp++;
        } else {
            $last_timestamp = $rev->{timestamp};
        }
        $commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

        # Differentiate classic pages from media files.
        my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
        my %mediafile;
        if ($namespace) {
            my $id = get_mw_namespace_id($namespace);
            if ($id && $id == get_mw_namespace_id('File')) {
                %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
            }
        }
        # If this is a revision of the media page for a new version
        # of a file, do one common commit for both the file and the
        # media page. Else do a commit only for that page.
        print {*STDERR} "${n}/", scalar(@{$revision_ids}), ": Revision #$rev->{revid} of $commit{title}\n";
        import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
    }

    return $n_actual;
}
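
# Each imported revision ends up as a %commit hash like the following
# (values are examples) before being serialized to the fast-import stream
# by import_file_revision():
#
#   (author => 'Alice', comment => 'typo fix', title => 'Main_Page',
#    mw_revision => 42, content => "...\n", date => $datetime)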

sub error_non_fast_forward {
    my $advice = run_git('config --bool advice.pushNonFastForward');
    chomp($advice);
    if ($advice ne 'false') {
        # Native git-push would show this after the summary.
        # We can't ask it to display it cleanly, so print it
        # ourselves before.
        print {*STDERR} "To prevent you from losing history, non-fast-forward updates were rejected\n";
        print {*STDERR} "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
        print {*STDERR} "'Note about fast-forwards' section of 'git push --help' for details.\n";
    }
    print {*STDOUT} qq(error $_[0] "non-fast-forward"\n);
    return 0;
}

sub mw_upload_file {
    my $complete_file_name = shift;
    my $new_sha1 = shift;
    my $extension = shift;
    my $file_deleted = shift;
    my $summary = shift;
    my $newrevid;
    my $path = "File:${complete_file_name}";
    my %hashFiles = get_allowed_file_extensions();
    if (!exists($hashFiles{$extension})) {
        print {*STDERR} "${complete_file_name} is not a permitted file on this wiki.\n";
        print {*STDERR} "Check the configuration of file uploads in your mediawiki.\n";
        return $newrevid;
    }
    # Deleting and uploading a file requires a privileged user
    if ($file_deleted) {
        mw_connect_maybe();
        my $query = {
            action => 'delete',
            title => $path,
            reason => $summary
        };
        if (!$mediawiki->edit($query)) {
            print {*STDERR} "Failed to delete file on remote wiki\n";
            print {*STDERR} "Check your permissions on the remote site. Error code:\n";
            print {*STDERR} $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
            exit 1;
        }
    } else {
        # Don't let perl try to interpret file content as UTF-8 => use "raw"
        my $content = run_git("cat-file blob ${new_sha1}", 'raw');
        if ($content ne EMPTY) {
            mw_connect_maybe();
            $mediawiki->{config}->{upload_url} =
                "${url}/index.php/Special:Upload";
            $mediawiki->edit({
                action => 'upload',
                filename => $complete_file_name,
                comment => $summary,
                file => [undef,
                         $complete_file_name,
                         Content => $content],
                ignorewarnings => 1,
            }, {
                skip_encoding => 1
            } ) || die $mediawiki->{error}->{code} . ':'
                     . $mediawiki->{error}->{details} . "\n";
            my $last_file_page = $mediawiki->get_page({title => $path});
            $newrevid = $last_file_page->{revid};
            print {*STDERR} "Pushed file: ${new_sha1} - ${complete_file_name}.\n";
        } else {
            print {*STDERR} "Empty file ${complete_file_name} not pushed.\n";
        }
    }
    return $newrevid;
}
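
# The extension check above relies on get_allowed_file_extensions()
# (defined near the end of this file), which asks the wiki once per push
# via action=query&meta=siteinfo&siprop=fileextensions; files whose
# extension is not listed there are skipped with the warning above rather
# than uploaded.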

sub mw_push_file {
    my $diff_info = shift;
    # $diff_info contains a string in this format:
    # 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
    my @diff_info_split = split(/[ \t]/, $diff_info);

    # Filename, including .mw extension
    my $complete_file_name = shift;
    # Commit message
    my $summary = shift;
    # MediaWiki revision number. Keep the previous one by default,
    # in case there's no edit to perform.
    my $oldrevid = shift;
    my $newrevid;

    if ($summary eq EMPTY_MESSAGE) {
        $summary = EMPTY;
    }

    my $new_sha1 = $diff_info_split[3];
    my $old_sha1 = $diff_info_split[2];
    my $page_created = ($old_sha1 eq NULL_SHA1);
    my $page_deleted = ($new_sha1 eq NULL_SHA1);
    $complete_file_name = mediawiki_clean_filename($complete_file_name);

    my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
    if (!defined($extension)) {
        $extension = EMPTY;
    }
    if ($extension eq 'mw') {
        my $ns = get_mw_namespace_id_for_page($complete_file_name);
        if ($ns && $ns == get_mw_namespace_id('File') && (!$export_media)) {
            print {*STDERR} "Ignoring media file related page: ${complete_file_name}\n";
            return ($oldrevid, 'ok');
        }
        my $file_content;
        if ($page_deleted) {
            # Deleting a page usually requires
            # special privileges. A common
            # convention is to replace the page
            # with this content instead:
            $file_content = DELETED_CONTENT;
        } else {
            $file_content = run_git("cat-file blob ${new_sha1}");
        }

        mw_connect_maybe();

        my $result = $mediawiki->edit( {
            action => 'edit',
            summary => $summary,
            title => $title,
            basetimestamp => $basetimestamps{$oldrevid},
            text => mediawiki_clean($file_content, $page_created),
        }, {
            skip_encoding => 1 # Helps with names with accented characters
        });
        if (!$result) {
            if ($mediawiki->{error}->{code} == 3) {
                # edit conflicts, considered as non-fast-forward
                print {*STDERR} 'Warning: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} .
                    ".\n";
                return ($oldrevid, 'non-fast-forward');
            } else {
                # Other errors. Shouldn't happen => just die()
                die 'Fatal: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} . "\n";
            }
        }
        $newrevid = $result->{edit}->{newrevid};
        print {*STDERR} "Pushed file: ${new_sha1} - ${title}\n";
    } elsif ($export_media) {
        $newrevid = mw_upload_file($complete_file_name, $new_sha1,
                                   $extension, $page_deleted,
                                   $summary);
    } else {
        print {*STDERR} "Ignoring media file ${title}\n";
    }
    $newrevid = ($newrevid or $oldrevid);
    return ($newrevid, 'ok');
}
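
# mw_push() below parses each refspec with /^(\+)?([^:]*):([^:]*)$/; for
# example "+refs/heads/master:refs/heads/master" yields $force = '+',
# $local = 'refs/heads/master' and $remote = 'refs/heads/master'.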

sub mw_push {
    # Multiple push statements can follow each other.
    my @refsspecs = (shift, get_more_refs('push'));
    my $pushed;
    for my $refspec (@refsspecs) {
        my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
            or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");
        if ($force) {
            print {*STDERR} "Warning: forced push not allowed on a MediaWiki.\n";
        }
        if ($local eq EMPTY) {
            print {*STDERR} "Cannot delete remote branch on a MediaWiki\n";
            print {*STDOUT} "error ${remote} cannot delete\n";
            next;
        }
        if ($remote ne 'refs/heads/master') {
            print {*STDERR} "Only push to the branch 'master' is supported on a MediaWiki\n";
            print {*STDOUT} "error ${remote} only master allowed\n";
            next;
        }
        if (mw_push_revision($local, $remote)) {
            $pushed = 1;
        }
    }

    # Notify Git that the push is done.
    print {*STDOUT} "\n";

    if ($pushed && $dumb_push) {
        print {*STDERR} "Just pushed some revisions to MediaWiki.\n";
        print {*STDERR} "The pushed revisions now have to be re-imported, and your current branch\n";
        print {*STDERR} "needs to be updated with these re-imported commits. You can do this with\n";
        print {*STDERR} "\n";
        print {*STDERR} "  git pull --rebase\n";
        print {*STDERR} "\n";
    }
    return;
}

sub mw_push_revision {
    my $local = shift;
    my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
    my $last_local_revid = get_last_local_revision();
    print {*STDERR} ".\n"; # Finish sentence started by get_last_local_revision()
    my $last_remote_revid = get_last_remote_revision();
    my $mw_revision = $last_remote_revid;

    # Get sha1 of commit pointed by local HEAD
    my $HEAD_sha1 = run_git("rev-parse ${local} 2>/dev/null");
    chomp($HEAD_sha1);
    # Get sha1 of commit pointed by remotes/$remotename/master
    my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/${remotename}/master 2>/dev/null");
    chomp($remoteorigin_sha1);

    if ($last_local_revid > 0 &&
        $last_local_revid < $last_remote_revid) {
        return error_non_fast_forward($remote);
    }

    if ($HEAD_sha1 eq $remoteorigin_sha1) {
        # nothing to push
        return 0;
    }

    # Get every commit in between HEAD and refs/remotes/origin/master,
    # including HEAD and refs/remotes/origin/master
    my @commit_pairs = ();
    if ($last_local_revid > 0) {
        my $parsed_sha1 = $remoteorigin_sha1;
        # Find a path from last MediaWiki commit to pushed commit
        print {*STDERR} "Computing path from local to remote ...\n";
        my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents ${local} ^${parsed_sha1}"));
        my %local_ancestry;
        foreach my $line (@local_ancestry) {
            if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
                foreach my $parent (split(/ /, $parents)) {
                    $local_ancestry{$parent} = $child;
                }
            } elsif ($line !~ /^([a-f0-9]+)/) {
                die "Unexpected output from git rev-list: ${line}\n";
            }
        }
        while ($parsed_sha1 ne $HEAD_sha1) {
            my $child = $local_ancestry{$parsed_sha1};
            if (!$child) {
                print {*STDERR} "Cannot find a path in history from remote commit to last commit\n";
                return error_non_fast_forward($remote);
            }
            push(@commit_pairs, [$parsed_sha1, $child]);
            $parsed_sha1 = $child;
        }
    } else {
        # No remote mediawiki revision. Export the whole
        # history (linearized with --first-parent).
        print {*STDERR} "Warning: no common ancestor, pushing complete history\n";
        my $history = run_git("rev-list --first-parent --children ${local}");
        my @history = split(/\n/, $history);
        @history = @history[1..$#history];
        foreach my $line (reverse @history) {
            my @commit_info_split = split(/[ \n]/, $line);
            push(@commit_pairs, \@commit_info_split);
        }
    }

    foreach my $commit_info_split (@commit_pairs) {
        my $sha1_child = @{$commit_info_split}[0];
        my $sha1_commit = @{$commit_info_split}[1];
        my $diff_infos = run_git("diff-tree -r --raw -z ${sha1_child} ${sha1_commit}");
        # TODO: we could detect renames, and encode them with a #redirect on the wiki.
        # TODO: for now, it's just a delete+add
        my @diff_info_list = split(/\0/, $diff_infos);
        # Keep the subject line of the commit message as mediawiki comment for the revision
        my $commit_msg = run_git(qq(log --no-walk --format="%s" ${sha1_commit}));
        chomp($commit_msg);
        # Push every blob
        while (@diff_info_list) {
            my $status;
            # git diff-tree -z gives an output like
            # <metadata>\0<filename1>\0
            # <metadata>\0<filename2>\0
            # and we've split on \0.
            my $info = shift(@diff_info_list);
            my $file = shift(@diff_info_list);
            ($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
            if ($status eq 'non-fast-forward') {
                # we may already have sent part of the
                # commit to MediaWiki, but it's too
                # late to cancel it. Stop the push in
                # the middle, but still give an
                # accurate error message.
                return error_non_fast_forward($remote);
            }
            if ($status ne 'ok') {
                die("Unknown error from mw_push_file()\n");
            }
        }
        if (!$dumb_push) {
            run_git(qq(notes --ref=${remotename}/mediawiki add -f -m "mediawiki_revision: ${mw_revision}" ${sha1_commit}));
            run_git(qq(update-ref -m "Git-MediaWiki push" refs/mediawiki/${remotename}/master ${sha1_commit} ${sha1_child}));
        }
    }

    print {*STDOUT} "ok ${remote}\n";
    return 1;
}

sub get_allowed_file_extensions {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        meta => 'siteinfo',
        siprop => 'fileextensions'
    };
    my $result = $mediawiki->api($query);
    my @file_extensions = map { $_->{ext} } @{$result->{query}->{fileextensions}};
    my %hashFile = map { $_ => 1 } @file_extensions;

    return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
    mw_connect_maybe();
    my $name = shift;

    if (!exists $namespace_id{$name}) {
        # Look at configuration file, if the record for that namespace is
        # already cached. Namespaces are stored in form:
        # "Name_of_namespace:Id_namespace", ex.: "File:6".
        my @temp = split(/\n/,
                         run_git("config --get-all remote.${remotename}.namespaceCache"));
        chomp(@temp);
        foreach my $ns (@temp) {
            my ($n, $id) = split(/:/, $ns);
            if ($id eq 'notANameSpace') {
                $namespace_id{$n} = {is_namespace => 0};
            } else {
                $namespace_id{$n} = {is_namespace => 1, id => $id};
            }
            $cached_mw_namespace_id{$n} = 1;
        }
    }

    if (!exists $namespace_id{$name}) {
        print {*STDERR} "Namespace ${name} not found in cache, querying the wiki ...\n";
        # NS not found => get namespace id from MW and store it in
        # configuration file.
        my $query = {
            action => 'query',
            meta => 'siteinfo',
            siprop => 'namespaces'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
            if (defined($ns->{id}) && defined($ns->{canonical})) {
                $namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
                if ($ns->{'*'}) {
                    # alias (e.g. french Fichier: as alias for canonical File:)
                    $namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
                }
            }
        }
    }

    my $ns = $namespace_id{$name};
    my $id;

    if (!defined $ns) {
        print {*STDERR} "No such namespace ${name} on MediaWiki.\n";
        $ns = {is_namespace => 0};
        $namespace_id{$name} = $ns;
    }

    if ($ns->{is_namespace}) {
        $id = $ns->{id};
    }

    # Store "notANameSpace" as special value for nonexistent namespaces
    my $store_id = ($id || 'notANameSpace');

    # Store explicitly requested namespaces on disk
    if (!exists $cached_mw_namespace_id{$name}) {
        run_git(qq(config --add remote.${remotename}.namespaceCache "${name}:${store_id}"));
        $cached_mw_namespace_id{$name} = 1;
    }
    return $id;
}

sub get_mw_namespace_id_for_page {
    my $namespace = shift;
    if ($namespace =~ /^([^:]*):/) {
        # Pass only the namespace part ($1), not the full page name,
        # since get_mw_namespace_id() expects a canonical namespace name.
        return get_mw_namespace_id($1);
    } else {
        return;
    }
}
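
# Example: get_mw_namespace_id_for_page('File:Foo.png') looks up the id of
# the 'File' namespace (6 on a stock MediaWiki), while a title with no
# colon, such as 'Main_Page', returns undef.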