#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

use URI::Escape;

# MediaWiki filenames can contain forward slashes. This variable decides
# by which pattern they should be replaced
use constant SLASH_REPLACEMENT => '%2F';

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => '0000000000000000000000000000000000000000';

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote.${remotename}.pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.${remotename}.categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport");
chomp($import_media);
$import_media = ($import_media eq 'true');

# Export media files on push
my $export_media = run_git("config --get --bool remote.${remotename}.mediaexport");
chomp($export_media);
$export_media = !($export_media eq 'false');

my $wiki_login = run_git("config --get remote.${remotename}.mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote.${remotename}.mwPassword");
my $wiki_domain = run_git("config --get remote.${remotename}.mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote.${remotename}.shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq 'true');
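# Illustrative configuration (hypothetical remote name and values; the keys
# are the ones read above):
#
#   git config --add remote.origin.pages "Main_Page Help:Contents"
#   git config --add remote.origin.categories "Travels"
#   git config remote.origin.mediaimport true
#   git config remote.origin.mwLogin "WikiUser"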
# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.${remotename}.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git('config --get mediawiki.fetchStrategy');
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = 'by_page';
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.${remotename}.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git('config --get --bool mediawiki.dumbPush');
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq 'true');

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
while (<STDIN>) {
	chomp;

	if (!parse_command($_)) {
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

sub parse_command {
	my ($line) = @_;
	my @cmd = split(/ /, $line);
	if (!defined $cmd[0]) {
		return 0;
	}
	if ($cmd[0] eq 'capabilities') {
		die("Too many arguments for capabilities\n")
		    if (defined($cmd[1]));
		mw_capabilities();
	} elsif ($cmd[0] eq 'list') {
		die("Too many arguments for list\n") if (defined($cmd[2]));
		mw_list($cmd[1]);
	} elsif ($cmd[0] eq 'import') {
		die("Invalid arguments for import\n")
		    if ($cmd[1] eq "" || defined($cmd[2]));
		mw_import($cmd[1]);
	} elsif ($cmd[0] eq 'option') {
		die("Too many arguments for option\n")
		    if ($cmd[1] eq "" || $cmd[2] eq "" || defined($cmd[3]));
		mw_option($cmd[1], $cmd[2]);
	} elsif ($cmd[0] eq 'push') {
		mw_push($cmd[1]);
	} else {
		print STDERR "Unknown command. Aborting...\n";
		return 0;
	}
	return 1;
}
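# For reference, a typical exchange with Git over stdin/stdout looks roughly
# like this (sketch, not a verbatim transcript):
#
#   capabilities                -> refspec ..., import, list, push
#   list                        -> "? refs/heads/master" / "@refs/heads/master HEAD"
#   import refs/heads/master    -> a git fast-import stream on stdout
#   push refs/heads/master:refs/heads/master -> one "ok"/"error" line per ref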
# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "${url}/api.php";
	if ($wiki_login) {
		my %credential = (
			'url' => $url,
			'username' => $wiki_login,
			'password' => $wiki_passwd
		);
		Git::credential(\%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			Git::credential(\%credential, 'approve');
			print STDERR qq(Logged in mediawiki user "$credential{username}".\n);
		} else {
			print STDERR qq(Failed to log in mediawiki user "$credential{username}" on ${url}\n);
			print STDERR ' (error ' .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			Git::credential(\%credential, 'reject');
			exit 1;
		}
	}
	return;
}

sub fatal_mw_error {
	my $action = shift;
	print STDERR "fatal: could not $action.\n";
	print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
	if ($url =~ /^https/) {
		print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
		print STDERR "fatal: and the SSL certificate is correct.\n";
	} else {
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
	}
	print STDERR "fatal: (error " .
		$mediawiki->{error}->{code} . ': ' .
		$mediawiki->{error}->{details} . ")\n";
	exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
	return;
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last_page = 50;
		if ($#some_pages < $last_page) {
			$last_page = $#some_pages;
		}
		my @slice = @some_pages[0..$last_page];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
	return;
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:${category}";
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details} . "\n";
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("get the list of wiki pages");
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
	return;
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("query the list of wiki pages");
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print STDERR "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print STDERR (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || 'encoding(UTF-8)');
	open(my $git, "-|:${encoding}", "git ${args}")
	    or die "Unable to fork: $!\n";
	my $res = do {
		local $/ = undef;
		<$git>
	};
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Get the list of media file pages from the API; they live in a
	# different namespace, and only one namespace can be queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id('File'),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
	return;
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map { $_->{title} } values(%{$pages});

	# The query is split in small batches because of the MW API limit of
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id('File'),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles
				    = map { $_->{title} } @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles
				    = map { $_->{title} } @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
	return;
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Check whether a media file with the given timestamp exists on
	# MediaWiki. If so, download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:${filename}",
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# Mediawiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $download_url = shift;

	my $response = $mediawiki->{ua}->get($download_url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from :\n";
		print STDERR "URL: ${download_url}\n";
		print STDERR 'Server response: ' . $response->code . q{ } . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=${remotename}/mediawiki show refs/mediawiki/${remotename}/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq 'mediawiki_revision:')) {
		print STDERR 'No previous mediawiki revision found';
		$lastrevision_number = 0;
	} else {
		# Notes are formatted : mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is ${lastrevision_number}";
	}
	return $lastrevision_number;
}
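# For illustration (hypothetical revision number), the note attached to the
# tip of refs/mediawiki/<remotename>/master after importing MediaWiki
# revision 42 would read:
#
#   mediawiki_revision: 42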
# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki,
# thus avoiding a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print STDERR "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page and
	# requires the page to end with a single \n. This function right
	# trims a string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return "${string}\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s{@{[SLASH_REPLACEMENT]}}{/}g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s{/}{@{[SLASH_REPLACEMENT]}}g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf('%c', hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print STDOUT 'data ', bytes::length($content), "\n", $content;
	return;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ':raw';
	print STDOUT 'data ', bytes::length($content), "\n", $content;
	binmode STDOUT, ':encoding(UTF-8)';
	return;
}
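# For reference, literal_data() above emits the fast-import "data" command:
# a byte count followed by the raw content. For example, literal_data("hi")
# would print "data 2\nhi" (illustrative only).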
sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/${remotename}/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
	return;
}

sub mw_list {
	# MediaWiki does not have branches, we consider one branch arbitrarily
	# called master, and HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
	return;
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
	return;
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR " Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR " Found ${revnum} revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};
		print STDERR "page ${n}/", scalar(@pages), ': ', $page->{title}, "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub fe_escape_path {
	my $path = shift;
	$path =~ s/\\/\\\\/g;
	$path =~ s/"/\\"/g;
	$path =~ s/\n/\\n/g;
	return qq("${path}");
}

sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/${remotename}/master\n";
	print STDOUT "mark :${n}\n";
	print STDOUT "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/${remotename}/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT 'M 644 inline ' .
		    fe_escape_path("${title}.mw") . "\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT 'M 644 inline '
			    . fe_escape_path($mediafile{title}) . "\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT 'D ' . fe_escape_path("${title}.mw") . "\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/${remotename}/mediawiki\n";
	}
	print STDOUT "commit refs/notes/${remotename}/mediawiki\n";
	print STDOUT "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
	literal_data('Note added by git-mediawiki during import');
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/${remotename}/mediawiki^0\n";
	}
	print STDOUT "N inline :${n}\n";
	literal_data("mediawiki_revision: $commit{mw_revision}");
	print STDOUT "\n\n";
	return;
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ /^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: $_\n");
		}
	}
	return;
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs('import'));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
	return;
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq 'HEAD') {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq 'by_rev') {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq 'by_page') {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR qq(fatal: invalid fetch strategy "${fetch_strategy}".\n);
		print STDERR "Check your configuration variables remote.${remotename}.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
	return;
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map { $_->{revid} } @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}
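# Roughly speaking: by_page (the default) asks each tracked page for its new
# revisions, while by_rev walks every wiki revision since the last import and
# keeps only those touching tracked pages. As noted in the fetchStrategy
# comment above, by_rev tends to win when the wiki has many pages and fetches
# are frequent enough that each one brings only a few new revisions.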
# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid\n";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision ${pagerevid}.\n";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "${n}/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of ${page_title}\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id('File')) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for new version
		# of a file do one common commit for both file and media page.
		# Else do commit only for that page.
		print STDERR "${n}/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git('config --bool advice.pushNonFastForward');
	chomp($advice);
	if ($advice ne 'false') {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT qq(error $_[0] "non-fast-forward"\n);
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:${complete_file_name}";
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "${complete_file_name} is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob ${new_sha1}", 'raw');
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"${url}/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details} . "\n";
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: ${new_sha1} - ${complete_file_name}.\n";
		} else {
			print STDERR "Empty file ${complete_file_name} not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq 'mw') {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id('File') && (!$export_media)) {
			print STDERR "Ignoring media file related page: ${complete_file_name}\n";
			return ($oldrevid, 'ok');
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob ${new_sha1}");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, 'non-fast-forward');
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} . "\n";
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: ${new_sha1} - ${title}\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print STDERR "Ignoring media file ${title}\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, 'ok');
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs('push'));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
		    or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error ${remote} cannot delete\n";
			next;
		}
		if ($remote ne 'refs/heads/master') {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error ${remote} only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR " git pull --rebase\n";
		print STDERR "\n";
	}
	return;
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse ${local} 2>/dev/null");
	chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/${remotename}/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print STDERR "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents ${local} ^${parsed_sha1}"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(/ /, $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif ($line !~ /^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: ${line}\n";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				printf STDERR "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children ${local}");
		my @history = split(/\n/, $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/[ \n]/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z ${sha1_child} ${sha1_commit}");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git(qq(log --no-walk --format="%s" ${sha1_commit}));
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq 'non-fast-forward') {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne 'ok') {
				die("Unknown error from mw_push_file()\n");
			}
		}
		unless ($dumb_push) {
			run_git(qq(notes --ref=${remotename}/mediawiki add -f -m "mediawiki_revision: ${mw_revision}" ${sha1_commit}));
			run_git(qq(update-ref -m "Git-MediaWiki push" refs/mediawiki/${remotename}/master ${sha1_commit} ${sha1_child}));
		}
	}

	print STDOUT "ok ${remote}\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map { $_->{ext} } @{$result->{query}->{fileextensions}};
	my %hashFile = map { $_ => 1 } @file_extensions;

	return %hashFile;
}

# In memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/\n/,
				 run_git("config --get-all remote.${remotename}.namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace ${name} not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. french Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	unless (defined $ns) {
		print STDERR "No such namespace ${name} on MediaWiki.\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as special value for nonexistent namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git(qq(config --add remote.${remotename}.namespaceCache "${name}:${store_id}"));
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	my $namespace = shift;
	if ($namespace =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}
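# Typical usage (hypothetical URL): once this script is installed in the PATH
# as git-remote-mediawiki, a wiki can be cloned with
#
#   git clone mediawiki::http://example.com/wiki
#
# and subsequent git fetch/pull/push on that clone go through this helper.
# See https://github.com/moy/Git-Mediawiki/ for full documentation.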