#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

use URI::Escape;

# MediaWiki filenames can contain forward slashes. This constant decides by which pattern they should be replaced
use constant SLASH_REPLACEMENT => '%2F';

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => '0000000000000000000000000000000000000000';

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

use constant EMPTY => q{};

# Number of pages taken into account at once in submodule get_mw_page_list
use constant SLICE_SIZE => 50;

# Number of linked mediafiles to get at once in get_linked_mediafiles
# The query is split in small batches because of the MW API limit of
# the number of links to be returned (500 links max).
use constant BATCH_SIZE => 10;

use constant HTTP_CODE_OK => 200;

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote.${remotename}.pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.${remotename}.categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport");
chomp($import_media);
$import_media = ($import_media eq 'true');

# Export media files on push
my $export_media = run_git("config --get --bool remote.${remotename}.mediaexport");
chomp($export_media);
$export_media = !($export_media eq 'false');

my $wiki_login = run_git("config --get remote.${remotename}.mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote.${remotename}.mwPassword");
my $wiki_domain = run_git("config --get remote.${remotename}.mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only the last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote.${remotename}.shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq 'true');
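
# For illustration only: the options read above are plain git config keys, so
# a hypothetical remote named "origin" tracking two pages and one category,
# with media import and shallow import enabled, could be set up with:
#
#   git config remote.origin.pages 'Main_Page Project_News'
#   git config remote.origin.categories 'Releases'
#   git config remote.origin.mediaimport true
#   git config remote.origin.shallow true
#
# (page names use _ instead of spaces, as noted above; the remote name and
# page names here are just examples).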

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.${remotename}.fetchStrategy");
if (!$fetch_strategy) {
    $fetch_strategy = run_git('config --get mediawiki.fetchStrategy');
}
chomp($fetch_strategy);
if (!$fetch_strategy) {
    $fetch_strategy = 'by_page';
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.${remotename}.dumbPush");
if (!$dumb_push) {
    $dumb_push = run_git('config --get --bool mediawiki.dumbPush');
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq 'true');

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove the user
# and the '@' sign, to avoid an author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
while (<STDIN>) {
    chomp;

    if (!parse_command($_)) {
        last;
    }

    BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
                     # command is fully processed.
}

########################## Functions ##############################

sub parse_command {
    my ($line) = @_;
    my @cmd = split(/ /, $line);
    if (!defined $cmd[0]) {
        return 0;
    }
    if ($cmd[0] eq 'capabilities') {
        die("Too many arguments for capabilities\n")
            if (defined($cmd[1]));
        mw_capabilities();
    } elsif ($cmd[0] eq 'list') {
        die("Too many arguments for list\n") if (defined($cmd[2]));
        mw_list($cmd[1]);
    } elsif ($cmd[0] eq 'import') {
        die("Invalid arguments for import\n")
            if ($cmd[1] eq EMPTY || defined($cmd[2]));
        mw_import($cmd[1]);
    } elsif ($cmd[0] eq 'option') {
        die("Too many arguments for option\n")
            if ($cmd[1] eq EMPTY || $cmd[2] eq EMPTY || defined($cmd[3]));
        mw_option($cmd[1], $cmd[2]);
    } elsif ($cmd[0] eq 'push') {
        mw_push($cmd[1]);
    } else {
        print {*STDERR} "Unknown command. Aborting...\n";
        return 0;
    }
    return 1;
}
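
# For reference, a rough sketch of the exchange Git has with this helper on
# its standard input/output during a fetch (the exact sequence is driven by
# Git itself and may differ):
#
#   capabilities              ->  refspec refs/heads/*:refs/mediawiki/<remote>/*
#                                 import
#                                 list
#                                 push
#   list                      ->  ? refs/heads/master
#                                 @refs/heads/master HEAD
#   import refs/heads/master  ->  a git fast-import stream on stdout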

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
    if ($mediawiki) {
        return;
    }
    $mediawiki = MediaWiki::API->new;
    $mediawiki->{config}->{api_url} = "${url}/api.php";
    if ($wiki_login) {
        my %credential = (
            'url' => $url,
            'username' => $wiki_login,
            'password' => $wiki_passwd
        );
        Git::credential(\%credential);
        my $request = {lgname => $credential{username},
                       lgpassword => $credential{password},
                       lgdomain => $wiki_domain};
        if ($mediawiki->login($request)) {
            Git::credential(\%credential, 'approve');
            print {*STDERR} qq(Logged in mediawiki user "$credential{username}".\n);
        } else {
            print {*STDERR} qq(Failed to log in mediawiki user "$credential{username}" on ${url}\n);
            print {*STDERR} '  (error ' .
                $mediawiki->{error}->{code} . ': ' .
                $mediawiki->{error}->{details} . ")\n";
            Git::credential(\%credential, 'reject');
            exit 1;
        }
    }
    return;
}

sub fatal_mw_error {
    my $action = shift;
    print STDERR "fatal: could not $action.\n";
    print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
    if ($url =~ /^https/) {
        print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
        print STDERR "fatal: and the SSL certificate is correct.\n";
    } else {
        print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
    }
    print STDERR "fatal: (error " .
        $mediawiki->{error}->{code} . ': ' .
        $mediawiki->{error}->{details} . ")\n";
    exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
    my $pages = shift;
    get_mw_page_list(\@tracked_pages, $pages);
    return;
}

sub get_mw_page_list {
    my $page_list = shift;
    my $pages = shift;
    my @some_pages = @$page_list;
    while (@some_pages) {
        my $last_page = SLICE_SIZE;
        if ($#some_pages < $last_page) {
            $last_page = $#some_pages;
        }
        my @slice = @some_pages[0..$last_page];
        get_mw_first_pages(\@slice, $pages);
        @some_pages = @some_pages[(SLICE_SIZE + 1)..$#some_pages];
    }
    return;
}

sub get_mw_tracked_categories {
    my $pages = shift;
    foreach my $category (@tracked_categories) {
        if (index($category, ':') < 0) {
            # MediaWiki requires the Category
            # prefix, but let's not force the user
            # to specify it.
            $category = "Category:${category}";
        }
        my $mw_pages = $mediawiki->list( {
            action => 'query',
            list => 'categorymembers',
            cmtitle => $category,
            cmlimit => 'max' } )
            || die $mediawiki->{error}->{code} . ': '
                . $mediawiki->{error}->{details} . "\n";
        foreach my $page (@{$mw_pages}) {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

sub get_mw_all_pages {
    my $pages = shift;
    # No user-provided list, get the list of pages from the API.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("get the list of wiki pages");
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of the page list.
sub get_mw_first_pages {
    my $some_pages = shift;
    my @some_pages = @{$some_pages};

    my $pages = shift;

    # pattern 'page1|page2|...' required by the API
    my $titles = join('|', @some_pages);

    my $mw_pages = $mediawiki->api({
        action => 'query',
        titles => $titles,
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("query the list of wiki pages");
    }
    while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
        if ($id < 0) {
            print {*STDERR} "Warning: page $page->{title} not found on wiki\n";
        } else {
            $pages->{$page->{title}} = $page;
        }
    }
    return;
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
    mw_connect_maybe();

    print {*STDERR} "Listing pages on remote wiki...\n";

    my %pages; # hash on page titles to avoid duplicates
    my $user_defined;
    if (@tracked_pages) {
        $user_defined = 1;
        # The user provided a list of page titles, but we
        # still need to query the API to get the page IDs.
        get_mw_tracked_pages(\%pages);
    }
    if (@tracked_categories) {
        $user_defined = 1;
        get_mw_tracked_categories(\%pages);
    }
    if (!$user_defined) {
        get_mw_all_pages(\%pages);
    }
    if ($import_media) {
        print {*STDERR} "Getting media files for selected pages...\n";
        if ($user_defined) {
            get_linked_mediafiles(\%pages);
        } else {
            get_all_mediafiles(\%pages);
        }
    }
    print {*STDERR} (scalar keys %pages) . " pages found.\n";
    return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
    my $args = shift;
    my $encoding = (shift || 'encoding(UTF-8)');
    open(my $git, "-|:${encoding}", "git ${args}")
        or die "Unable to fork: $!\n";
    my $res = do {
        local $/ = undef;
        <$git>
    };
    close($git);

    return $res;
}


sub get_all_mediafiles {
    my $pages = shift;
    # Attach the list of all pages for media files from the API;
    # they are in a different namespace, and only one namespace
    # can be queried at a time.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        apnamespace => get_mw_namespace_id('File'),
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        print {*STDERR} "fatal: could not get the list of pages for media files.\n";
        print {*STDERR} "fatal: '$url' does not appear to be a mediawiki\n";
        print {*STDERR} "fatal: make sure '$url/api.php' is a valid page.\n";
        exit 1;
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
    return;
}

sub get_linked_mediafiles {
    my $pages = shift;
    my @titles = map { $_->{title} } values(%{$pages});

    my $batch = BATCH_SIZE;
    while (@titles) {
        if ($#titles < $batch) {
            $batch = $#titles;
        }
        my @slice = @titles[0..$batch];

        # pattern 'page1|page2|...' required by the API
        my $mw_titles = join('|', @slice);

        # Media files could be included or linked from
        # a page, get all related
        my $query = {
            action => 'query',
            prop => 'links|images',
            titles => $mw_titles,
            plnamespace => get_mw_namespace_id('File'),
            pllimit => 'max'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
            my @media_titles;
            if (defined($page->{links})) {
                my @link_titles
                    = map { $_->{title} } @{$page->{links}};
                push(@media_titles, @link_titles);
            }
            if (defined($page->{images})) {
                my @image_titles
                    = map { $_->{title} } @{$page->{images}};
                push(@media_titles, @image_titles);
            }
            if (@media_titles) {
                get_mw_page_list(\@media_titles, $pages);
            }
        }

        @titles = @titles[($batch + 1)..$#titles];
    }
    return;
}

sub get_mw_mediafile_for_page_revision {
    # Name of the file on the wiki, with the prefix.
    my $filename = shift;
    my $timestamp = shift;
    my %mediafile;

    # Search whether a media file with the given timestamp exists on
    # MediaWiki. In that case, download the file.
    my $query = {
        action => 'query',
        prop => 'imageinfo',
        titles => "File:${filename}",
        iistart => $timestamp,
        iiend => $timestamp,
        iiprop => 'timestamp|archivename|url',
        iilimit => 1
    };
    my $result = $mediawiki->api($query);

    my ($fileid, $file) = each( %{$result->{query}->{pages}} );
    # If not defined it means there is no revision of the file for
    # the given timestamp.
    if (defined($file->{imageinfo})) {
        $mediafile{title} = $filename;

        my $fileinfo = pop(@{$file->{imageinfo}});
        $mediafile{timestamp} = $fileinfo->{timestamp};
        # Mediawiki::API's download function doesn't support https URLs
        # and can't download old versions of files.
        print {*STDERR} "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
        $mediafile{content} = download_mw_mediafile($fileinfo->{url});
    }
    return %mediafile;
}

sub download_mw_mediafile {
    my $download_url = shift;

    my $response = $mediawiki->{ua}->get($download_url);
    if ($response->code == HTTP_CODE_OK) {
        return $response->decoded_content;
    } else {
        print {*STDERR} "Error downloading mediafile from:\n";
        print {*STDERR} "URL: ${download_url}\n";
        print {*STDERR} 'Server response: ' . $response->code . q{ } . $response->message . "\n";
        exit 1;
    }
}

sub get_last_local_revision {
    # Get note regarding the last mediawiki revision
    my $note = run_git("notes --ref=${remotename}/mediawiki show refs/mediawiki/${remotename}/master 2>/dev/null");
    my @note_info = split(/ /, $note);

    my $lastrevision_number;
    if (!(defined($note_info[0]) && $note_info[0] eq 'mediawiki_revision:')) {
        print {*STDERR} 'No previous mediawiki revision found';
        $lastrevision_number = 0;
    } else {
        # Notes are formatted: mediawiki_revision: #number
        $lastrevision_number = $note_info[1];
        chomp($lastrevision_number);
        print {*STDERR} "Last local mediawiki revision found is ${lastrevision_number}";
    }
    return $lastrevision_number;
}
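
# For illustration only: the note parsed above lives on the ref
# refs/notes/<remotename>/mediawiki and can be inspected manually with
# something like
#
#   git notes --ref=origin/mediawiki show refs/mediawiki/origin/master
#
# which is expected to print a single line such as "mediawiki_revision: 42"
# (the remote name "origin" and the revision number are only examples).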

# Get the last remote revision without taking into account which pages are
# tracked or not. This function makes a single request to the wiki, thus
# avoiding a loop over all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        list => 'recentchanges',
        prop => 'revisions',
        rclimit => '1',
        rcdir => 'older',
    };
    my $result = $mediawiki->api($query);
    return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
    mw_connect_maybe();

    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my $max_rev_num = 0;

    print {*STDERR} "Getting last revision id on tracked pages...\n";

    foreach my $page (@pages) {
        my $id = $page->{pageid};

        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'ids|timestamp',
            pageids => $id,
        };

        my $result = $mediawiki->api($query);

        my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

        $basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

        $max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
    }

    print {*STDERR} "Last remote revision found is $max_rev_num.\n";
    return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
    my $string = shift;
    my $page_created = shift;
    # MediaWiki does not allow blank space at the end of a page, and a page
    # must end with a single \n. This function right-trims the string and
    # adds a \n at the end to follow this rule.
    $string =~ s/\s+$//;
    if ($string eq EMPTY && $page_created) {
        # Creating empty pages is forbidden.
        $string = EMPTY_CONTENT;
    }
    return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
    my $string = shift;
    if ($string eq EMPTY_CONTENT) {
        $string = EMPTY;
    }
    # This \n is important. This is due to mediawiki's way to handle end of files.
    return "${string}\n";
}

sub mediawiki_clean_filename {
    my $filename = shift;
    $filename =~ s{@{[SLASH_REPLACEMENT]}}{/}g;
    # [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
    # Do a variant of URL-encoding, i.e. it looks like URL-encoding,
    # but with _ added to prevent MediaWiki from thinking this is
    # an actual special character.
    $filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
    # If we use the uri escape before
    # we should unescape here, before anything

    return $filename;
}

sub mediawiki_smudge_filename {
    my $filename = shift;
    $filename =~ s{/}{@{[SLASH_REPLACEMENT]}}g;
    $filename =~ s/ /_/g;
    # Decode forbidden characters encoded in mediawiki_clean_filename
    $filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf('%c', hex($1))/ge;
    return $filename;
}

sub literal_data {
    my ($content) = @_;
    print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
    return;
}

sub literal_data_raw {
    # Output possibly binary content.
    my ($content) = @_;
    # Avoid confusion between size in bytes and in characters
    utf8::downgrade($content);
    binmode STDOUT, ':raw';
    print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
    binmode STDOUT, ':encoding(UTF-8)';
    return;
}

sub mw_capabilities {
    # Revisions are imported to the private namespace
    # refs/mediawiki/$remotename/ by the helper and fetched into
    # refs/remotes/$remotename later by fetch.
    print {*STDOUT} "refspec refs/heads/*:refs/mediawiki/${remotename}/*\n";
    print {*STDOUT} "import\n";
    print {*STDOUT} "list\n";
    print {*STDOUT} "push\n";
    print {*STDOUT} "\n";
    return;
}

sub mw_list {
    # MediaWiki does not have branches, so we consider one branch arbitrarily
    # called master, with HEAD pointing to it.
    print {*STDOUT} "? refs/heads/master\n";
    print {*STDOUT} "\@refs/heads/master HEAD\n";
    print {*STDOUT} "\n";
    return;
}

sub mw_option {
    print {*STDERR} "remote-helper command 'option $_[0]' not yet implemented\n";
    print {*STDOUT} "unsupported\n";
    return;
}

sub fetch_mw_revisions_for_page {
    my $page = shift;
    my $id = shift;
    my $fetch_from = shift;
    my @page_revs = ();
    my $query = {
        action => 'query',
        prop => 'revisions',
        rvprop => 'ids',
        rvdir => 'newer',
        rvstartid => $fetch_from,
        rvlimit => 500,
        pageids => $id,
    };

    my $revnum = 0;
    # Get 500 revisions at a time due to the mediawiki api limit
    while (1) {
        my $result = $mediawiki->api($query);

        # Parse each of those 500 revisions
        foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
            my $page_rev_ids;
            $page_rev_ids->{pageid} = $page->{pageid};
            $page_rev_ids->{revid} = $revision->{revid};
            push(@page_revs, $page_rev_ids);
            $revnum++;
        }
        last if (!$result->{'query-continue'});
        $query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
    }
    if ($shallow_import && @page_revs) {
        print {*STDERR} "  Found 1 revision (shallow import).\n";
        @page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
        return $page_revs[0];
    }
    print {*STDERR} "  Found ${revnum} revision(s).\n";
    return @page_revs;
}

sub fetch_mw_revisions {
    my $pages = shift; my @pages = @{$pages};
    my $fetch_from = shift;

    my @revisions = ();
    my $n = 1;
    foreach my $page (@pages) {
        my $id = $page->{pageid};
        print {*STDERR} "page ${n}/", scalar(@pages), ': ', $page->{title}, "\n";
        $n++;
        my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
        @revisions = (@page_revs, @revisions);
    }

    return ($n, @revisions);
}

sub fe_escape_path {
    my $path = shift;
    $path =~ s/\\/\\\\/g;
    $path =~ s/"/\\"/g;
    $path =~ s/\n/\\n/g;
    return qq("${path}");
}

sub import_file_revision {
    my $commit = shift;
    my %commit = %{$commit};
    my $full_import = shift;
    my $n = shift;
    my $mediafile = shift;
    my %mediafile;
    if ($mediafile) {
        %mediafile = %{$mediafile};
    }

    my $title = $commit{title};
    my $comment = $commit{comment};
    my $content = $commit{content};
    my $author = $commit{author};
    my $date = $commit{date};

    print {*STDOUT} "commit refs/mediawiki/${remotename}/master\n";
    print {*STDOUT} "mark :${n}\n";
    print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
    literal_data($comment);

    # If it's not a clone, we need to know where to start from
    if (!$full_import && $n == 1) {
        print {*STDOUT} "from refs/mediawiki/${remotename}/master^0\n";
    }
    if ($content ne DELETED_CONTENT) {
        print {*STDOUT} 'M 644 inline ' .
            fe_escape_path("${title}.mw") . "\n";
        literal_data($content);
        if (%mediafile) {
            print {*STDOUT} 'M 644 inline '
                . fe_escape_path($mediafile{title}) . "\n";
            literal_data_raw($mediafile{content});
        }
        print {*STDOUT} "\n\n";
    } else {
        print {*STDOUT} 'D ' .
            fe_escape_path("${title}.mw") . "\n";
    }

    # mediawiki revision number in the git note
    if ($full_import && $n == 1) {
        print {*STDOUT} "reset refs/notes/${remotename}/mediawiki\n";
    }
    print {*STDOUT} "commit refs/notes/${remotename}/mediawiki\n";
    print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
    literal_data('Note added by git-mediawiki during import');
    if (!$full_import && $n == 1) {
        print {*STDOUT} "from refs/notes/${remotename}/mediawiki^0\n";
    }
    print {*STDOUT} "N inline :${n}\n";
    literal_data("mediawiki_revision: $commit{mw_revision}");
    print {*STDOUT} "\n\n";
    return;
}
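
# For reference, the fast-import stream emitted above for a single imported
# revision looks roughly like this (names, dates and lengths are only
# illustrative):
#
#   commit refs/mediawiki/<remotename>/master
#   mark :1
#   committer SomeUser <SomeUser@wiki.example.org> 1234567890 +0000
#   data <length of the edit summary>
#   <edit summary>
#   M 644 inline "Some_Page.mw"
#   data <length of the page content>
#   <page content>
#
# followed by a similar commit on refs/notes/<remotename>/mediawiki carrying
# the "mediawiki_revision: <id>" note.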
fe_escape_path("${title}.mw") ."\n"; 755} 756 757# mediawiki revision number in the git note 758if($full_import&&$n==1) { 759print{*STDOUT}"reset refs/notes/${remotename}/mediawiki\n"; 760} 761print{*STDOUT}"commit refs/notes/${remotename}/mediawiki\n"; 762print{*STDOUT}"committer ${author} <${author}\@${wiki_name}> ".$date->epoch." +0000\n"; 763 literal_data('Note added by git-mediawiki during import'); 764if(!$full_import&&$n==1) { 765print{*STDOUT}"from refs/notes/${remotename}/mediawiki^0\n"; 766} 767print{*STDOUT}"N inline :${n}\n"; 768 literal_data("mediawiki_revision:$commit{mw_revision}"); 769print{*STDOUT}"\n\n"; 770return; 771} 772 773# parse a sequence of 774# <cmd> <arg1> 775# <cmd> <arg2> 776# \n 777# (like batch sequence of import and sequence of push statements) 778sub get_more_refs { 779my$cmd=shift; 780my@refs; 781while(1) { 782my$line= <STDIN>; 783if($line=~/^$cmd (.*)$/) { 784push(@refs,$1); 785}elsif($lineeq"\n") { 786return@refs; 787}else{ 788die("Invalid command in a '$cmd' batch:$_\n"); 789} 790} 791return; 792} 793 794sub mw_import { 795# multiple import commands can follow each other. 796my@refs= (shift, get_more_refs('import')); 797foreachmy$ref(@refs) { 798 mw_import_ref($ref); 799} 800print{*STDOUT}"done\n"; 801return; 802} 803 804sub mw_import_ref { 805my$ref=shift; 806# The remote helper will call "import HEAD" and 807# "import refs/heads/master". 808# Since HEAD is a symbolic ref to master (by convention, 809# followed by the output of the command "list" that we gave), 810# we don't need to do anything in this case. 811if($refeq'HEAD') { 812return; 813} 814 815 mw_connect_maybe(); 816 817print{*STDERR}"Searching revisions...\n"; 818my$last_local= get_last_local_revision(); 819my$fetch_from=$last_local+1; 820if($fetch_from==1) { 821print{*STDERR}", fetching from beginning.\n"; 822}else{ 823print{*STDERR}", fetching from here.\n"; 824} 825 826my$n=0; 827if($fetch_strategyeq'by_rev') { 828print{*STDERR}"Fetching & writing export data by revs...\n"; 829$n= mw_import_ref_by_revs($fetch_from); 830}elsif($fetch_strategyeq'by_page') { 831print{*STDERR}"Fetching & writing export data by pages...\n"; 832$n= mw_import_ref_by_pages($fetch_from); 833}else{ 834print{*STDERR}qq(fatal: invalid fetch strategy "${fetch_strategy}".\n); 835print{*STDERR}"Check your configuration variables remote.${remotename}.fetchStrategy and mediawiki.fetchStrategy\n"; 836exit1; 837} 838 839if($fetch_from==1&&$n==0) { 840print{*STDERR}"You appear to have cloned an empty MediaWiki.\n"; 841# Something has to be done remote-helper side. If nothing is done, an error is 842# thrown saying that HEAD is referring to unknown object 0000000000000000000 843# and the clone fails. 844} 845return; 846} 847 848sub mw_import_ref_by_pages { 849 850my$fetch_from=shift; 851my%pages_hash= get_mw_pages(); 852my@pages=values(%pages_hash); 853 854my($n,@revisions) = fetch_mw_revisions(\@pages,$fetch_from); 855 856@revisions=sort{$a->{revid} <=>$b->{revid}}@revisions; 857my@revision_ids=map{$_->{revid} }@revisions; 858 859return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash); 860} 861 862sub mw_import_ref_by_revs { 863 864my$fetch_from=shift; 865my%pages_hash= get_mw_pages(); 866 867my$last_remote= get_last_global_remote_rev(); 868my@revision_ids=$fetch_from..$last_remote; 869return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash); 870} 871 872# Import revisions given in second argument (array of integers). 

# Import revisions given in the second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
    my $fetch_from = shift;
    my $revision_ids = shift;
    my $pages = shift;

    my $n = 0;
    my $n_actual = 0;
    my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

    foreach my $pagerevid (@$revision_ids) {
        # Count page even if we skip it, since we display
        # $n/$total and $total includes skipped pages.
        $n++;

        # fetch the content of the pages
        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'content|timestamp|comment|user|ids',
            revids => $pagerevid,
        };

        my $result = $mediawiki->api($query);

        if (!$result) {
            die "Failed to retrieve modified page for revision $pagerevid\n";
        }

        if (defined($result->{query}->{badrevids}->{$pagerevid})) {
            # The revision id does not exist on the remote wiki.
            next;
        }

        if (!defined($result->{query}->{pages})) {
            die "Invalid revision ${pagerevid}.\n";
        }

        my @result_pages = values(%{$result->{query}->{pages}});
        my $result_page = $result_pages[0];
        my $rev = $result_pages[0]->{revisions}->[0];

        my $page_title = $result_page->{title};

        if (!exists($pages->{$page_title})) {
            print {*STDERR} "${n}/", scalar(@$revision_ids),
                ": Skipping revision #$rev->{revid} of ${page_title}\n";
            next;
        }

        $n_actual++;

        my %commit;
        $commit{author} = $rev->{user} || 'Anonymous';
        $commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
        $commit{title} = mediawiki_smudge_filename($page_title);
        $commit{mw_revision} = $rev->{revid};
        $commit{content} = mediawiki_smudge($rev->{'*'});

        if (!defined($rev->{timestamp})) {
            $last_timestamp++;
        } else {
            $last_timestamp = $rev->{timestamp};
        }
        $commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

        # Differentiate classic pages and media files.
        my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
        my %mediafile;
        if ($namespace) {
            my $id = get_mw_namespace_id($namespace);
            if ($id && $id == get_mw_namespace_id('File')) {
                %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
            }
        }
        # If this is a revision of the media page for a new version
        # of a file, do one common commit for both the file and the
        # media page. Otherwise, commit only that page.
        print {*STDERR} "${n}/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
        import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
    }

    return $n_actual;
}
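
# For reference, the query result consumed by mw_import_revids() above looks
# roughly like this (field values are illustrative; the exact shape is
# defined by the MediaWiki API for prop=revisions):
#
#   $result->{query}->{pages} = {
#       '123' => {
#           title     => 'Some page',
#           revisions => [ { revid => 42, user => 'SomeUser',
#                            timestamp => '2011-01-01T00:00:00Z',
#                            comment => 'edit summary', '*' => 'wikitext' } ],
#       },
#   };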

sub error_non_fast_forward {
    my $advice = run_git('config --bool advice.pushNonFastForward');
    chomp($advice);
    if ($advice ne 'false') {
        # Native git-push would show this after the summary.
        # We can't ask it to display it cleanly, so print it
        # ourselves before.
        print {*STDERR} "To prevent you from losing history, non-fast-forward updates were rejected\n";
        print {*STDERR} "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
        print {*STDERR} "'Note about fast-forwards' section of 'git push --help' for details.\n";
    }
    print {*STDOUT} qq(error $_[0] "non-fast-forward"\n);
    return 0;
}

sub mw_upload_file {
    my $complete_file_name = shift;
    my $new_sha1 = shift;
    my $extension = shift;
    my $file_deleted = shift;
    my $summary = shift;
    my $newrevid;
    my $path = "File:${complete_file_name}";
    my %hashFiles = get_allowed_file_extensions();
    if (!exists($hashFiles{$extension})) {
        print {*STDERR} "${complete_file_name} is not a permitted file on this wiki.\n";
        print {*STDERR} "Check the configuration of file uploads in your mediawiki.\n";
        return $newrevid;
    }
    # Deleting and uploading a file requires a privileged user
    if ($file_deleted) {
        mw_connect_maybe();
        my $query = {
            action => 'delete',
            title => $path,
            reason => $summary
        };
        if (!$mediawiki->edit($query)) {
            print {*STDERR} "Failed to delete file on remote wiki\n";
            print {*STDERR} "Check your permissions on the remote site. Error code:\n";
            print {*STDERR} $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
            exit 1;
        }
    } else {
        # Don't let perl try to interpret file content as UTF-8 => use "raw"
        my $content = run_git("cat-file blob ${new_sha1}", 'raw');
        if ($content ne EMPTY) {
            mw_connect_maybe();
            $mediawiki->{config}->{upload_url} =
                "${url}/index.php/Special:Upload";
            $mediawiki->edit({
                action => 'upload',
                filename => $complete_file_name,
                comment => $summary,
                file => [undef,
                         $complete_file_name,
                         Content => $content],
                ignorewarnings => 1,
            }, {
                skip_encoding => 1
            } ) || die $mediawiki->{error}->{code} . ':'
                . $mediawiki->{error}->{details} . "\n";
            my $last_file_page = $mediawiki->get_page({title => $path});
            $newrevid = $last_file_page->{revid};
            print {*STDERR} "Pushed file: ${new_sha1} - ${complete_file_name}.\n";
        } else {
            print {*STDERR} "Empty file ${complete_file_name} not pushed.\n";
        }
    }
    return $newrevid;
}

sub mw_push_file {
    my $diff_info = shift;
    # $diff_info contains a string in this format:
    # 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
    my @diff_info_split = split(/[ \t]/, $diff_info);

    # Filename, including .mw extension
    my $complete_file_name = shift;
    # Commit message
    my $summary = shift;
    # MediaWiki revision number. Keep the previous one by default,
    # in case there's no edit to perform.
    my $oldrevid = shift;
    my $newrevid;

    if ($summary eq EMPTY_MESSAGE) {
        $summary = EMPTY;
    }

    my $new_sha1 = $diff_info_split[3];
    my $old_sha1 = $diff_info_split[2];
    my $page_created = ($old_sha1 eq NULL_SHA1);
    my $page_deleted = ($new_sha1 eq NULL_SHA1);
    $complete_file_name = mediawiki_clean_filename($complete_file_name);

    my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
    if (!defined($extension)) {
        $extension = EMPTY;
    }
    if ($extension eq 'mw') {
        my $ns = get_mw_namespace_id_for_page($complete_file_name);
        if ($ns && $ns == get_mw_namespace_id('File') && (!$export_media)) {
            print {*STDERR} "Ignoring media file related page: ${complete_file_name}\n";
            return ($oldrevid, 'ok');
        }
        my $file_content;
        if ($page_deleted) {
            # Deleting a page usually requires
            # special privileges. A common
            # convention is to replace the page
            # with this content instead:
            $file_content = DELETED_CONTENT;
        } else {
            $file_content = run_git("cat-file blob ${new_sha1}");
        }

        mw_connect_maybe();

        my $result = $mediawiki->edit( {
            action => 'edit',
            summary => $summary,
            title => $title,
            basetimestamp => $basetimestamps{$oldrevid},
            text => mediawiki_clean($file_content, $page_created),
        }, {
            skip_encoding => 1 # Helps with names with accented characters
        });
        if (!$result) {
            if ($mediawiki->{error}->{code} == 3) {
                # edit conflicts, considered as non-fast-forward
                print {*STDERR} 'Warning: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} .
                    ".\n";
                return ($oldrevid, 'non-fast-forward');
            } else {
                # Other errors. Shouldn't happen => just die()
                die 'Fatal: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} . "\n";
            }
        }
        $newrevid = $result->{edit}->{newrevid};
        print {*STDERR} "Pushed file: ${new_sha1} - ${title}\n";
    } elsif ($export_media) {
        $newrevid = mw_upload_file($complete_file_name, $new_sha1,
                                   $extension, $page_deleted,
                                   $summary);
    } else {
        print {*STDERR} "Ignoring media file ${title}\n";
    }
    $newrevid = ($newrevid or $oldrevid);
    return ($newrevid, 'ok');
}

sub mw_push {
    # multiple push statements can follow each other
    my @refsspecs = (shift, get_more_refs('push'));
    my $pushed;
    for my $refspec (@refsspecs) {
        my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
            or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");
        if ($force) {
            print {*STDERR} "Warning: forced push not allowed on a MediaWiki.\n";
        }
        if ($local eq EMPTY) {
            print {*STDERR} "Cannot delete remote branch on a MediaWiki\n";
            print {*STDOUT} "error ${remote} cannot delete\n";
            next;
        }
        if ($remote ne 'refs/heads/master') {
            print {*STDERR} "Only push to the branch 'master' is supported on a MediaWiki\n";
            print {*STDOUT} "error ${remote} only master allowed\n";
            next;
        }
        if (mw_push_revision($local, $remote)) {
            $pushed = 1;
        }
    }

    # Notify Git that the push is done
    print {*STDOUT} "\n";

    if ($pushed && $dumb_push) {
        print {*STDERR} "Just pushed some revisions to MediaWiki.\n";
        print {*STDERR} "The pushed revisions now have to be re-imported, and your current branch\n";
        print {*STDERR} "needs to be updated with these re-imported commits. You can do this with\n";
        print {*STDERR} "\n";
        print {*STDERR} "  git pull --rebase\n";
        print {*STDERR} "\n";
    }
    return;
}

sub mw_push_revision {
    my $local = shift;
    my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
    my $last_local_revid = get_last_local_revision();
    print {*STDERR} ".\n"; # Finish sentence started by get_last_local_revision()
    my $last_remote_revid = get_last_remote_revision();
    my $mw_revision = $last_remote_revid;

    # Get sha1 of commit pointed by local HEAD
    my $HEAD_sha1 = run_git("rev-parse ${local} 2>/dev/null");
    chomp($HEAD_sha1);
    # Get sha1 of commit pointed by remotes/$remotename/master
    my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/${remotename}/master 2>/dev/null");
    chomp($remoteorigin_sha1);

    if ($last_local_revid > 0 &&
        $last_local_revid < $last_remote_revid) {
        return error_non_fast_forward($remote);
    }

    if ($HEAD_sha1 eq $remoteorigin_sha1) {
        # nothing to push
        return 0;
    }

    # Get every commit in between HEAD and refs/remotes/origin/master,
    # including HEAD and refs/remotes/origin/master
    my @commit_pairs = ();
    if ($last_local_revid > 0) {
        my $parsed_sha1 = $remoteorigin_sha1;
        # Find a path from last MediaWiki commit to pushed commit
        print {*STDERR} "Computing path from local to remote ...\n";
        my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents ${local} ^${parsed_sha1}"));
        my %local_ancestry;
        foreach my $line (@local_ancestry) {
            if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
                foreach my $parent (split(/ /, $parents)) {
                    $local_ancestry{$parent} = $child;
                }
            } elsif ($line !~ /^([a-f0-9]+)/) {
                die "Unexpected output from git rev-list: ${line}\n";
            }
        }
        while ($parsed_sha1 ne $HEAD_sha1) {
            my $child = $local_ancestry{$parsed_sha1};
            if (!$child) {
                print {*STDERR} "Cannot find a path in history from remote commit to last commit\n";
                return error_non_fast_forward($remote);
            }
            push(@commit_pairs, [$parsed_sha1, $child]);
            $parsed_sha1 = $child;
        }
    } else {
        # No remote mediawiki revision. Export the whole
        # history (linearized with --first-parent)
        print {*STDERR} "Warning: no common ancestor, pushing complete history\n";
        my $history = run_git("rev-list --first-parent --children ${local}");
        my @history = split(/\n/, $history);
        @history = @history[1..$#history];
        foreach my $line (reverse @history) {
            my @commit_info_split = split(/[ \n]/, $line);
            push(@commit_pairs, \@commit_info_split);
        }
    }

    foreach my $commit_info_split (@commit_pairs) {
        my $sha1_child = @{$commit_info_split}[0];
        my $sha1_commit = @{$commit_info_split}[1];
        my $diff_infos = run_git("diff-tree -r --raw -z ${sha1_child} ${sha1_commit}");
        # TODO: we could detect renames, and encode them with a #redirect on the wiki.
        # TODO: for now, it's just a delete+add
        my @diff_info_list = split(/\0/, $diff_infos);
        # Keep the subject line of the commit message as mediawiki comment for the revision
        my $commit_msg = run_git(qq(log --no-walk --format="%s" ${sha1_commit}));
        chomp($commit_msg);
        # Push every blob
        while (@diff_info_list) {
            my $status;
            # git diff-tree -z gives an output like
            # <metadata>\0<filename1>\0
            # <metadata>\0<filename2>\0
            # and we've split on \0.
            my $info = shift(@diff_info_list);
            my $file = shift(@diff_info_list);
            ($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
            if ($status eq 'non-fast-forward') {
                # we may already have sent part of the
                # commit to MediaWiki, but it's too
                # late to cancel it. Stop the push in
                # the middle, but still give an
                # accurate error message.
                return error_non_fast_forward($remote);
            }
            if ($status ne 'ok') {
                die("Unknown error from mw_push_file()\n");
            }
        }
        if (!$dumb_push) {
            run_git(qq(notes --ref=${remotename}/mediawiki add -f -m "mediawiki_revision: ${mw_revision}" ${sha1_commit}));
            run_git(qq(update-ref -m "Git-MediaWiki push" refs/mediawiki/${remotename}/master ${sha1_commit} ${sha1_child}));
        }
    }

    print {*STDOUT} "ok ${remote}\n";
    return 1;
}

sub get_allowed_file_extensions {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        meta => 'siteinfo',
        siprop => 'fileextensions'
    };
    my $result = $mediawiki->api($query);
    my @file_extensions = map { $_->{ext} } @{$result->{query}->{fileextensions}};
    my %hashFile = map { $_ => 1 } @file_extensions;

    return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
    mw_connect_maybe();
    my $name = shift;

    if (!exists $namespace_id{$name}) {
        # Look in the configuration file to see whether the record for that
        # namespace is already cached. Namespaces are stored in the form
        # "Name_of_namespace:Id_namespace", e.g. "File:6".
        my @temp = split(/\n/,
                         run_git("config --get-all remote.${remotename}.namespaceCache"));
        chomp(@temp);
        foreach my $ns (@temp) {
            my ($n, $id) = split(/:/, $ns);
            if ($id eq 'notANameSpace') {
                $namespace_id{$n} = {is_namespace => 0};
            } else {
                $namespace_id{$n} = {is_namespace => 1, id => $id};
            }
            $cached_mw_namespace_id{$n} = 1;
        }
    }

    if (!exists $namespace_id{$name}) {
        print {*STDERR} "Namespace ${name} not found in cache, querying the wiki ...\n";
        # NS not found => get namespace id from MW and store it in
        # the configuration file.
        my $query = {
            action => 'query',
            meta => 'siteinfo',
            siprop => 'namespaces'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
            if (defined($ns->{id}) && defined($ns->{canonical})) {
                $namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
                if ($ns->{'*'}) {
                    # alias (e.g. French Fichier: as alias for canonical File:)
                    $namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
                }
            }
        }
    }

    my $ns = $namespace_id{$name};
    my $id;

    if (!defined $ns) {
        print {*STDERR} "No such namespace ${name} on MediaWiki.\n";
        $ns = {is_namespace => 0};
        $namespace_id{$name} = $ns;
    }

    if ($ns->{is_namespace}) {
        $id = $ns->{id};
    }

    # Store "notANameSpace" as a special value for nonexistent namespaces
    my $store_id = ($id || 'notANameSpace');

    # Store explicitly requested namespaces on disk
    if (!exists $cached_mw_namespace_id{$name}) {
        run_git(qq(config --add remote.${remotename}.namespaceCache "${name}:${store_id}"));
        $cached_mw_namespace_id{$name} = 1;
    }
    return $id;
}

sub get_mw_namespace_id_for_page {
    my $namespace = shift;
    if ($namespace =~ /^([^:]*):/) {
        return get_mw_namespace_id($namespace);
    } else {
        return;
    }
}