#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use Git::Mediawiki qw(clean_filename smudge_filename connect_maybe
			EMPTY HTTP_CODE_OK);
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

use URI::Escape;

# It's not always possible to delete pages (it may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => '0000000000000000000000000000000000000000';

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

# Number of pages taken into account at once in submodule get_mw_page_list
use constant SLICE_SIZE => 50;

# Number of linked mediafiles to get at once in get_linked_mediafiles
# The query is split in small batches because of the MW API limit of
# the number of links to be returned (500 links max).
use constant BATCH_SIZE => 10;

if (@ARGV != 2) {
	exit_error_usage();
}

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote.${remotename}.pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.${remotename}.categories"));
chomp(@tracked_categories);

# Just like @tracked_categories, but for MediaWiki namespaces.
my @tracked_namespaces = split(/[ \n]/, run_git("config --get-all remote.${remotename}.namespaces"));
for (@tracked_namespaces) { s/_/ /g; }
chomp(@tracked_namespaces);

# Import media files on pull
my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport");
chomp($import_media);
$import_media = ($import_media eq 'true');

# Export media files on push
my $export_media = run_git("config --get --bool remote.${remotename}.mediaexport");
chomp($export_media);
$export_media = !($export_media eq 'false');

my $wiki_login = run_git("config --get remote.${remotename}.mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote.${remotename}.mwPassword");
my $wiki_domain = run_git("config --get remote.${remotename}.mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote.${remotename}.shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq 'true');
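# For reference, the per-remote configuration keys read above can be set
# with "git config". An illustrative (not exhaustive) sketch for a remote
# named "origin" -- names and values below are examples, not defaults:
#
#	git config remote.origin.pages "Main_Page Some_Page"
#	git config remote.origin.categories "Some_Category"
#	git config remote.origin.namespaces "(Main) Talk"
#	git config remote.origin.mediaimport true
#	git config remote.origin.shallow false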
# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.${remotename}.fetchStrategy");
if (!$fetch_strategy) {
	$fetch_strategy = run_git('config --get mediawiki.fetchStrategy');
}
chomp($fetch_strategy);
if (!$fetch_strategy) {
	$fetch_strategy = 'by_page';
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and the mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.${remotename}.dumbPush");
if (!$dumb_push) {
	$dumb_push = run_git('config --get --bool mediawiki.dumbPush');
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq 'true');

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
while (<STDIN>) {
	chomp;

	if (!parse_command($_)) {
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}
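# For orientation, a rough sketch of the exchange with Git's remote-helper
# machinery (see parse_command() and the mw_* handlers below; commands
# arrive on stdin, answers go to stdout, each batch ends with a blank line):
#
#   capabilities              -> refspec ..., import, list, push, <blank>
#   list                      -> "? refs/heads/master", "@refs/heads/master HEAD", <blank>
#   import refs/heads/master  -> a git fast-import stream, terminated by "done"
#   push refs/heads/master:refs/heads/master -> "ok refs/heads/master", <blank>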
########################## Functions ##############################

## error handling
sub exit_error_usage {
	die "ERROR: git-remote-mediawiki module was not called with a correct number of\n" .
	    "parameters\n" .
	    "You may obtain this error because you attempted to run the git-remote-mediawiki\n" .
	    "module directly.\n" .
	    "This module can be used the following way:\n" .
	    "\tgit clone mediawiki://<address of a mediawiki>\n" .
	    "Then, use git commit, push and pull as with every normal git repository.\n";
}

sub parse_command {
	my ($line) = @_;
	my @cmd = split(/ /, $line);
	if (!defined $cmd[0]) {
		return 0;
	}
	if ($cmd[0] eq 'capabilities') {
		die("Too many arguments for capabilities\n")
		    if (defined($cmd[1]));
		mw_capabilities();
	} elsif ($cmd[0] eq 'list') {
		die("Too many arguments for list\n") if (defined($cmd[2]));
		mw_list($cmd[1]);
	} elsif ($cmd[0] eq 'import') {
		die("Invalid argument for import\n")
		    if ($cmd[1] eq EMPTY);
		die("Too many arguments for import\n")
		    if (defined($cmd[2]));
		mw_import($cmd[1]);
	} elsif ($cmd[0] eq 'option') {
		die("Invalid arguments for option\n")
		    if ($cmd[1] eq EMPTY || $cmd[2] eq EMPTY);
		die("Too many arguments for option\n")
		    if (defined($cmd[3]));
		mw_option($cmd[1], $cmd[2]);
	} elsif ($cmd[0] eq 'push') {
		mw_push($cmd[1]);
	} else {
		print {*STDERR} "Unknown command. Aborting...\n";
		return 0;
	}
	return 1;
}

# MediaWiki API instance, created lazily.
my $mediawiki;

sub fatal_mw_error {
	my $action = shift;
	print STDERR "fatal: could not $action.\n";
	print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
	if ($url =~ /^https/) {
		print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
		print STDERR "fatal: and the SSL certificate is correct.\n";
	} else {
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
	}
	print STDERR "fatal: (error " .
	    $mediawiki->{error}->{code} . ': ' .
	    $mediawiki->{error}->{details} . ")\n";
	exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
	return;
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @{$page_list};
	while (@some_pages) {
		my $last_page = SLICE_SIZE;
		if ($#some_pages < $last_page) {
			$last_page = $#some_pages;
		}
		my @slice = @some_pages[0..$last_page];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[(SLICE_SIZE + 1)..$#some_pages];
	}
	return;
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:${category}";
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details} . "\n";
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

sub get_mw_tracked_namespaces {
	my $pages = shift;
	foreach my $local_namespace (sort @tracked_namespaces) {
		my $namespace_id;
		if ($local_namespace eq "(Main)") {
			$namespace_id = 0;
		} else {
			$namespace_id = get_mw_namespace_id($local_namespace);
		}
		# virtual namespaces don't support allpages
		next if !defined($namespace_id) || $namespace_id < 0;
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'allpages',
			apnamespace => $namespace_id,
			aplimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details} . "\n";
		print {*STDERR} "$#{$mw_pages} found in namespace $local_namespace ($namespace_id)\n";
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("get the list of wiki pages");
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
	return;
}
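# Note on the slicing in get_mw_page_list() above: the MediaWiki API caps
# the number of titles accepted in a single 'titles=' query (50 for regular
# users at the time of writing), which is why get_mw_first_pages() below is
# only ever fed slices of at most SLICE_SIZE titles.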
# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("query the list of wiki pages");
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print {*STDERR} "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	$mediawiki = connect_maybe($mediawiki, $remotename, $url);

	print {*STDERR} "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (@tracked_namespaces) {
		$user_defined = 1;
		get_mw_tracked_namespaces(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print {*STDERR} "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print {*STDERR} (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || 'encoding(UTF-8)');
	open(my $git, "-|:${encoding}", "git ${args}")
	    or die "Unable to fork: $!\n";
	my $res = do {
		local $/ = undef;
		<$git>
	};
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Add all media files known to the API to the list of pages;
	# they live in a separate namespace, and only one namespace
	# can be queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id('File'),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print {*STDERR} "fatal: could not get the list of pages for media files.\n";
		print {*STDERR} "fatal: '$url' does not appear to be a mediawiki\n";
		print {*STDERR} "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
	return;
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map { $_->{title} } values(%{$pages});

	my $batch = BATCH_SIZE;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id('File'),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles
				    = map { $_->{title} } @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles
				    = map { $_->{title} } @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch + 1)..$#titles];
	}
	return;
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if a media file with the given timestamp exists on
	# MediaWiki. In that case download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:${filename}",
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# MediaWiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print {*STDERR} "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $download_url = shift;

	my $response = $mediawiki->{ua}->get($download_url);
	if ($response->code == HTTP_CODE_OK) {
		# It is tempting to return
		# $response->decoded_content({charset => "none"}), but
		# when doing so, utf8::downgrade($content) fails with
		# "Wide character in subroutine entry".
		$response->decode();
		return $response->content();
	} else {
		print {*STDERR} "Error downloading mediafile from:\n";
		print {*STDERR} "URL: ${download_url}\n";
		print {*STDERR} 'Server response: ' . $response->code . q{ } . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision.
	my $note = run_git("notes --ref=${remotename}/mediawiki show refs/mediawiki/${remotename}/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq 'mediawiki_revision:')) {
		print {*STDERR} 'No previous mediawiki revision found';
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print {*STDERR} "Last local mediawiki revision found is ${lastrevision_number}";
	}
	return $lastrevision_number;
}
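# The note read above is attached to refs/mediawiki/<remotename>/master by
# import_file_revision() further down; its content is a single line such as
# (illustrative revision number):
#
#   mediawiki_revision: 42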
# Get the last remote revision without taking into account which pages are
# tracked or not. This function makes a single request to the wiki, thus
# avoiding a loop over all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
	$mediawiki = connect_maybe($mediawiki, $remotename, $url);

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	$mediawiki = connect_maybe($mediawiki, $remotename, $url);

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print {*STDERR} "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print {*STDERR} "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page; a page ends with a single \n.
	# This function right-trims a string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq EMPTY && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string."\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = EMPTY;
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return "${string}\n";
}

sub literal_data {
	my ($content) = @_;
	print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
	return;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ':raw';
	print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
	binmode STDOUT, ':encoding(UTF-8)';
	return;
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print {*STDOUT} "refspec refs/heads/*:refs/mediawiki/${remotename}/*\n";
	print {*STDOUT} "import\n";
	print {*STDOUT} "list\n";
	print {*STDOUT} "push\n";
	if ($dumb_push) {
		print {*STDOUT} "no-private-update\n";
	}
	print {*STDOUT} "\n";
	return;
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, with HEAD pointing to it.
	print {*STDOUT} "? refs/heads/master\n";
	print {*STDOUT} "\@refs/heads/master HEAD\n";
	print {*STDOUT} "\n";
	return;
}

sub mw_option {
	print {*STDERR} "remote-helper command 'option $_[0]' not yet implemented\n";
	print {*STDOUT} "unsupported\n";
	return;
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,

		# Let MediaWiki know that we support the latest API.
		continue => '',
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}

		if ($result->{'query-continue'}) { # For legacy APIs
			$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
		} elsif ($result->{continue}) { # For newer APIs
			$query->{rvstartid} = $result->{continue}->{rvcontinue};
			$query->{continue} = $result->{continue}->{continue};
		} else {
			last;
		}
	}
	if ($shallow_import && @page_revs) {
		print {*STDERR} "  Found 1 revision (shallow import).\n";
		@page_revs = sort { $b->{revid} <=> $a->{revid} } (@page_revs);
		return $page_revs[0];
	}
	print {*STDERR} "  Found ${revnum} revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};
		print {*STDERR} "page ${n}/", scalar(@pages), ': ', $page->{title}, "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub fe_escape_path {
	my $path = shift;
	$path =~ s/\\/\\\\/g;
	$path =~ s/"/\\"/g;
	$path =~ s/\n/\\n/g;
	return qq("${path}");
}
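# import_file_revision() below writes one commit of the fast-import stream
# per wiki revision. For a regular page the emitted block looks roughly like
# this (mark, author, date and data sizes are illustrative):
#
#   commit refs/mediawiki/origin/master
#   mark :1
#   committer SomeUser <SomeUser@wiki.example.com> 1234567890 +0000
#   data 12
#   edit summary
#   M 644 inline "Some_Page.mw"
#   data 1234
#   <page content>
#
# followed by a similar commit on refs/notes/<remotename>/mediawiki that
# attaches the "mediawiki_revision: <revid>" note to the mark.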
sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print {*STDOUT} "commit refs/mediawiki/${remotename}/master\n";
	print {*STDOUT} "mark :${n}\n";
	print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print {*STDOUT} "from refs/mediawiki/${remotename}/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print {*STDOUT} 'M 644 inline ' .
		    fe_escape_path("${title}.mw") . "\n";
		literal_data($content);
		if (%mediafile) {
			print {*STDOUT} 'M 644 inline '
			    . fe_escape_path($mediafile{title}) . "\n";
			literal_data_raw($mediafile{content});
		}
		print {*STDOUT} "\n\n";
	} else {
		print {*STDOUT} 'D ' . fe_escape_path("${title}.mw") . "\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print {*STDOUT} "reset refs/notes/${remotename}/mediawiki\n";
	}
	print {*STDOUT} "commit refs/notes/${remotename}/mediawiki\n";
	print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
	literal_data('Note added by git-mediawiki during import');
	if (!$full_import && $n == 1) {
		print {*STDOUT} "from refs/notes/${remotename}/mediawiki^0\n";
	}
	print {*STDOUT} "N inline :${n}\n";
	literal_data("mediawiki_revision: $commit{mw_revision}");
	print {*STDOUT} "\n\n";
	return;
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ /^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: $line\n");
		}
	}
	return;
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs('import'));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print {*STDOUT} "done\n";
	return;
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq 'HEAD') {
		return;
	}

	$mediawiki = connect_maybe($mediawiki, $remotename, $url);

	print {*STDERR} "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print {*STDERR} ", fetching from beginning.\n";
	} else {
		print {*STDERR} ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq 'by_rev') {
		print {*STDERR} "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq 'by_page') {
		print {*STDERR} "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print {*STDERR} qq(fatal: invalid fetch strategy "${fetch_strategy}".\n);
		print {*STDERR} "Check your configuration variables remote.${remotename}.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print {*STDERR} "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
	return;
}

sub mw_import_ref_by_pages {
	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort { $a->{revid} <=> $b->{revid} } @revisions;
	my @revision_ids = map { $_->{revid} } @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {
	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@{$revision_ids}) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid\n";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision ${pagerevid}.\n";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print {*STDERR} "${n}/", scalar(@{$revision_ids}),
				": Skipping revision #$rev->{revid} of ${page_title}\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id('File')) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the media page.
		# Else do a commit only for that page.
		print {*STDERR} "${n}/", scalar(@{$revision_ids}), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git('config --bool advice.pushNonFastForward');
	chomp($advice);
	if ($advice ne 'false') {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print {*STDERR} "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print {*STDERR} "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print {*STDERR} "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print {*STDOUT} qq(error $_[0] "non-fast-forward"\n);
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:${complete_file_name}";
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print {*STDERR} "${complete_file_name} is not a permitted file on this wiki.\n";
		print {*STDERR} "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		$mediawiki = connect_maybe($mediawiki, $remotename, $url);
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print {*STDERR} "Failed to delete file on remote wiki\n";
			print {*STDERR} "Check your permissions on the remote site. Error code:\n";
			print {*STDERR} $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob ${new_sha1}", 'raw');
		if ($content ne EMPTY) {
			$mediawiki = connect_maybe($mediawiki, $remotename, $url);
			$mediawiki->{config}->{upload_url} =
				"${url}/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details} . "\n";
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print {*STDERR} "Pushed file: ${new_sha1} - ${complete_file_name}.\n";
		} else {
			print {*STDERR} "Empty file ${complete_file_name} not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = EMPTY;
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = EMPTY;
	}
	if ($extension eq 'mw') {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id('File') && (!$export_media)) {
			print {*STDERR} "Ignoring media file related page: ${complete_file_name}\n";
			return ($oldrevid, 'ok');
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob ${new_sha1}");
		}

		$mediawiki = connect_maybe($mediawiki, $remotename, $url);

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names containing accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print {*STDERR} 'Warning: Error ' .
				    $mediawiki->{error}->{code} .
				    ' from mediawiki: ' . $mediawiki->{error}->{details} .
				    ".\n";
				return ($oldrevid, 'non-fast-forward');
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
				    $mediawiki->{error}->{code} .
				    ' from mediawiki: ' . $mediawiki->{error}->{details} . "\n";
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print {*STDERR} "Pushed file: ${new_sha1} - ${title}\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print {*STDERR} "Ignoring media file ${title}\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, 'ok');
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs('push'));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
		    or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");
		if ($force) {
			print {*STDERR} "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq EMPTY) {
			print {*STDERR} "Cannot delete remote branch on a MediaWiki\n";
			print {*STDOUT} "error ${remote} cannot delete\n";
			next;
		}
		if ($remote ne 'refs/heads/master') {
			print {*STDERR} "Only push to the branch 'master' is supported on a MediaWiki\n";
			print {*STDOUT} "error ${remote} only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print {*STDOUT} "\n";

	if ($pushed && $dumb_push) {
		print {*STDERR} "Just pushed some revisions to MediaWiki.\n";
		print {*STDERR} "The pushed revisions now have to be re-imported, and your current branch\n";
		print {*STDERR} "needs to be updated with these re-imported commits. You can do this with\n";
		print {*STDERR} "\n";
		print {*STDERR} "  git pull --rebase\n";
		print {*STDERR} "\n";
	}
	return;
}
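# mw_push_revision() below does the actual work for a push, roughly:
#   1. compare the last local and remote MediaWiki revision ids and bail
#      out via error_non_fast_forward() when the wiki is ahead;
#   2. build the list of commits between refs/remotes/<remotename>/master
#      and the pushed ref (or the whole --first-parent history when there
#      is no remote revision yet);
#   3. for each commit, push every changed blob with mw_push_file();
#   4. unless dumb push is enabled, record the new wiki revision id in a
#      "mediawiki_revision:" note on the pushed commit.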
sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print {*STDERR} ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse ${local} 2>/dev/null");
	chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/${remotename}/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print {*STDERR} "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents ${local} ^${parsed_sha1}"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(/ /, $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif ($line !~ /^-?[a-f0-9]+/) {
				die "Unexpected output from git rev-list: ${line}\n";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				print {*STDERR} "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print {*STDERR} "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children ${local}");
		my @history = split(/\n/, $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/[ \n]/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z ${sha1_child} ${sha1_commit}");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as the mediawiki comment for the revision
		my $commit_msg = run_git(qq(log --no-walk --format="%s" ${sha1_commit}));
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq 'non-fast-forward') {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne 'ok') {
				die("Unknown error from mw_push_file()\n");
			}
		}
		if (!$dumb_push) {
			run_git(qq(notes --ref=${remotename}/mediawiki add -f -m "mediawiki_revision: ${mw_revision}" ${sha1_commit}));
		}
	}

	print {*STDOUT} "ok ${remote}\n";
	return 1;
}

sub get_allowed_file_extensions {
	$mediawiki = connect_maybe($mediawiki, $remotename, $url);

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map { $_->{ext} } @{$result->{query}->{fileextensions}};
	my %hashFile = map { $_ => 1 } @file_extensions;

	return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;
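# Entries in that on-disk cache are "Name_of_namespace:Id_namespace" values
# of the multi-valued config key remote.<remotename>.namespaceCache, stored
# by get_mw_namespace_id() below, e.g. (illustrative remote name):
#
#   git config --add remote.origin.namespaceCache "File:6"
#   git config --add remote.origin.namespaceCache "NoSuchNs:notANameSpace"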
# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	$mediawiki = connect_maybe($mediawiki, $remotename, $url);
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/\n/,
				 run_git("config --get-all remote.${remotename}.namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print {*STDERR} "Namespace ${name} not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. french Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	if (!defined $ns) {
		my @namespaces = map { s/ /_/g; $_; } sort keys %namespace_id;
		print {*STDERR} "No such namespace ${name} on MediaWiki, known namespaces: @namespaces\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as a special value for nonexistent namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git(qq(config --add remote.${remotename}.namespaceCache "${name}:${store_id}"));
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	my $namespace = shift;
	if ($namespace =~ /^([^:]*):/) {
		# Pass only the namespace prefix (e.g. "File"), which is
		# what get_mw_namespace_id() expects.
		return get_mw_namespace_id($1);
	} else {
		return;
	}
}