#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# https://github.com/Bibzball/Git-Mediawiki/wiki
#
# Known limitations:
#
# - Several strategies are provided to fetch modifications from the
#   wiki, but no automatic heuristic is provided; the user has to
#   understand and choose which strategy is appropriate for them.
#
# - Git renames could be turned into MediaWiki renames (see TODO
#   below)
#
# - No way to import "one page, and all pages included in it"
#
# - Multiple remote MediaWikis have not been very well tested.

use strict;
use MediaWiki::API;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# MediaWiki filenames can contain forward slashes. This constant decides
# by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files too.
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only a few pages.
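# The strategy can be set per remote ("git config remote.<name>.fetchStrategy ...")
# or globally ("git config mediawiki.fetchStrategy ..."); both are read below.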
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
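	# (Because this is a BEGIN block, it actually runs once at compile
	# time, so autoflush on STDOUT is enabled for the whole session.)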
}

########################## Functions ##############################

## credential API management (generic functions)

sub credential_from_url {
	my $url = shift;
	my $parsed = URI->new($url);
	my %credential;

	if ($parsed->scheme) {
		$credential{protocol} = $parsed->scheme;
	}
	if ($parsed->host) {
		$credential{host} = $parsed->host;
	}
	if ($parsed->path) {
		$credential{path} = $parsed->path;
	}
	if ($parsed->userinfo) {
		if ($parsed->userinfo =~ /([^:]*):(.*)/) {
			$credential{username} = $1;
			$credential{password} = $2;
		} else {
			$credential{username} = $parsed->userinfo;
		}
	}

	return %credential;
}

sub credential_read {
	my %credential;
	my $reader = shift;
	my $op = shift;
	while (<$reader>) {
		my ($key, $value) = /([^=]*)=(.*)/;
		if (not defined $key) {
			die "ERROR receiving response from git credential $op:\n$_\n";
		}
		$credential{$key} = $value;
	}
	return %credential;
}

sub credential_write {
	my $credential = shift;
	my $writer = shift;
	while (my ($key, $value) = each(%$credential)) {
		if ($value) {
			print $writer "$key=$value\n";
		}
	}
}

sub credential_run {
	my $op = shift;
	my $credential = shift;
	my $pid = open2(my $reader, my $writer, "git credential $op");
	credential_write($credential, $writer);
	print $writer "\n";
	close($writer);

	if ($op eq "fill") {
		%$credential = credential_read($reader, $op);
	} else {
		if (<$reader>) {
			die "ERROR while running git credential $op:\n$_";
		}
	}
	close($reader);
	waitpid($pid, 0);
	my $child_exit_status = $? >> 8;
	if ($child_exit_status != 0) {
		die "'git credential $op' failed with code $child_exit_status.";
	}
}

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = credential_from_url($url);
		$credential{username} = $wiki_login;
		$credential{password} = $wiki_passwd;
		credential_run("fill", \%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			credential_run("approve", \%credential);
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR "  (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			credential_run("reject", \%credential);
			exit 1;
		}
	}
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# MediaWiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
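			# (e.g. a category tracked as "Foo" becomes
			# "Category:Foo" here)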
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not query the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
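# Note: the command string is handed to a piped open, so it is interpreted by
# the shell when it contains metacharacters; callers must pre-quote such
# arguments (as the namespaceCache and notes invocations below do).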
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Get the list of pages for media files from the API; they are in a
	# different namespace, and only one namespace can be queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit of
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if a media file with the given timestamp exists on
	# MediaWiki. In that case download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each(%{$result->{query}->{pages}});
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# MediaWiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
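		# Instead, fetch the exact URL returned by imageinfo with the
		# underlying LWP user agent (see download_mw_mediafile below).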
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages are
# tracked or not. This function makes a single request to the wiki, thus
# avoiding a loop over all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page; a page ends
	# with a single \n. This function right-trims the string and adds a \n
	# at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, with HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR "  Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR "  Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

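# Write one file revision to the fast-import stream on STDOUT: a commit on
# refs/mediawiki/$remotename/master carrying the page content (or its
# deletion), plus a note on refs/notes/$remotename/mediawiki recording the
# corresponding MediaWiki revision number.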
sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline $title.mw\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline $mediafile{title}\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D $title.mw\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $_);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side.
		# If nothing is done, an error is thrown saying that HEAD is
		# referring to unknown object 0000000000000000000 and the
		# clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || '*Empty MediaWiki Message*';
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiate between classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace && get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) {
			%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Otherwise, commit only that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges.
			# A common convention is to replace
			# the page with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} else {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		while ($parsed_sha1 ne $HEAD_sha1) {
			my @commit_info = grep(/^$parsed_sha1/, split(/\n/, run_git("rev-list --children $local")));
			if (!@commit_info) {
				return error_non_fast_forward($remote);
			}
			my @commit_info_split = split(/ |\n/, $commit_info[0]);
			# $commit_info_split[1] is the sha1 of the commit to export
			# $commit_info_split[0] is the sha1 of its direct child
			push(@commit_pairs, \@commit_info_split);
			$parsed_sha1 = $commit_info_split[1];
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it.
				# Stop the push in the middle, but
				# still give an accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map {$_ => 1} @file_extensions;

	return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/[ \n]/, run_git("config --get-all remote."
						  . $remotename . ".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			$namespace_id{$n} = $id;
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = $ns->{id};
				if ($ns->{'*'}) {
					# alias (e.g. French Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = $ns->{id};
				}
			}
		}
	}

	my $id = $namespace_id{$name};

	if (defined $id) {
		# Store explicitly requested namespaces on disk
		if (!exists $cached_mw_namespace_id{$name}) {
			run_git("config --add remote." . $remotename
				. ".namespaceCache \"" . $name . ":" . $id . "\"");
			$cached_mw_namespace_id{$name} = 1;
		}
		return $id;
	} else {
		die "No such namespace $name on MediaWiki.";
	}
}