#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# https://github.com/Bibzball/Git-Mediawiki/wiki
#
# Known limitations:
#
# - Several strategies are provided to fetch modifications from the
#   wiki, but no automatic heuristic is provided; the user has to
#   understand and choose which strategy is appropriate.
#
# - Git renames could be turned into MediaWiki renames (see TODO
#   below).
#
# - No way to import "one page, and all pages included in it".
#
# - Multiple remote MediaWikis have not been very well tested.

use strict;
use MediaWiki::API;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# MediaWiki filenames can contain forward slashes. This variable
# decides by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote.". $remotename .".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.". $remotename .".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote.". $remotename .".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote.". $remotename .".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote.". $remotename .".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote.". $remotename .".mwPassword");
my $wiki_domain = run_git("config --get remote.". $remotename .".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote.". $remotename .".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

## credential API management (generic functions)

sub credential_from_url {
	my $url = shift;
	my $parsed = URI->new($url);
	my %credential;

	if ($parsed->scheme) {
		$credential{protocol} = $parsed->scheme;
	}
	if ($parsed->host) {
		$credential{host} = $parsed->host;
	}
	if ($parsed->path) {
		$credential{path} = $parsed->path;
	}
	if ($parsed->userinfo) {
		if ($parsed->userinfo =~ /([^:]*):(.*)/) {
			$credential{username} = $1;
			$credential{password} = $2;
		} else {
			$credential{username} = $parsed->userinfo;
		}
	}

	return %credential;
}

sub credential_read {
	my %credential;
	my $reader = shift;
	my $op = shift;
	while (<$reader>) {
		my ($key, $value) = /([^=]*)=(.*)/;
		if (not defined $key) {
			die "ERROR receiving response from git credential $op:\n$_\n";
		}
		$credential{$key} = $value;
	}
	return %credential;
}

sub credential_write {
	my $credential = shift;
	my $writer = shift;
	while (my ($key, $value) = each(%$credential)) {
		if ($value) {
			print $writer "$key=$value\n";
		}
	}
}

sub credential_run {
	my $op = shift;
	my $credential = shift;
	my $pid = open2(my $reader, my $writer, "git credential $op");
	credential_write($credential, $writer);
	print $writer "\n";
	close($writer);

	if ($op eq "fill") {
		%$credential = credential_read($reader, $op);
	} else {
		if (<$reader>) {
			die "ERROR while running git credential $op:\n$_";
		}
	}
	close($reader);
	waitpid($pid, 0);
	my $child_exit_status = $? >> 8;
	if ($child_exit_status != 0) {
		die "'git credential $op' failed with code $child_exit_status.";
	}
}

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = credential_from_url($url);
		$credential{username} = $wiki_login;
		$credential{password} = $wiki_passwd;
		credential_run("fill", \%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			credential_run("approve", \%credential);
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR "  (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			credential_run("reject", \%credential);
			exit 1;
		}
	}
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# MediaWiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not query the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print STDERR "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print STDERR (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
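# Note: the command string is handed to open() as a single argument,
# so it may be interpreted by the shell; callers are expected to pass
# arguments that are already quoted (as done throughout this script,
# e.g. with \"...\" around interpolated values).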
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Attach list of all pages for media files from the API;
	# they are in a different namespace, and only one namespace
	# can be queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit on
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on the wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if a media file with the given timestamp exists on
	# MediaWiki. In that case download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined, it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# MediaWiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki,
# thus avoiding a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print STDERR "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page and ends with a single \n.
	# This function right-trims a string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, with HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR "  Found 1 revision (shallow import).\n";
		@page_revs = sort { $b->{revid} <=> $a->{revid} } (@page_revs);
		return $page_revs[0];
	}
	print STDERR "  Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

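# import_file_revision below emits a git fast-import stream on STDOUT:
# one commit on refs/mediawiki/$remotename/master carrying the page
# content (and the media file, if any), followed by a commit on
# refs/notes/$remotename/mediawiki attaching a
# "mediawiki_revision: <id>" note to the mark :$n of that commit.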
sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline $title.mw\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline $mediafile{title}\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D $title.mw\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# Parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like a batch sequence of import commands or of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $_);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is
		# done, an error is thrown saying that HEAD is referring to
		# unknown object 0000000000000000000 and the clone fails.
	}
}

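# The two fetch strategies below both end up calling mw_import_revids();
# they only differ in how the list of revision ids is built (per tracked
# page for by_page, a global revid range for by_rev).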
sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort { $a->{revid} <=> $b->{revid} } @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		# Count the page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiate classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace && get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) {
			%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Else do a commit only for that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

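# mw_push_file returns a ($newrevid, $status) pair; $status is either
# "ok" or "non-fast-forward" (when MediaWiki reports an edit conflict).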
sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
			print STDERR "Ignoring media file related page: $complete_file_name\n";
			return ($oldrevid, "ok");
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflict, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print STDERR "Ignoring media file $title\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print STDERR "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(' ', $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif (!$line =~ m/^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: $line";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				printf STDERR "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map {$_ => 1} @file_extensions;

	return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in the form
		# "Name_of_namespace:Id_namespace", e.g. "File:6".
		my @temp = split(/[\n]/, run_git("config --get-all remote."
						. $remotename .".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			$namespace_id{$n} = $id;
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# the configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = $ns->{id};
				if ($ns->{'*'}) {
					# alias (e.g. French Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = $ns->{id};
				}
			}
		}
	}

	my $id = $namespace_id{$name};

	if (defined $id) {
		# Store explicitly requested namespaces on disk
		if (!exists $cached_mw_namespace_id{$name}) {
			run_git("config --add remote." . $remotename
				. ".namespaceCache \"" . $name . ":" . $id . "\"");
			$cached_mw_namespace_id{$name} = 1;
		}
		return $id;
	} else {
		die "No such namespace $name on MediaWiki.";
	}
}

sub get_mw_namespace_id_for_page {
	if (my ($namespace) = $_[0] =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}