#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
#   https://github.com/Bibzball/Git-Mediawiki/wiki
#
# Known limitations:
#
# - Several strategies are provided to fetch modifications from the
#   wiki, but no automatic heuristic is provided; the user has to
#   understand and choose the strategy appropriate for them.
#
# - Git renames could be turned into MediaWiki renames (see TODO
#   below)
#
# - No way to import "one page, and all pages included in it"
#
# - Multiple remote MediaWikis have not been very well tested.

use strict;
use MediaWiki::API;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# MediaWiki filenames can contain forward slashes. This variable
# decides which pattern they should be replaced with.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files too.
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove the
# user and '@' sign, to avoid authors like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

## credential API management (generic functions)

sub credential_from_url {
	my $url = shift;
	my $parsed = URI->new($url);
	my %credential;

	if ($parsed->scheme) {
		$credential{protocol} = $parsed->scheme;
	}
	if ($parsed->host) {
		$credential{host} = $parsed->host;
	}
	if ($parsed->path) {
		$credential{path} = $parsed->path;
	}
	if ($parsed->userinfo) {
		if ($parsed->userinfo =~ /([^:]*):(.*)/) {
			$credential{username} = $1;
			$credential{password} = $2;
		} else {
			$credential{username} = $parsed->userinfo;
		}
	}

	return %credential;
}

sub credential_read {
	my %credential;
	my $reader = shift;
	my $op = shift;
	while (<$reader>) {
		my ($key, $value) = /([^=]*)=(.*)/;
		if (not defined $key) {
			die "ERROR receiving response from git credential $op:\n$_\n";
		}
		$credential{$key} = $value;
	}
	return %credential;
}

sub credential_write {
	my $credential = shift;
	my $writer = shift;
	while (my ($key, $value) = each(%$credential)) {
		if ($value) {
			print $writer "$key=$value\n";
		}
	}
}

sub credential_run {
	my $op = shift;
	my $credential = shift;
	my $pid = open2(my $reader, my $writer, "git credential $op");
	credential_write($credential, $writer);
	print $writer "\n";
	close($writer);

	if ($op eq "fill") {
		%$credential = credential_read($reader, $op);
	} else {
		if (<$reader>) {
			die "ERROR while running git credential $op:\n$_";
		}
	}
	close($reader);
	waitpid($pid, 0);
	my $child_exit_status = $? >> 8;
	if ($child_exit_status != 0) {
		die "'git credential $op' failed with code $child_exit_status.";
	}
}

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = credential_from_url($url);
		$credential{username} = $wiki_login;
		$credential{password} = $wiki_passwd;
		credential_run("fill", \%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			credential_run("approve", \%credential);
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR "  (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			credential_run("reject", \%credential);
			exit 1;
		}
	}
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# MediaWiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
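			# (e.g. a tracked category "Fiction" is
			# turned into "Category:Fiction" below.)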
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not query the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
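# Runs the given git command through a pipe and returns its entire
# output as a single string; e.g. run_git("rev-parse HEAD") returns
# the current commit sha1 followed by a newline (illustration only).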
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Get the list of all media file pages from the API; they
	# live in a different namespace, and only one namespace can
	# be queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit on
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if a media file with the given timestamp exists on
	# MediaWiki. In that case, download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# MediaWiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
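		# Fetch the URL directly with the underlying user agent
		# instead (see download_mw_mediafile below).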
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki,
# thus avoiding a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the
# tracked categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page; pages
	# end with a single \n. This function right-trims the string and
	# adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. something that looks like
	# URL-encoding but with _ added to prevent MediaWiki from thinking
	# this is an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches, so we consider one branch
	# arbitrarily called master, with HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR "  Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR "  Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

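# import_file_revision() emits one commit per MediaWiki revision on the
# fast-import stream that Git reads from our STDOUT. Schematically (the
# values below are purely illustrative):
#
#   commit refs/mediawiki/<remotename>/master
#   mark :1
#   committer WikiUser <WikiUser@wiki.example.com> 1234567890 +0000
#   data <length of the edit summary>
#   <edit summary>
#   M 644 inline Main_Page.mw
#   data <length of the page content>
#   <page content>
#
# followed by a second commit on refs/notes/<remotename>/mediawiki that
# attaches the note "mediawiki_revision: <revid>" to the commit above.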
sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline $title.mw\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline $mediafile{title}\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D $title.mw\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# parse a sequence of
#   <cmd> <arg1>
#   <cmd> <arg2>
#   \n
# (like a batch sequence of import commands, or a sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $line);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# following the output of the "list" command we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side.
		# If nothing is done, an error is thrown saying that
		# HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace && get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) {
			%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Else do a commit only for that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges.
			# A common convention is to replace
			# the page with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} else {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		while ($parsed_sha1 ne $HEAD_sha1) {
			my @commit_info = grep(/^$parsed_sha1/, split(/\n/, run_git("rev-list --children $local")));
			if (!@commit_info) {
				return error_non_fast_forward($remote);
			}
			my @commit_info_split = split(/ |\n/, $commit_info[0]);
			# $commit_info_split[1] is the sha1 of the commit to export
			# $commit_info_split[0] is the sha1 of its direct child
			push(@commit_pairs, \@commit_info_split);
			$parsed_sha1 = $commit_info_split[1];
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it.
				# Stop the push in the middle, but
				# still give an accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map {$_ => 1} @file_extensions;

	return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at the configuration file to see if the record for that
		# namespace is already cached. Namespaces are stored in the form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/[\n]/, run_git("config --get-all remote."
						 . $remotename . ".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			$namespace_id{$n} = $id;
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# the configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = $ns->{id};
				if ($ns->{'*'}) {
					# alias (e.g. french Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = $ns->{id};
				}
			}
		}
	}

	my $id = $namespace_id{$name};

	if (defined $id) {
		# Store explicitly requested namespaces on disk
		if (!exists $cached_mw_namespace_id{$name}) {
			run_git("config --add remote." . $remotename
				. ".namespaceCache \"" . $name . ":" . $id . "\"");
			$cached_mw_namespace_id{$name} = 1;
		}
		return $id;
	} else {
		die "No such namespace $name on MediaWiki.";
	}
}
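
# Example usage (illustrative only; the remote name 'origin', the URL
# and the page/category names are placeholders):
#
#   git clone mediawiki::http://example.com/wiki repo
#   cd repo
#   git config remote.origin.pages 'Main_Page Another_Page'
#   git config remote.origin.categories 'Fiction'
#   git config remote.origin.fetchStrategy by_rev
#   git config --bool remote.origin.mediaimport true
#   git pull --rebase
#
# These are the configuration variables read at the top of this script;
# page titles are written with '_' instead of spaces.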