#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# https://github.com/Bibzball/Git-Mediawiki/wiki
#
# Known limitations:
#
# - Several strategies are provided to fetch modifications from the
#   wiki, but no automatic heuristics is provided, the user has
#   to understand and choose which strategy is appropriate for him.
#
# - Git renames could be turned into MediaWiki renames (see TODO
#   below)
#
# - No way to import "one page, and all pages included in it"
#
# - Multiple remote MediaWikis have not been very well tested.

use strict;
use MediaWiki::API;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# MediaWiki filenames can contain forward slashes. This variable
# decides by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Remote-helper invocation: git passes the remote name and its URL.
my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files too.
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions.
# On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

# Derive a human-readable wiki name from the URL, used in commit authors.
my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser: read the remote-helper protocol from git on STDIN,
# one command per line, until a blank line terminates the session.
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

## credential API management (generic functions)

# Read "key=value" lines from a `git credential` subprocess into a hash.
# Dies if a line does not match the expected format.
sub credential_read {
	my %credential;
	my $reader = shift;
	my $op = shift;
	while (<$reader>) {
		my ($key, $value) = /([^=]*)=(.*)/;
		if (not defined $key) {
			die "ERROR receiving response from git credential $op:\n$_\n";
		}
		$credential{$key} = $value;
	}
	return %credential;
}

# Write a credential description hash to a `git credential` subprocess.
sub credential_write {
	my $credential = shift;
	my $writer = shift;
	# url overwrites other fields, so it must come first
	print $writer "url=$credential->{url}\n" if exists $credential->{url};
	while (my ($key, $value) = each(%$credential)) {
		if (length $value && $key ne 'url') {
			print $writer "$key=$value\n";
		}
	}
}

# Run `git credential <op>` (fill/approve/reject), feeding it the given
# credential hash; on "fill", the hash is replaced with git's answer.
sub credential_run {
	my $op = shift;
	my $credential = shift;
	my $pid = open2(my $reader, my $writer, "git credential $op");
	credential_write($credential, $writer);
	print $writer "\n";
	close($writer);

	if ($op eq "fill") {
		%$credential = credential_read($reader, $op);
	} else {
		if (<$reader>) {
			die "ERROR while running git credential $op:\n$_";
		}
	}
	close($reader);
	waitpid($pid, 0);
	my $child_exit_status = $? >> 8;
	if ($child_exit_status != 0) {
		die "'git credential $op' failed with code $child_exit_status.";
	}
}

# MediaWiki API instance, created lazily.
my $mediawiki;

# Lazily create the MediaWiki::API object and, if a login is configured,
# authenticate through the git credential system. Exits on login failure.
sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = (url => $url);
		$credential{username} = $wiki_login;
		$credential{password} = $wiki_passwd;
		credential_run("fill", \%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			credential_run("approve", \%credential);
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR "  (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			credential_run("reject", \%credential);
			exit 1;
		}
	}
}

## Functions for listing pages on the remote wiki

# Resolve the user-configured page titles into page records.
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

# Query the wiki for the given list of titles, slice by slice.
# NOTE(review): each slice is @some_pages[0..50], i.e. 51 titles, while
# the advance step is 51 — the MediaWiki API caps "titles" at 50 for
# non-bot users; confirm against the target wiki. Behavior kept as-is.
sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

# Collect all member pages of each tracked category into $pages.
sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# MediaWiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not query the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	# Negative page ids mark titles that do not exist on the wiki.
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of pages titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Attach list of all pages for media files from the API,
	# they are in a different namespace, only one namespace
	# can be queried at the same moment
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# For each already-selected page, find the media files it links to or
# includes, and add them to $pages.
sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit of
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

# Look up the revision of a media file matching the given timestamp and,
# if found, download its content. Returns a %mediafile hash with
# title/timestamp/content keys, or an empty hash when no match exists.
sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if on a media file with given timestamp exists on
	# MediaWiki. In that case download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# MediaWiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

# Fetch a media file over HTTP through the API object's user agent.
# Exits on any non-200 response.
sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from :\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

# Return the last MediaWiki revision number recorded in the git notes of
# the previous import, or 0 when none is found.
sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted : mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking in account which pages are
# tracked or not. This function makes a single request to the wiki thus
# avoid a loop onto all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		# Remember the timestamp for later use as basetimestamp on push.
		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page and ends with a single \n.
	# This function right trims a string and adds a \n at the end to follow this rule
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string."\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to mediawiki's way to handle end of files.
	return $string."\n";
}

# Turn a Git-side filename back into a MediaWiki page title.
sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

# Turn a MediaWiki page title into a Git-side filename.
sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}

# Emit a fast-import "data" command for textual content.
sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki do not have branches, we consider one branch arbitrarily
	# called master, and HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

# Gather all revision ids of one page, newer than $fetch_from, following
# MediaWiki's query-continue mechanism. With shallow import enabled, only
# the most recent revision is returned.
sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR "  Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR "  Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

# Write one wiki revision (and optionally its media file) to STDOUT in
# git fast-import format, plus the git note recording the MediaWiki
# revision number.
sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline $title.mw\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline $mediafile{title}\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D $title.mw\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $_);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || '*Empty MediaWiki Message*';
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace && get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) {
			%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
		}
		# If this is a revision of the media page for new version
		# of a file do one common commit for both file and media page.
		# Else do commit only for that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

# Report a rejected non-fast-forward update to git, optionally with the
# usual advice message. Always returns 0 (push failed).
sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

# Upload (or delete) a media file on the wiki. Returns the new wiki
# revision id, or undef when nothing was uploaded.
sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

# Push one blob (wiki page or media file) to the wiki. Returns the new
# wiki revision id and a status ("ok" or "non-fast-forward").
sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accentuated characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediwiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediwiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} else {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
		    or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		while ($parsed_sha1 ne $HEAD_sha1) {
			my @commit_info = grep(/^$parsed_sha1/, split(/\n/, run_git("rev-list --children $local")));
			if (!@commit_info) {
				return error_non_fast_forward($remote);
			}
			my @commit_info_split = split(/ |\n/, $commit_info[0]);
			# $commit_info_split[1] is the sha1 of the commit to export
			# $commit_info_split[0] is the sha1 of its direct child
			push(@commit_pairs, \@commit_info_split);
			$parsed_sha1 = $commit_info_split[1];
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect rename, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) =
mw_push_file($info,$file,$commit_msg,$mw_revision);1200if($statuseq"non-fast-forward") {1201# we may already have sent part of the1202# commit to MediaWiki, but it's too1203# late to cancel it. Stop the push in1204# the middle, but still give an1205# accurate error message.1206return error_non_fast_forward($remote);1207}1208if($statusne"ok") {1209die("Unknown error from mw_push_file()");1210}1211}1212unless($dumb_push) {1213 run_git("notes --ref=$remotename/mediawikiadd -m\"mediawiki_revision:$mw_revision\"$sha1_commit");1214 run_git("update-ref -m\"Git-MediaWiki push\"refs/mediawiki/$remotename/master$sha1_commit$sha1_child");1215}1216}12171218print STDOUT "ok$remote\n";1219return1;1220}12211222sub get_allowed_file_extensions {1223 mw_connect_maybe();12241225my$query= {1226 action =>'query',1227 meta =>'siteinfo',1228 siprop =>'fileextensions'1229};1230my$result=$mediawiki->api($query);1231my@file_extensions=map$_->{ext},@{$result->{query}->{fileextensions}};1232my%hashFile=map{$_=>1}@file_extensions;12331234return%hashFile;1235}12361237# In memory cache for MediaWiki namespace ids.1238my%namespace_id;12391240# Namespaces whose id is cached in the configuration file1241# (to avoid duplicates)1242my%cached_mw_namespace_id;12431244# Return MediaWiki id for a canonical namespace name.1245# Ex.: "File", "Project".1246sub get_mw_namespace_id {1247 mw_connect_maybe();1248my$name=shift;12491250if(!exists$namespace_id{$name}) {1251# Look at configuration file, if the record for that namespace is1252# already cached. 
Namespaces are stored in form:1253# "Name_of_namespace:Id_namespace", ex.: "File:6".1254my@temp=split(/[ \n]/, run_git("config --get-all remote."1255.$remotename.".namespaceCache"));1256chomp(@temp);1257foreachmy$ns(@temp) {1258my($n,$id) =split(/:/,$ns);1259$namespace_id{$n} =$id;1260$cached_mw_namespace_id{$n} =1;1261}1262}12631264if(!exists$namespace_id{$name}) {1265print STDERR "Namespace$namenot found in cache, querying the wiki ...\n";1266# NS not found => get namespace id from MW and store it in1267# configuration file.1268my$query= {1269 action =>'query',1270 meta =>'siteinfo',1271 siprop =>'namespaces'1272};1273my$result=$mediawiki->api($query);12741275while(my($id,$ns) =each(%{$result->{query}->{namespaces}})) {1276if(defined($ns->{id}) &&defined($ns->{canonical})) {1277$namespace_id{$ns->{canonical}} =$ns->{id};1278if($ns->{'*'}) {1279# alias (e.g. french Fichier: as alias for canonical File:)1280$namespace_id{$ns->{'*'}} =$ns->{id};1281}1282}1283}1284}12851286my$id=$namespace_id{$name};12871288if(defined$id) {1289# Store explicitely requested namespaces on disk1290if(!exists$cached_mw_namespace_id{$name}) {1291 run_git("config --add remote.".$remotename1292.".namespaceCache\"".$name.":".$id."\"");1293$cached_mw_namespace_id{$name} =1;1294}1295return$id;1296}else{1297die"No such namespace$nameon MediaWiki.";1298}1299}