1#! /usr/bin/perl 2 3# Copyright (C) 2011 4# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr> 5# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr> 6# Claire Fousse <claire.fousse@ensimag.imag.fr> 7# David Amouyal <david.amouyal@ensimag.imag.fr> 8# Matthieu Moy <matthieu.moy@grenoble-inp.fr> 9# License: GPL v2 or later 10 11# Gateway between Git and MediaWiki. 12# https://github.com/Bibzball/Git-Mediawiki/wiki 13# 14# Known limitations: 15# 16# - Poor performance in the best case: it takes forever to check 17# whether we're up-to-date (on fetch or push) or to fetch a few 18# revisions from a large wiki, because we use exclusively a 19# page-based synchronization. We could switch to a wiki-wide 20# synchronization when the synchronization involves few revisions 21# but the wiki is large. 22# 23# - Git renames could be turned into MediaWiki renames (see TODO 24# below) 25# 26# - login/password support requires the user to write the password 27# cleartext in a file (see TODO below). 28# 29# - No way to import "one page, and all pages included in it" 30# 31# - Multiple remote MediaWikis have not been very well tested. 32 33use strict; 34use MediaWiki::API; 35use DateTime::Format::ISO8601; 36 37# By default, use UTF-8 to communicate with Git and the user 38binmode STDERR,":utf8"; 39binmode STDOUT,":utf8"; 40 41use URI::Escape; 42use IPC::Open2; 43 44use warnings; 45 46# Mediawiki filenames can contain forward slashes. This variable decides by which pattern they should be replaced 47useconstant SLASH_REPLACEMENT =>"%2F"; 48 49# It's not always possible to delete pages (may require some 50# priviledges). Deleted pages are replaced with this content. 51useconstant DELETED_CONTENT =>"[[Category:Deleted]]\n"; 52 53# It's not possible to create empty pages. New empty files in Git are 54# sent with this content instead. 55useconstant EMPTY_CONTENT =>"<!-- empty page -->\n"; 56 57# used to reflect file creation or deletion in diff. 58useconstant NULL_SHA1 =>"0000000000000000000000000000000000000000"; 59 60my$remotename=$ARGV[0]; 61my$url=$ARGV[1]; 62 63# Accept both space-separated and multiple keys in config file. 64# Spaces should be written as _ anyway because we'll use chomp. 65my@tracked_pages=split(/[ \n]/, run_git("config --get-all remote.".$remotename.".pages")); 66chomp(@tracked_pages); 67 68# Just like @tracked_pages, but for MediaWiki categories. 69my@tracked_categories=split(/[ \n]/, run_git("config --get-all remote.".$remotename.".categories")); 70chomp(@tracked_categories); 71 72# Import media files too. 73my$import_media= run_git("config --get --bool remote.".$remotename.".mediaimport"); 74chomp($import_media); 75$import_media= ($import_mediaeq"true"); 76 77my$wiki_login= run_git("config --get remote.".$remotename.".mwLogin"); 78# TODO: ideally, this should be able to read from keyboard, but we're 79# inside a remote helper, so our stdin is connect to git, not to a 80# terminal. 81my$wiki_passwd= run_git("config --get remote.".$remotename.".mwPassword"); 82my$wiki_domain= run_git("config --get remote.".$remotename.".mwDomain"); 83chomp($wiki_login); 84chomp($wiki_passwd); 85chomp($wiki_domain); 86 87# Import only last revisions (both for clone and fetch) 88my$shallow_import= run_git("config --get --bool remote.".$remotename.".shallow"); 89chomp($shallow_import); 90$shallow_import= ($shallow_importeq"true"); 91 92# Dumb push: don't update notes and mediawiki ref to reflect the last push. 93# 94# Configurable with mediawiki.dumbPush, or per-remote with 95# remote.<remotename>.dumbPush. 96# 97# This means the user will have to re-import the just-pushed 98# revisions. On the other hand, this means that the Git revisions 99# corresponding to MediaWiki revisions are all imported from the wiki, 100# regardless of whether they were initially created in Git or from the 101# web interface, hence all users will get the same history (i.e. if 102# the push from Git to MediaWiki loses some information, everybody 103# will get the history with information lost). If the import is 104# deterministic, this means everybody gets the same sha1 for each 105# MediaWiki revision. 106my$dumb_push= run_git("config --get --bool remote.$remotename.dumbPush"); 107unless($dumb_push) { 108$dumb_push= run_git("config --get --bool mediawiki.dumbPush"); 109} 110chomp($dumb_push); 111$dumb_push= ($dumb_pusheq"true"); 112 113my$wiki_name=$url; 114$wiki_name=~s/[^\/]*:\/\///; 115# If URL is like http://user:password@example.com/, we clearly don't 116# want the password in $wiki_name. While we're there, also remove user 117# and '@' sign, to avoid author like MWUser@HTTPUser@host.com 118$wiki_name=~s/^.*@//; 119 120# Commands parser 121my$entry; 122my@cmd; 123while(<STDIN>) { 124chomp; 125@cmd=split(/ /); 126if(defined($cmd[0])) { 127# Line not blank 128if($cmd[0]eq"capabilities") { 129die("Too many arguments for capabilities")unless(!defined($cmd[1])); 130 mw_capabilities(); 131}elsif($cmd[0]eq"list") { 132die("Too many arguments for list")unless(!defined($cmd[2])); 133 mw_list($cmd[1]); 134}elsif($cmd[0]eq"import") { 135die("Invalid arguments for import")unless($cmd[1]ne""&& !defined($cmd[2])); 136 mw_import($cmd[1]); 137}elsif($cmd[0]eq"option") { 138die("Too many arguments for option")unless($cmd[1]ne""&&$cmd[2]ne""&& !defined($cmd[3])); 139 mw_option($cmd[1],$cmd[2]); 140}elsif($cmd[0]eq"push") { 141 mw_push($cmd[1]); 142}else{ 143print STDERR "Unknown command. Aborting...\n"; 144last; 145} 146}else{ 147# blank line: we should terminate 148last; 149} 150 151BEGIN{ $| =1}# flush STDOUT, to make sure the previous 152# command is fully processed. 153} 154 155########################## Functions ############################## 156 157## credential API management (generic functions) 158 159sub credential_from_url { 160my$url=shift; 161my$parsed= URI->new($url); 162my%credential; 163 164if($parsed->scheme) { 165$credential{protocol} =$parsed->scheme; 166} 167if($parsed->host) { 168$credential{host} =$parsed->host; 169} 170if($parsed->path) { 171$credential{path} =$parsed->path; 172} 173if($parsed->userinfo) { 174if($parsed->userinfo=~/([^:]*):(.*)/) { 175$credential{username} =$1; 176$credential{password} =$2; 177}else{ 178$credential{username} =$parsed->userinfo; 179} 180} 181 182return%credential; 183} 184 185sub credential_read { 186my%credential; 187my$reader=shift; 188my$op=shift; 189while(<$reader>) { 190my($key,$value) =/([^=]*)=(.*)/; 191if(not defined$key) { 192die"ERROR receiving response from git credential$op:\n$_\n"; 193} 194$credential{$key} =$value; 195} 196return%credential; 197} 198 199sub credential_write { 200my$credential=shift; 201my$writer=shift; 202while(my($key,$value) =each(%$credential) ) { 203if($value) { 204print$writer"$key=$value\n"; 205} 206} 207} 208 209sub credential_run { 210my$op=shift; 211my$credential=shift; 212my$pid= open2(my$reader,my$writer,"git credential$op"); 213 credential_write($credential,$writer); 214print$writer"\n"; 215close($writer); 216 217if($opeq"fill") { 218%$credential= credential_read($reader,$op); 219}else{ 220if(<$reader>) { 221die"ERROR while running git credential$op:\n$_"; 222} 223} 224close($reader); 225waitpid($pid,0); 226my$child_exit_status=$?>>8; 227if($child_exit_status!=0) { 228die"'git credential$op' failed with code$child_exit_status."; 229} 230} 231 232# MediaWiki API instance, created lazily. 233my$mediawiki; 234 235sub mw_connect_maybe { 236if($mediawiki) { 237return; 238} 239$mediawiki= MediaWiki::API->new; 240$mediawiki->{config}->{api_url} ="$url/api.php"; 241if($wiki_login) { 242my%credential= credential_from_url($url); 243$credential{username} =$wiki_login; 244$credential{password} =$wiki_passwd; 245 credential_run("fill", \%credential); 246my$request= {lgname =>$credential{username}, 247 lgpassword =>$credential{password}, 248 lgdomain =>$wiki_domain}; 249if($mediawiki->login($request)) { 250 credential_run("approve", \%credential); 251print STDERR "Logged in mediawiki user\"$credential{username}\".\n"; 252}else{ 253print STDERR "Failed to log in mediawiki user\"$credential{username}\"on$url\n"; 254print STDERR " (error ". 255$mediawiki->{error}->{code} .': '. 256$mediawiki->{error}->{details} .")\n"; 257 credential_run("reject", \%credential); 258exit1; 259} 260} 261} 262 263## Functions for listing pages on the remote wiki 264sub get_mw_tracked_pages { 265my$pages=shift; 266 get_mw_page_list(\@tracked_pages,$pages); 267} 268 269sub get_mw_page_list { 270my$page_list=shift; 271my$pages=shift; 272my@some_pages=@$page_list; 273while(@some_pages) { 274my$last=50; 275if($#some_pages<$last) { 276$last=$#some_pages; 277} 278my@slice=@some_pages[0..$last]; 279 get_mw_first_pages(\@slice,$pages); 280@some_pages=@some_pages[51..$#some_pages]; 281} 282} 283 284sub get_mw_tracked_categories { 285my$pages=shift; 286foreachmy$category(@tracked_categories) { 287if(index($category,':') <0) { 288# Mediawiki requires the Category 289# prefix, but let's not force the user 290# to specify it. 291$category="Category:".$category; 292} 293my$mw_pages=$mediawiki->list( { 294 action =>'query', 295 list =>'categorymembers', 296 cmtitle =>$category, 297 cmlimit =>'max'} ) 298||die$mediawiki->{error}->{code} .': ' 299.$mediawiki->{error}->{details}; 300foreachmy$page(@{$mw_pages}) { 301$pages->{$page->{title}} =$page; 302} 303} 304} 305 306sub get_mw_all_pages { 307my$pages=shift; 308# No user-provided list, get the list of pages from the API. 309my$mw_pages=$mediawiki->list({ 310 action =>'query', 311 list =>'allpages', 312 aplimit =>'max' 313}); 314if(!defined($mw_pages)) { 315print STDERR "fatal: could not get the list of wiki pages.\n"; 316print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 317print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 318exit1; 319} 320foreachmy$page(@{$mw_pages}) { 321$pages->{$page->{title}} =$page; 322} 323} 324 325# queries the wiki for a set of pages. Meant to be used within a loop 326# querying the wiki for slices of page list. 327sub get_mw_first_pages { 328my$some_pages=shift; 329my@some_pages= @{$some_pages}; 330 331my$pages=shift; 332 333# pattern 'page1|page2|...' required by the API 334my$titles=join('|',@some_pages); 335 336my$mw_pages=$mediawiki->api({ 337 action =>'query', 338 titles =>$titles, 339}); 340if(!defined($mw_pages)) { 341print STDERR "fatal: could not query the list of wiki pages.\n"; 342print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 343print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 344exit1; 345} 346while(my($id,$page) =each(%{$mw_pages->{query}->{pages}})) { 347if($id<0) { 348print STDERR "Warning: page$page->{title} not found on wiki\n"; 349}else{ 350$pages->{$page->{title}} =$page; 351} 352} 353} 354 355# Get the list of pages to be fetched according to configuration. 356sub get_mw_pages { 357 mw_connect_maybe(); 358 359my%pages;# hash on page titles to avoid duplicates 360my$user_defined; 361if(@tracked_pages) { 362$user_defined=1; 363# The user provided a list of pages titles, but we 364# still need to query the API to get the page IDs. 365 get_mw_tracked_pages(\%pages); 366} 367if(@tracked_categories) { 368$user_defined=1; 369 get_mw_tracked_categories(\%pages); 370} 371if(!$user_defined) { 372 get_mw_all_pages(\%pages); 373} 374if($import_media) { 375print STDERR "Getting media files for selected pages...\n"; 376if($user_defined) { 377 get_linked_mediafiles(\%pages); 378}else{ 379 get_all_mediafiles(\%pages); 380} 381} 382return%pages; 383} 384 385# usage: $out = run_git("command args"); 386# $out = run_git("command args", "raw"); # don't interpret output as UTF-8. 387sub run_git { 388my$args=shift; 389my$encoding= (shift||"encoding(UTF-8)"); 390open(my$git,"-|:$encoding","git ".$args); 391my$res=do{local$/; <$git> }; 392close($git); 393 394return$res; 395} 396 397 398sub get_all_mediafiles { 399my$pages=shift; 400# Attach list of all pages for media files from the API, 401# they are in a different namespace, only one namespace 402# can be queried at the same moment 403my$mw_pages=$mediawiki->list({ 404 action =>'query', 405 list =>'allpages', 406 apnamespace => get_mw_namespace_id("File"), 407 aplimit =>'max' 408}); 409if(!defined($mw_pages)) { 410print STDERR "fatal: could not get the list of pages for media files.\n"; 411print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 412print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 413exit1; 414} 415foreachmy$page(@{$mw_pages}) { 416$pages->{$page->{title}} =$page; 417} 418} 419 420sub get_linked_mediafiles { 421my$pages=shift; 422my@titles=map$_->{title},values(%{$pages}); 423 424# The query is split in small batches because of the MW API limit of 425# the number of links to be returned (500 links max). 426my$batch=10; 427while(@titles) { 428if($#titles<$batch) { 429$batch=$#titles; 430} 431my@slice=@titles[0..$batch]; 432 433# pattern 'page1|page2|...' required by the API 434my$mw_titles=join('|',@slice); 435 436# Media files could be included or linked from 437# a page, get all related 438my$query= { 439 action =>'query', 440 prop =>'links|images', 441 titles =>$mw_titles, 442 plnamespace => get_mw_namespace_id("File"), 443 pllimit =>'max' 444}; 445my$result=$mediawiki->api($query); 446 447while(my($id,$page) =each(%{$result->{query}->{pages}})) { 448my@media_titles; 449if(defined($page->{links})) { 450my@link_titles=map$_->{title}, @{$page->{links}}; 451push(@media_titles,@link_titles); 452} 453if(defined($page->{images})) { 454my@image_titles=map$_->{title}, @{$page->{images}}; 455push(@media_titles,@image_titles); 456} 457if(@media_titles) { 458 get_mw_page_list(\@media_titles,$pages); 459} 460} 461 462@titles=@titles[($batch+1)..$#titles]; 463} 464} 465 466sub get_mw_mediafile_for_page_revision { 467# Name of the file on Wiki, with the prefix. 468my$filename=shift; 469my$timestamp=shift; 470my%mediafile; 471 472# Search if on a media file with given timestamp exists on 473# MediaWiki. In that case download the file. 474my$query= { 475 action =>'query', 476 prop =>'imageinfo', 477 titles =>"File:".$filename, 478 iistart =>$timestamp, 479 iiend =>$timestamp, 480 iiprop =>'timestamp|archivename|url', 481 iilimit =>1 482}; 483my$result=$mediawiki->api($query); 484 485my($fileid,$file) =each( %{$result->{query}->{pages}} ); 486# If not defined it means there is no revision of the file for 487# given timestamp. 488if(defined($file->{imageinfo})) { 489$mediafile{title} =$filename; 490 491my$fileinfo=pop(@{$file->{imageinfo}}); 492$mediafile{timestamp} =$fileinfo->{timestamp}; 493# Mediawiki::API's download function doesn't support https URLs 494# and can't download old versions of files. 495print STDERR "\tDownloading file$mediafile{title}, version$mediafile{timestamp}\n"; 496$mediafile{content} = download_mw_mediafile($fileinfo->{url}); 497} 498return%mediafile; 499} 500 501sub download_mw_mediafile { 502my$url=shift; 503 504my$response=$mediawiki->{ua}->get($url); 505if($response->code==200) { 506return$response->decoded_content; 507}else{ 508print STDERR "Error downloading mediafile from :\n"; 509print STDERR "URL:$url\n"; 510print STDERR "Server response: ".$response->code." ".$response->message."\n"; 511exit1; 512} 513} 514 515sub get_last_local_revision { 516# Get note regarding last mediawiki revision 517my$note= run_git("notes --ref=$remotename/mediawikishow refs/mediawiki/$remotename/master2>/dev/null"); 518my@note_info=split(/ /,$note); 519 520my$lastrevision_number; 521if(!(defined($note_info[0]) &&$note_info[0]eq"mediawiki_revision:")) { 522print STDERR "No previous mediawiki revision found"; 523$lastrevision_number=0; 524}else{ 525# Notes are formatted : mediawiki_revision: #number 526$lastrevision_number=$note_info[1]; 527chomp($lastrevision_number); 528print STDERR "Last local mediawiki revision found is$lastrevision_number"; 529} 530return$lastrevision_number; 531} 532 533# Remember the timestamp corresponding to a revision id. 534my%basetimestamps; 535 536sub get_last_remote_revision { 537 mw_connect_maybe(); 538 539my%pages_hash= get_mw_pages(); 540my@pages=values(%pages_hash); 541 542my$max_rev_num=0; 543 544foreachmy$page(@pages) { 545my$id=$page->{pageid}; 546 547my$query= { 548 action =>'query', 549 prop =>'revisions', 550 rvprop =>'ids|timestamp', 551 pageids =>$id, 552}; 553 554my$result=$mediawiki->api($query); 555 556my$lastrev=pop(@{$result->{query}->{pages}->{$id}->{revisions}}); 557 558$basetimestamps{$lastrev->{revid}} =$lastrev->{timestamp}; 559 560$max_rev_num= ($lastrev->{revid} >$max_rev_num?$lastrev->{revid} :$max_rev_num); 561} 562 563print STDERR "Last remote revision found is$max_rev_num.\n"; 564return$max_rev_num; 565} 566 567# Clean content before sending it to MediaWiki 568sub mediawiki_clean { 569my$string=shift; 570my$page_created=shift; 571# Mediawiki does not allow blank space at the end of a page and ends with a single \n. 572# This function right trims a string and adds a \n at the end to follow this rule 573$string=~s/\s+$//; 574if($stringeq""&&$page_created) { 575# Creating empty pages is forbidden. 576$string= EMPTY_CONTENT; 577} 578return$string."\n"; 579} 580 581# Filter applied on MediaWiki data before adding them to Git 582sub mediawiki_smudge { 583my$string=shift; 584if($stringeq EMPTY_CONTENT) { 585$string=""; 586} 587# This \n is important. This is due to mediawiki's way to handle end of files. 588return$string."\n"; 589} 590 591sub mediawiki_clean_filename { 592my$filename=shift; 593$filename=~s/@{[SLASH_REPLACEMENT]}/\//g; 594# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded. 595# Do a variant of URL-encoding, i.e. looks like URL-encoding, 596# but with _ added to prevent MediaWiki from thinking this is 597# an actual special character. 598$filename=~s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge; 599# If we use the uri escape before 600# we should unescape here, before anything 601 602return$filename; 603} 604 605sub mediawiki_smudge_filename { 606my$filename=shift; 607$filename=~s/\//@{[SLASH_REPLACEMENT]}/g; 608$filename=~s/ /_/g; 609# Decode forbidden characters encoded in mediawiki_clean_filename 610$filename=~s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge; 611return$filename; 612} 613 614sub literal_data { 615my($content) =@_; 616print STDOUT "data ", bytes::length($content),"\n",$content; 617} 618 619sub literal_data_raw { 620# Output possibly binary content. 621my($content) =@_; 622# Avoid confusion between size in bytes and in characters 623 utf8::downgrade($content); 624binmode STDOUT,":raw"; 625print STDOUT "data ", bytes::length($content),"\n",$content; 626binmode STDOUT,":utf8"; 627} 628 629sub mw_capabilities { 630# Revisions are imported to the private namespace 631# refs/mediawiki/$remotename/ by the helper and fetched into 632# refs/remotes/$remotename later by fetch. 633print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n"; 634print STDOUT "import\n"; 635print STDOUT "list\n"; 636print STDOUT "push\n"; 637print STDOUT "\n"; 638} 639 640sub mw_list { 641# MediaWiki do not have branches, we consider one branch arbitrarily 642# called master, and HEAD pointing to it. 643print STDOUT "? refs/heads/master\n"; 644print STDOUT "\@refs/heads/masterHEAD\n"; 645print STDOUT "\n"; 646} 647 648sub mw_option { 649print STDERR "remote-helper command 'option$_[0]' not yet implemented\n"; 650print STDOUT "unsupported\n"; 651} 652 653sub fetch_mw_revisions_for_page { 654my$page=shift; 655my$id=shift; 656my$fetch_from=shift; 657my@page_revs= (); 658my$query= { 659 action =>'query', 660 prop =>'revisions', 661 rvprop =>'ids', 662 rvdir =>'newer', 663 rvstartid =>$fetch_from, 664 rvlimit =>500, 665 pageids =>$id, 666}; 667 668my$revnum=0; 669# Get 500 revisions at a time due to the mediawiki api limit 670while(1) { 671my$result=$mediawiki->api($query); 672 673# Parse each of those 500 revisions 674foreachmy$revision(@{$result->{query}->{pages}->{$id}->{revisions}}) { 675my$page_rev_ids; 676$page_rev_ids->{pageid} =$page->{pageid}; 677$page_rev_ids->{revid} =$revision->{revid}; 678push(@page_revs,$page_rev_ids); 679$revnum++; 680} 681last unless$result->{'query-continue'}; 682$query->{rvstartid} =$result->{'query-continue'}->{revisions}->{rvstartid}; 683} 684if($shallow_import&&@page_revs) { 685print STDERR " Found 1 revision (shallow import).\n"; 686@page_revs=sort{$b->{revid} <=>$a->{revid}} (@page_revs); 687return$page_revs[0]; 688} 689print STDERR " Found ",$revnum," revision(s).\n"; 690return@page_revs; 691} 692 693sub fetch_mw_revisions { 694my$pages=shift;my@pages= @{$pages}; 695my$fetch_from=shift; 696 697my@revisions= (); 698my$n=1; 699foreachmy$page(@pages) { 700my$id=$page->{pageid}; 701 702print STDERR "page$n/",scalar(@pages),": ".$page->{title} ."\n"; 703$n++; 704my@page_revs= fetch_mw_revisions_for_page($page,$id,$fetch_from); 705@revisions= (@page_revs,@revisions); 706} 707 708return($n,@revisions); 709} 710 711sub import_file_revision { 712my$commit=shift; 713my%commit= %{$commit}; 714my$full_import=shift; 715my$n=shift; 716my$mediafile=shift; 717my%mediafile; 718if($mediafile) { 719%mediafile= %{$mediafile}; 720} 721 722my$title=$commit{title}; 723my$comment=$commit{comment}; 724my$content=$commit{content}; 725my$author=$commit{author}; 726my$date=$commit{date}; 727 728print STDOUT "commit refs/mediawiki/$remotename/master\n"; 729print STDOUT "mark :$n\n"; 730print STDOUT "committer$author<$author\@$wiki_name> ",$date->epoch," +0000\n"; 731 literal_data($comment); 732 733# If it's not a clone, we need to know where to start from 734if(!$full_import&&$n==1) { 735print STDOUT "from refs/mediawiki/$remotename/master^0\n"; 736} 737if($contentne DELETED_CONTENT) { 738print STDOUT "M 644 inline$title.mw\n"; 739 literal_data($content); 740if(%mediafile) { 741print STDOUT "M 644 inline$mediafile{title}\n"; 742 literal_data_raw($mediafile{content}); 743} 744print STDOUT "\n\n"; 745}else{ 746print STDOUT "D$title.mw\n"; 747} 748 749# mediawiki revision number in the git note 750if($full_import&&$n==1) { 751print STDOUT "reset refs/notes/$remotename/mediawiki\n"; 752} 753print STDOUT "commit refs/notes/$remotename/mediawiki\n"; 754print STDOUT "committer$author<$author\@$wiki_name> ",$date->epoch," +0000\n"; 755 literal_data("Note added by git-mediawiki during import"); 756if(!$full_import&&$n==1) { 757print STDOUT "from refs/notes/$remotename/mediawiki^0\n"; 758} 759print STDOUT "N inline :$n\n"; 760 literal_data("mediawiki_revision: ".$commit{mw_revision}); 761print STDOUT "\n\n"; 762} 763 764# parse a sequence of 765# <cmd> <arg1> 766# <cmd> <arg2> 767# \n 768# (like batch sequence of import and sequence of push statements) 769sub get_more_refs { 770my$cmd=shift; 771my@refs; 772while(1) { 773my$line= <STDIN>; 774if($line=~m/^$cmd (.*)$/) { 775push(@refs,$1); 776}elsif($lineeq"\n") { 777return@refs; 778}else{ 779die("Invalid command in a '$cmd' batch: ".$_); 780} 781} 782} 783 784sub mw_import { 785# multiple import commands can follow each other. 786my@refs= (shift, get_more_refs("import")); 787foreachmy$ref(@refs) { 788 mw_import_ref($ref); 789} 790print STDOUT "done\n"; 791} 792 793sub mw_import_ref { 794my$ref=shift; 795# The remote helper will call "import HEAD" and 796# "import refs/heads/master". 797# Since HEAD is a symbolic ref to master (by convention, 798# followed by the output of the command "list" that we gave), 799# we don't need to do anything in this case. 800if($refeq"HEAD") { 801return; 802} 803 804 mw_connect_maybe(); 805 806my%pages_hash= get_mw_pages(); 807my@pages=values(%pages_hash); 808 809print STDERR "Searching revisions...\n"; 810my$last_local= get_last_local_revision(); 811my$fetch_from=$last_local+1; 812if($fetch_from==1) { 813print STDERR ", fetching from beginning.\n"; 814}else{ 815print STDERR ", fetching from here.\n"; 816} 817my($n,@revisions) = fetch_mw_revisions(\@pages,$fetch_from); 818 819# Creation of the fast-import stream 820print STDERR "Fetching & writing export data...\n"; 821 822@revisions=sort{$a->{revid} <=>$b->{revid}}@revisions; 823my@revision_ids=map$_->{revid},@revisions; 824 825$n=0; 826my$last_timestamp=0;# Placeholer in case $rev->timestamp is undefined 827 828foreachmy$pagerevid(@revision_ids) { 829# fetch the content of the pages 830my$query= { 831 action =>'query', 832 prop =>'revisions', 833 rvprop =>'content|timestamp|comment|user|ids', 834 revids =>$pagerevid, 835}; 836 837my$result=$mediawiki->api($query); 838 839my@result_pages=values(%{$result->{query}->{pages}}); 840my$result_page=$result_pages[0]; 841my$rev=$result_pages[0]->{revisions}->[0]; 842 843$n++; 844 845my$page_title=$result_page->{title}; 846my%commit; 847$commit{author} =$rev->{user} ||'Anonymous'; 848$commit{comment} =$rev->{comment} ||'*Empty MediaWiki Message*'; 849$commit{title} = mediawiki_smudge_filename($page_title); 850$commit{mw_revision} =$rev->{revid}; 851$commit{content} = mediawiki_smudge($rev->{'*'}); 852 853if(!defined($rev->{timestamp})) { 854$last_timestamp++; 855}else{ 856$last_timestamp=$rev->{timestamp}; 857} 858$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp); 859 860# Differentiates classic pages and media files. 861my($namespace,$filename) =$page_title=~/^([^:]*):(.*)$/; 862my%mediafile; 863if($namespace&& get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) { 864%mediafile= get_mw_mediafile_for_page_revision($filename,$rev->{timestamp}); 865} 866# If this is a revision of the media page for new version 867# of a file do one common commit for both file and media page. 868# Else do commit only for that page. 869print STDERR "$n/",scalar(@revision_ids),": Revision #$rev->{revid} of$commit{title}\n"; 870 import_file_revision(\%commit, ($fetch_from==1),$n, \%mediafile); 871} 872 873if($fetch_from==1&&$n==0) { 874print STDERR "You appear to have cloned an empty MediaWiki.\n"; 875# Something has to be done remote-helper side. If nothing is done, an error is 876# thrown saying that HEAD is refering to unknown object 0000000000000000000 877# and the clone fails. 878} 879} 880 881sub error_non_fast_forward { 882my$advice= run_git("config --bool advice.pushNonFastForward"); 883chomp($advice); 884if($advicene"false") { 885# Native git-push would show this after the summary. 886# We can't ask it to display it cleanly, so print it 887# ourselves before. 888print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n"; 889print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n"; 890print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n"; 891} 892print STDOUT "error$_[0]\"non-fast-forward\"\n"; 893return0; 894} 895 896sub mw_upload_file { 897my$complete_file_name=shift; 898my$new_sha1=shift; 899my$extension=shift; 900my$file_deleted=shift; 901my$summary=shift; 902my$newrevid; 903my$path="File:".$complete_file_name; 904my%hashFiles= get_allowed_file_extensions(); 905if(!exists($hashFiles{$extension})) { 906print STDERR "$complete_file_nameis not a permitted file on this wiki.\n"; 907print STDERR "Check the configuration of file uploads in your mediawiki.\n"; 908return$newrevid; 909} 910# Deleting and uploading a file requires a priviledged user 911if($file_deleted) { 912 mw_connect_maybe(); 913my$query= { 914 action =>'delete', 915 title =>$path, 916 reason =>$summary 917}; 918if(!$mediawiki->edit($query)) { 919print STDERR "Failed to delete file on remote wiki\n"; 920print STDERR "Check your permissions on the remote site. Error code:\n"; 921print STDERR $mediawiki->{error}->{code} .':'.$mediawiki->{error}->{details}; 922exit1; 923} 924}else{ 925# Don't let perl try to interpret file content as UTF-8 => use "raw" 926my$content= run_git("cat-file blob$new_sha1","raw"); 927if($contentne"") { 928 mw_connect_maybe(); 929$mediawiki->{config}->{upload_url} = 930"$url/index.php/Special:Upload"; 931$mediawiki->edit({ 932 action =>'upload', 933 filename =>$complete_file_name, 934 comment =>$summary, 935 file => [undef, 936$complete_file_name, 937 Content =>$content], 938 ignorewarnings =>1, 939}, { 940 skip_encoding =>1 941} ) ||die$mediawiki->{error}->{code} .':' 942.$mediawiki->{error}->{details}; 943my$last_file_page=$mediawiki->get_page({title =>$path}); 944$newrevid=$last_file_page->{revid}; 945print STDERR "Pushed file:$new_sha1-$complete_file_name.\n"; 946}else{ 947print STDERR "Empty file$complete_file_namenot pushed.\n"; 948} 949} 950return$newrevid; 951} 952 953sub mw_push_file { 954my$diff_info=shift; 955# $diff_info contains a string in this format: 956# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status> 957my@diff_info_split=split(/[ \t]/,$diff_info); 958 959# Filename, including .mw extension 960my$complete_file_name=shift; 961# Commit message 962my$summary=shift; 963# MediaWiki revision number. Keep the previous one by default, 964# in case there's no edit to perform. 965my$oldrevid=shift; 966my$newrevid; 967 968my$new_sha1=$diff_info_split[3]; 969my$old_sha1=$diff_info_split[2]; 970my$page_created= ($old_sha1eq NULL_SHA1); 971my$page_deleted= ($new_sha1eq NULL_SHA1); 972$complete_file_name= mediawiki_clean_filename($complete_file_name); 973 974my($title,$extension) =$complete_file_name=~/^(.*)\.([^\.]*)$/; 975if(!defined($extension)) { 976$extension=""; 977} 978if($extensioneq"mw") { 979my$file_content; 980if($page_deleted) { 981# Deleting a page usually requires 982# special priviledges. A common 983# convention is to replace the page 984# with this content instead: 985$file_content= DELETED_CONTENT; 986}else{ 987$file_content= run_git("cat-file blob$new_sha1"); 988} 989 990 mw_connect_maybe(); 991 992my$result=$mediawiki->edit( { 993 action =>'edit', 994 summary =>$summary, 995 title =>$title, 996 basetimestamp =>$basetimestamps{$oldrevid}, 997 text => mediawiki_clean($file_content,$page_created), 998}, { 999 skip_encoding =>1# Helps with names with accentuated characters1000});1001if(!$result) {1002if($mediawiki->{error}->{code} ==3) {1003# edit conflicts, considered as non-fast-forward1004print STDERR 'Warning: Error '.1005$mediawiki->{error}->{code} .1006' from mediwiki: '.$mediawiki->{error}->{details} .1007".\n";1008return($oldrevid,"non-fast-forward");1009}else{1010# Other errors. Shouldn't happen => just die()1011die'Fatal: Error '.1012$mediawiki->{error}->{code} .1013' from mediwiki: '.$mediawiki->{error}->{details};1014}1015}1016$newrevid=$result->{edit}->{newrevid};1017print STDERR "Pushed file:$new_sha1-$title\n";1018}else{1019$newrevid= mw_upload_file($complete_file_name,$new_sha1,1020$extension,$page_deleted,1021$summary);1022}1023$newrevid= ($newrevidor$oldrevid);1024return($newrevid,"ok");1025}10261027sub mw_push {1028# multiple push statements can follow each other1029my@refsspecs= (shift, get_more_refs("push"));1030my$pushed;1031formy$refspec(@refsspecs) {1032my($force,$local,$remote) =$refspec=~/^(\+)?([^:]*):([^:]*)$/1033or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");1034if($force) {1035print STDERR "Warning: forced push not allowed on a MediaWiki.\n";1036}1037if($localeq"") {1038print STDERR "Cannot delete remote branch on a MediaWiki\n";1039print STDOUT "error$remotecannot delete\n";1040next;1041}1042if($remotene"refs/heads/master") {1043print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";1044print STDOUT "error$remoteonly master allowed\n";1045next;1046}1047if(mw_push_revision($local,$remote)) {1048$pushed=1;1049}1050}10511052# Notify Git that the push is done1053print STDOUT "\n";10541055if($pushed&&$dumb_push) {1056print STDERR "Just pushed some revisions to MediaWiki.\n";1057print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";1058print STDERR "needs to be updated with these re-imported commits. You can do this with\n";1059print STDERR "\n";1060print STDERR " git pull --rebase\n";1061print STDERR "\n";1062}1063}10641065sub mw_push_revision {1066my$local=shift;1067my$remote=shift;# actually, this has to be "refs/heads/master" at this point.1068my$last_local_revid= get_last_local_revision();1069print STDERR ".\n";# Finish sentence started by get_last_local_revision()1070my$last_remote_revid= get_last_remote_revision();1071my$mw_revision=$last_remote_revid;10721073# Get sha1 of commit pointed by local HEAD1074my$HEAD_sha1= run_git("rev-parse$local2>/dev/null");chomp($HEAD_sha1);1075# Get sha1 of commit pointed by remotes/$remotename/master1076my$remoteorigin_sha1= run_git("rev-parse refs/remotes/$remotename/master2>/dev/null");1077chomp($remoteorigin_sha1);10781079if($last_local_revid>0&&1080$last_local_revid<$last_remote_revid) {1081return error_non_fast_forward($remote);1082}10831084if($HEAD_sha1eq$remoteorigin_sha1) {1085# nothing to push1086return0;1087}10881089# Get every commit in between HEAD and refs/remotes/origin/master,1090# including HEAD and refs/remotes/origin/master1091my@commit_pairs= ();1092if($last_local_revid>0) {1093my$parsed_sha1=$remoteorigin_sha1;1094# Find a path from last MediaWiki commit to pushed commit1095while($parsed_sha1ne$HEAD_sha1) {1096my@commit_info=grep(/^$parsed_sha1/,split(/\n/, run_git("rev-list --children$local")));1097if(!@commit_info) {1098return error_non_fast_forward($remote);1099}1100my@commit_info_split=split(/ |\n/,$commit_info[0]);1101# $commit_info_split[1] is the sha1 of the commit to export1102# $commit_info_split[0] is the sha1 of its direct child1103push(@commit_pairs, \@commit_info_split);1104$parsed_sha1=$commit_info_split[1];1105}1106}else{1107# No remote mediawiki revision. Export the whole1108# history (linearized with --first-parent)1109print STDERR "Warning: no common ancestor, pushing complete history\n";1110my$history= run_git("rev-list --first-parent --children$local");1111my@history=split('\n',$history);1112@history=@history[1..$#history];1113foreachmy$line(reverse@history) {1114my@commit_info_split=split(/ |\n/,$line);1115push(@commit_pairs, \@commit_info_split);1116}1117}11181119foreachmy$commit_info_split(@commit_pairs) {1120my$sha1_child= @{$commit_info_split}[0];1121my$sha1_commit= @{$commit_info_split}[1];1122my$diff_infos= run_git("diff-tree -r --raw -z$sha1_child$sha1_commit");1123# TODO: we could detect rename, and encode them with a #redirect on the wiki.1124# TODO: for now, it's just a delete+add1125my@diff_info_list=split(/\0/,$diff_infos);1126# Keep the subject line of the commit message as mediawiki comment for the revision1127my$commit_msg= run_git("log --no-walk --format=\"%s\"$sha1_commit");1128chomp($commit_msg);1129# Push every blob1130while(@diff_info_list) {1131my$status;1132# git diff-tree -z gives an output like1133# <metadata>\0<filename1>\01134# <metadata>\0<filename2>\01135# and we've split on \0.1136my$info=shift(@diff_info_list);1137my$file=shift(@diff_info_list);1138($mw_revision,$status) = mw_push_file($info,$file,$commit_msg,$mw_revision);1139if($statuseq"non-fast-forward") {1140# we may already have sent part of the1141# commit to MediaWiki, but it's too1142# late to cancel it. Stop the push in1143# the middle, but still give an1144# accurate error message.1145return error_non_fast_forward($remote);1146}1147if($statusne"ok") {1148die("Unknown error from mw_push_file()");1149}1150}1151unless($dumb_push) {1152 run_git("notes --ref=$remotename/mediawikiadd -m\"mediawiki_revision:$mw_revision\"$sha1_commit");1153 run_git("update-ref -m\"Git-MediaWiki push\"refs/mediawiki/$remotename/master$sha1_commit$sha1_child");1154}1155}11561157print STDOUT "ok$remote\n";1158return1;1159}11601161sub get_allowed_file_extensions {1162 mw_connect_maybe();11631164my$query= {1165 action =>'query',1166 meta =>'siteinfo',1167 siprop =>'fileextensions'1168};1169my$result=$mediawiki->api($query);1170my@file_extensions=map$_->{ext},@{$result->{query}->{fileextensions}};1171my%hashFile=map{$_=>1}@file_extensions;11721173return%hashFile;1174}11751176# In memory cache for MediaWiki namespace ids.1177my%namespace_id;11781179# Namespaces whose id is cached in the configuration file1180# (to avoid duplicates)1181my%cached_mw_namespace_id;11821183# Return MediaWiki id for a canonical namespace name.1184# Ex.: "File", "Project".1185sub get_mw_namespace_id {1186 mw_connect_maybe();1187my$name=shift;11881189if(!exists$namespace_id{$name}) {1190# Look at configuration file, if the record for that namespace is1191# already cached. Namespaces are stored in form:1192# "Name_of_namespace:Id_namespace", ex.: "File:6".1193my@temp=split(/[ \n]/, run_git("config --get-all remote."1194.$remotename.".namespaceCache"));1195chomp(@temp);1196foreachmy$ns(@temp) {1197my($n,$id) =split(/:/,$ns);1198$namespace_id{$n} =$id;1199$cached_mw_namespace_id{$n} =1;1200}1201}12021203if(!exists$namespace_id{$name}) {1204print STDERR "Namespace$namenot found in cache, querying the wiki ...\n";1205# NS not found => get namespace id from MW and store it in1206# configuration file.1207my$query= {1208 action =>'query',1209 meta =>'siteinfo',1210 siprop =>'namespaces'1211};1212my$result=$mediawiki->api($query);12131214while(my($id,$ns) =each(%{$result->{query}->{namespaces}})) {1215if(defined($ns->{id}) &&defined($ns->{canonical})) {1216$namespace_id{$ns->{canonical}} =$ns->{id};1217if($ns->{'*'}) {1218# alias (e.g. french Fichier: as alias for canonical File:)1219$namespace_id{$ns->{'*'}} =$ns->{id};1220}1221}1222}1223}12241225my$id=$namespace_id{$name};12261227if(defined$id) {1228# Store explicitely requested namespaces on disk1229if(!exists$cached_mw_namespace_id{$name}) {1230 run_git("config --add remote.".$remotename1231.".namespaceCache\"".$name.":".$id."\"");1232$cached_mw_namespace_id{$name} =1;1233}1234return$id;1235}else{1236die"No such namespace$nameon MediaWiki.";1237}1238}