1#! /usr/bin/perl 2 3# Copyright (C) 2011 4# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr> 5# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr> 6# Claire Fousse <claire.fousse@ensimag.imag.fr> 7# David Amouyal <david.amouyal@ensimag.imag.fr> 8# Matthieu Moy <matthieu.moy@grenoble-inp.fr> 9# License: GPL v2 or later 10 11# Gateway between Git and MediaWiki. 12# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/ 13 14use strict; 15use MediaWiki::API; 16use Git; 17use DateTime::Format::ISO8601; 18use warnings; 19 20# By default, use UTF-8 to communicate with Git and the user 21binmode STDERR,":encoding(UTF-8)"; 22binmode STDOUT,":encoding(UTF-8)"; 23 24use URI::Escape; 25use IPC::Open2; 26 27# Mediawiki filenames can contain forward slashes. This variable decides by which pattern they should be replaced 28useconstant SLASH_REPLACEMENT =>"%2F"; 29 30# It's not always possible to delete pages (may require some 31# privileges). Deleted pages are replaced with this content. 32useconstant DELETED_CONTENT =>"[[Category:Deleted]]\n"; 33 34# It's not possible to create empty pages. New empty files in Git are 35# sent with this content instead. 36useconstant EMPTY_CONTENT =>"<!-- empty page -->\n"; 37 38# used to reflect file creation or deletion in diff. 39useconstant NULL_SHA1 =>"0000000000000000000000000000000000000000"; 40 41# Used on Git's side to reflect empty edit messages on the wiki 42useconstant EMPTY_MESSAGE =>'*Empty MediaWiki Message*'; 43 44my$remotename=$ARGV[0]; 45my$url=$ARGV[1]; 46 47# Accept both space-separated and multiple keys in config file. 48# Spaces should be written as _ anyway because we'll use chomp. 49my@tracked_pages=split(/[ \n]/, run_git("config --get-all remote.".$remotename.".pages")); 50chomp(@tracked_pages); 51 52# Just like @tracked_pages, but for MediaWiki categories. 53my@tracked_categories=split(/[ \n]/, run_git("config --get-all remote.".$remotename.".categories")); 54chomp(@tracked_categories); 55 56# Import media files on pull 57my$import_media= run_git("config --get --bool remote.".$remotename.".mediaimport"); 58chomp($import_media); 59$import_media= ($import_mediaeq"true"); 60 61# Export media files on push 62my$export_media= run_git("config --get --bool remote.".$remotename.".mediaexport"); 63chomp($export_media); 64$export_media= !($export_mediaeq"false"); 65 66my$wiki_login= run_git("config --get remote.".$remotename.".mwLogin"); 67# Note: mwPassword is discourraged. Use the credential system instead. 68my$wiki_passwd= run_git("config --get remote.".$remotename.".mwPassword"); 69my$wiki_domain= run_git("config --get remote.".$remotename.".mwDomain"); 70chomp($wiki_login); 71chomp($wiki_passwd); 72chomp($wiki_domain); 73 74# Import only last revisions (both for clone and fetch) 75my$shallow_import= run_git("config --get --bool remote.".$remotename.".shallow"); 76chomp($shallow_import); 77$shallow_import= ($shallow_importeq"true"); 78 79# Fetch (clone and pull) by revisions instead of by pages. This behavior 80# is more efficient when we have a wiki with lots of pages and we fetch 81# the revisions quite often so that they concern only few pages. 82# Possible values: 83# - by_rev: perform one query per new revision on the remote wiki 84# - by_page: query each tracked page for new revision 85my$fetch_strategy= run_git("config --get remote.$remotename.fetchStrategy"); 86unless($fetch_strategy) { 87$fetch_strategy= run_git("config --get mediawiki.fetchStrategy"); 88} 89chomp($fetch_strategy); 90unless($fetch_strategy) { 91$fetch_strategy="by_page"; 92} 93 94# Remember the timestamp corresponding to a revision id. 95my%basetimestamps; 96 97# Dumb push: don't update notes and mediawiki ref to reflect the last push. 98# 99# Configurable with mediawiki.dumbPush, or per-remote with 100# remote.<remotename>.dumbPush. 101# 102# This means the user will have to re-import the just-pushed 103# revisions. On the other hand, this means that the Git revisions 104# corresponding to MediaWiki revisions are all imported from the wiki, 105# regardless of whether they were initially created in Git or from the 106# web interface, hence all users will get the same history (i.e. if 107# the push from Git to MediaWiki loses some information, everybody 108# will get the history with information lost). If the import is 109# deterministic, this means everybody gets the same sha1 for each 110# MediaWiki revision. 111my$dumb_push= run_git("config --get --bool remote.$remotename.dumbPush"); 112unless($dumb_push) { 113$dumb_push= run_git("config --get --bool mediawiki.dumbPush"); 114} 115chomp($dumb_push); 116$dumb_push= ($dumb_pusheq"true"); 117 118my$wiki_name=$url; 119$wiki_name=~s{[^/]*://}{}; 120# If URL is like http://user:password@example.com/, we clearly don't 121# want the password in $wiki_name. While we're there, also remove user 122# and '@' sign, to avoid author like MWUser@HTTPUser@host.com 123$wiki_name=~s/^.*@//; 124 125# Commands parser 126my@cmd; 127while(<STDIN>) { 128chomp; 129@cmd=split(/ /); 130if(defined($cmd[0])) { 131# Line not blank 132if($cmd[0]eq"capabilities") { 133die("Too many arguments for capabilities\n")if(defined($cmd[1])); 134 mw_capabilities(); 135}elsif($cmd[0]eq"list") { 136die("Too many arguments for list\n")if(defined($cmd[2])); 137 mw_list($cmd[1]); 138}elsif($cmd[0]eq"import") { 139die("Invalid arguments for import\n")if($cmd[1]eq""||defined($cmd[2])); 140 mw_import($cmd[1]); 141}elsif($cmd[0]eq"option") { 142die("Too many arguments for option\n")if($cmd[1]eq""||$cmd[2]eq""||defined($cmd[3])); 143 mw_option($cmd[1],$cmd[2]); 144}elsif($cmd[0]eq"push") { 145 mw_push($cmd[1]); 146}else{ 147print STDERR "Unknown command. Aborting...\n"; 148last; 149} 150}else{ 151# blank line: we should terminate 152last; 153} 154 155BEGIN{ $| =1}# flush STDOUT, to make sure the previous 156# command is fully processed. 157} 158 159########################## Functions ############################## 160 161# MediaWiki API instance, created lazily. 162my$mediawiki; 163 164sub mw_connect_maybe { 165if($mediawiki) { 166return; 167} 168$mediawiki= MediaWiki::API->new; 169$mediawiki->{config}->{api_url} ="$url/api.php"; 170if($wiki_login) { 171my%credential= ( 172'url'=>$url, 173'username'=>$wiki_login, 174'password'=>$wiki_passwd 175); 176 Git::credential(\%credential); 177my$request= {lgname =>$credential{username}, 178 lgpassword =>$credential{password}, 179 lgdomain =>$wiki_domain}; 180if($mediawiki->login($request)) { 181 Git::credential(\%credential,'approve'); 182print STDERR "Logged in mediawiki user\"$credential{username}\".\n"; 183}else{ 184print STDERR "Failed to log in mediawiki user\"$credential{username}\"on$url\n"; 185print STDERR " (error ". 186$mediawiki->{error}->{code} .': '. 187$mediawiki->{error}->{details} .")\n"; 188 Git::credential(\%credential,'reject'); 189exit1; 190} 191} 192return; 193} 194 195sub fatal_mw_error { 196my$action=shift; 197print STDERR "fatal: could not$action.\n"; 198print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 199if($url=~/^https/) { 200print STDERR "fatal: make sure '$url/api.php' is a valid page\n"; 201print STDERR "fatal: and the SSL certificate is correct.\n"; 202}else{ 203print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 204} 205print STDERR "fatal: (error ". 206$mediawiki->{error}->{code} .': '. 207$mediawiki->{error}->{details} .")\n"; 208exit1; 209} 210 211## Functions for listing pages on the remote wiki 212sub get_mw_tracked_pages { 213my$pages=shift; 214 get_mw_page_list(\@tracked_pages,$pages); 215return; 216} 217 218sub get_mw_page_list { 219my$page_list=shift; 220my$pages=shift; 221my@some_pages=@$page_list; 222while(@some_pages) { 223my$last_page=50; 224if($#some_pages<$last_page) { 225$last_page=$#some_pages; 226} 227my@slice=@some_pages[0..$last_page]; 228 get_mw_first_pages(\@slice,$pages); 229@some_pages=@some_pages[51..$#some_pages]; 230} 231return; 232} 233 234sub get_mw_tracked_categories { 235my$pages=shift; 236foreachmy$category(@tracked_categories) { 237if(index($category,':') <0) { 238# Mediawiki requires the Category 239# prefix, but let's not force the user 240# to specify it. 241$category="Category:".$category; 242} 243my$mw_pages=$mediawiki->list( { 244 action =>'query', 245 list =>'categorymembers', 246 cmtitle =>$category, 247 cmlimit =>'max'} ) 248||die$mediawiki->{error}->{code} .': ' 249.$mediawiki->{error}->{details} ."\n"; 250foreachmy$page(@{$mw_pages}) { 251$pages->{$page->{title}} =$page; 252} 253} 254return; 255} 256 257sub get_mw_all_pages { 258my$pages=shift; 259# No user-provided list, get the list of pages from the API. 260my$mw_pages=$mediawiki->list({ 261 action =>'query', 262 list =>'allpages', 263 aplimit =>'max' 264}); 265if(!defined($mw_pages)) { 266 fatal_mw_error("get the list of wiki pages"); 267} 268foreachmy$page(@{$mw_pages}) { 269$pages->{$page->{title}} =$page; 270} 271return; 272} 273 274# queries the wiki for a set of pages. Meant to be used within a loop 275# querying the wiki for slices of page list. 276sub get_mw_first_pages { 277my$some_pages=shift; 278my@some_pages= @{$some_pages}; 279 280my$pages=shift; 281 282# pattern 'page1|page2|...' required by the API 283my$titles=join('|',@some_pages); 284 285my$mw_pages=$mediawiki->api({ 286 action =>'query', 287 titles =>$titles, 288}); 289if(!defined($mw_pages)) { 290 fatal_mw_error("query the list of wiki pages"); 291} 292while(my($id,$page) =each(%{$mw_pages->{query}->{pages}})) { 293if($id<0) { 294print STDERR "Warning: page$page->{title} not found on wiki\n"; 295}else{ 296$pages->{$page->{title}} =$page; 297} 298} 299return; 300} 301 302# Get the list of pages to be fetched according to configuration. 303sub get_mw_pages { 304 mw_connect_maybe(); 305 306print STDERR "Listing pages on remote wiki...\n"; 307 308my%pages;# hash on page titles to avoid duplicates 309my$user_defined; 310if(@tracked_pages) { 311$user_defined=1; 312# The user provided a list of pages titles, but we 313# still need to query the API to get the page IDs. 314 get_mw_tracked_pages(\%pages); 315} 316if(@tracked_categories) { 317$user_defined=1; 318 get_mw_tracked_categories(\%pages); 319} 320if(!$user_defined) { 321 get_mw_all_pages(\%pages); 322} 323if($import_media) { 324print STDERR "Getting media files for selected pages...\n"; 325if($user_defined) { 326 get_linked_mediafiles(\%pages); 327}else{ 328 get_all_mediafiles(\%pages); 329} 330} 331print STDERR (scalar keys%pages) ." pages found.\n"; 332return%pages; 333} 334 335# usage: $out = run_git("command args"); 336# $out = run_git("command args", "raw"); # don't interpret output as UTF-8. 337sub run_git { 338my$args=shift; 339my$encoding= (shift||"encoding(UTF-8)"); 340open(my$git,"-|:$encoding","git ".$args) 341or die"Unable to open:$!\n"; 342my$res=do{ 343local$/=undef; 344<$git> 345}; 346close($git); 347 348return$res; 349} 350 351 352sub get_all_mediafiles { 353my$pages=shift; 354# Attach list of all pages for media files from the API, 355# they are in a different namespace, only one namespace 356# can be queried at the same moment 357my$mw_pages=$mediawiki->list({ 358 action =>'query', 359 list =>'allpages', 360 apnamespace => get_mw_namespace_id("File"), 361 aplimit =>'max' 362}); 363if(!defined($mw_pages)) { 364print STDERR "fatal: could not get the list of pages for media files.\n"; 365print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 366print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 367exit1; 368} 369foreachmy$page(@{$mw_pages}) { 370$pages->{$page->{title}} =$page; 371} 372return; 373} 374 375sub get_linked_mediafiles { 376my$pages=shift; 377my@titles=map{$_->{title} }values(%{$pages}); 378 379# The query is split in small batches because of the MW API limit of 380# the number of links to be returned (500 links max). 381my$batch=10; 382while(@titles) { 383if($#titles<$batch) { 384$batch=$#titles; 385} 386my@slice=@titles[0..$batch]; 387 388# pattern 'page1|page2|...' required by the API 389my$mw_titles=join('|',@slice); 390 391# Media files could be included or linked from 392# a page, get all related 393my$query= { 394 action =>'query', 395 prop =>'links|images', 396 titles =>$mw_titles, 397 plnamespace => get_mw_namespace_id("File"), 398 pllimit =>'max' 399}; 400my$result=$mediawiki->api($query); 401 402while(my($id,$page) =each(%{$result->{query}->{pages}})) { 403my@media_titles; 404if(defined($page->{links})) { 405my@link_titles 406=map{$_->{title} } @{$page->{links}}; 407push(@media_titles,@link_titles); 408} 409if(defined($page->{images})) { 410my@image_titles 411=map{$_->{title} } @{$page->{images}}; 412push(@media_titles,@image_titles); 413} 414if(@media_titles) { 415 get_mw_page_list(\@media_titles,$pages); 416} 417} 418 419@titles=@titles[($batch+1)..$#titles]; 420} 421return; 422} 423 424sub get_mw_mediafile_for_page_revision { 425# Name of the file on Wiki, with the prefix. 426my$filename=shift; 427my$timestamp=shift; 428my%mediafile; 429 430# Search if on a media file with given timestamp exists on 431# MediaWiki. In that case download the file. 432my$query= { 433 action =>'query', 434 prop =>'imageinfo', 435 titles =>"File:".$filename, 436 iistart =>$timestamp, 437 iiend =>$timestamp, 438 iiprop =>'timestamp|archivename|url', 439 iilimit =>1 440}; 441my$result=$mediawiki->api($query); 442 443my($fileid,$file) =each( %{$result->{query}->{pages}} ); 444# If not defined it means there is no revision of the file for 445# given timestamp. 446if(defined($file->{imageinfo})) { 447$mediafile{title} =$filename; 448 449my$fileinfo=pop(@{$file->{imageinfo}}); 450$mediafile{timestamp} =$fileinfo->{timestamp}; 451# Mediawiki::API's download function doesn't support https URLs 452# and can't download old versions of files. 453print STDERR "\tDownloading file$mediafile{title}, version$mediafile{timestamp}\n"; 454$mediafile{content} = download_mw_mediafile($fileinfo->{url}); 455} 456return%mediafile; 457} 458 459sub download_mw_mediafile { 460my$download_url=shift; 461 462my$response=$mediawiki->{ua}->get($download_url); 463if($response->code==200) { 464return$response->decoded_content; 465}else{ 466print STDERR "Error downloading mediafile from :\n"; 467print STDERR "URL:$download_url\n"; 468print STDERR "Server response: ".$response->code." ".$response->message."\n"; 469exit1; 470} 471} 472 473sub get_last_local_revision { 474# Get note regarding last mediawiki revision 475my$note= run_git("notes --ref=$remotename/mediawikishow refs/mediawiki/$remotename/master2>/dev/null"); 476my@note_info=split(/ /,$note); 477 478my$lastrevision_number; 479if(!(defined($note_info[0]) &&$note_info[0]eq"mediawiki_revision:")) { 480print STDERR "No previous mediawiki revision found"; 481$lastrevision_number=0; 482}else{ 483# Notes are formatted : mediawiki_revision: #number 484$lastrevision_number=$note_info[1]; 485chomp($lastrevision_number); 486print STDERR "Last local mediawiki revision found is$lastrevision_number"; 487} 488return$lastrevision_number; 489} 490 491# Get the last remote revision without taking in account which pages are 492# tracked or not. This function makes a single request to the wiki thus 493# avoid a loop onto all tracked pages. This is useful for the fetch-by-rev 494# option. 495sub get_last_global_remote_rev { 496 mw_connect_maybe(); 497 498my$query= { 499 action =>'query', 500 list =>'recentchanges', 501 prop =>'revisions', 502 rclimit =>'1', 503 rcdir =>'older', 504}; 505my$result=$mediawiki->api($query); 506return$result->{query}->{recentchanges}[0]->{revid}; 507} 508 509# Get the last remote revision concerning the tracked pages and the tracked 510# categories. 511sub get_last_remote_revision { 512 mw_connect_maybe(); 513 514my%pages_hash= get_mw_pages(); 515my@pages=values(%pages_hash); 516 517my$max_rev_num=0; 518 519print STDERR "Getting last revision id on tracked pages...\n"; 520 521foreachmy$page(@pages) { 522my$id=$page->{pageid}; 523 524my$query= { 525 action =>'query', 526 prop =>'revisions', 527 rvprop =>'ids|timestamp', 528 pageids =>$id, 529}; 530 531my$result=$mediawiki->api($query); 532 533my$lastrev=pop(@{$result->{query}->{pages}->{$id}->{revisions}}); 534 535$basetimestamps{$lastrev->{revid}} =$lastrev->{timestamp}; 536 537$max_rev_num= ($lastrev->{revid} >$max_rev_num?$lastrev->{revid} :$max_rev_num); 538} 539 540print STDERR "Last remote revision found is$max_rev_num.\n"; 541return$max_rev_num; 542} 543 544# Clean content before sending it to MediaWiki 545sub mediawiki_clean { 546my$string=shift; 547my$page_created=shift; 548# Mediawiki does not allow blank space at the end of a page and ends with a single \n. 549# This function right trims a string and adds a \n at the end to follow this rule 550$string=~s/\s+$//; 551if($stringeq""&&$page_created) { 552# Creating empty pages is forbidden. 553$string= EMPTY_CONTENT; 554} 555return$string."\n"; 556} 557 558# Filter applied on MediaWiki data before adding them to Git 559sub mediawiki_smudge { 560my$string=shift; 561if($stringeq EMPTY_CONTENT) { 562$string=""; 563} 564# This \n is important. This is due to mediawiki's way to handle end of files. 565return$string."\n"; 566} 567 568sub mediawiki_clean_filename { 569my$filename=shift; 570$filename=~s{@{[SLASH_REPLACEMENT]}}{/}g; 571# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded. 572# Do a variant of URL-encoding, i.e. looks like URL-encoding, 573# but with _ added to prevent MediaWiki from thinking this is 574# an actual special character. 575$filename=~s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge; 576# If we use the uri escape before 577# we should unescape here, before anything 578 579return$filename; 580} 581 582sub mediawiki_smudge_filename { 583my$filename=shift; 584$filename=~s{/}{@{[SLASH_REPLACEMENT]}}g; 585$filename=~s/ /_/g; 586# Decode forbidden characters encoded in mediawiki_clean_filename 587$filename=~s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge; 588return$filename; 589} 590 591sub literal_data { 592my($content) =@_; 593print STDOUT "data ", bytes::length($content),"\n",$content; 594return; 595} 596 597sub literal_data_raw { 598# Output possibly binary content. 599my($content) =@_; 600# Avoid confusion between size in bytes and in characters 601 utf8::downgrade($content); 602binmode STDOUT,":raw"; 603print STDOUT "data ", bytes::length($content),"\n",$content; 604binmode STDOUT,":encoding(UTF-8)"; 605return; 606} 607 608sub mw_capabilities { 609# Revisions are imported to the private namespace 610# refs/mediawiki/$remotename/ by the helper and fetched into 611# refs/remotes/$remotename later by fetch. 612print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n"; 613print STDOUT "import\n"; 614print STDOUT "list\n"; 615print STDOUT "push\n"; 616print STDOUT "\n"; 617return; 618} 619 620sub mw_list { 621# MediaWiki do not have branches, we consider one branch arbitrarily 622# called master, and HEAD pointing to it. 623print STDOUT "? refs/heads/master\n"; 624print STDOUT "\@refs/heads/masterHEAD\n"; 625print STDOUT "\n"; 626return; 627} 628 629sub mw_option { 630print STDERR "remote-helper command 'option$_[0]' not yet implemented\n"; 631print STDOUT "unsupported\n"; 632return; 633} 634 635sub fetch_mw_revisions_for_page { 636my$page=shift; 637my$id=shift; 638my$fetch_from=shift; 639my@page_revs= (); 640my$query= { 641 action =>'query', 642 prop =>'revisions', 643 rvprop =>'ids', 644 rvdir =>'newer', 645 rvstartid =>$fetch_from, 646 rvlimit =>500, 647 pageids =>$id, 648}; 649 650my$revnum=0; 651# Get 500 revisions at a time due to the mediawiki api limit 652while(1) { 653my$result=$mediawiki->api($query); 654 655# Parse each of those 500 revisions 656foreachmy$revision(@{$result->{query}->{pages}->{$id}->{revisions}}) { 657my$page_rev_ids; 658$page_rev_ids->{pageid} =$page->{pageid}; 659$page_rev_ids->{revid} =$revision->{revid}; 660push(@page_revs,$page_rev_ids); 661$revnum++; 662} 663last unless$result->{'query-continue'}; 664$query->{rvstartid} =$result->{'query-continue'}->{revisions}->{rvstartid}; 665} 666if($shallow_import&&@page_revs) { 667print STDERR " Found 1 revision (shallow import).\n"; 668@page_revs=sort{$b->{revid} <=>$a->{revid}} (@page_revs); 669return$page_revs[0]; 670} 671print STDERR " Found ",$revnum," revision(s).\n"; 672return@page_revs; 673} 674 675sub fetch_mw_revisions { 676my$pages=shift;my@pages= @{$pages}; 677my$fetch_from=shift; 678 679my@revisions= (); 680my$n=1; 681foreachmy$page(@pages) { 682my$id=$page->{pageid}; 683 684print STDERR "page$n/",scalar(@pages),": ".$page->{title} ."\n"; 685$n++; 686my@page_revs= fetch_mw_revisions_for_page($page,$id,$fetch_from); 687@revisions= (@page_revs,@revisions); 688} 689 690return($n,@revisions); 691} 692 693sub fe_escape_path { 694my$path=shift; 695$path=~s/\\/\\\\/g; 696$path=~s/"/\\"/g; 697$path=~s/\n/\\n/g; 698return'"'.$path.'"'; 699} 700 701sub import_file_revision { 702my$commit=shift; 703my%commit= %{$commit}; 704my$full_import=shift; 705my$n=shift; 706my$mediafile=shift; 707my%mediafile; 708if($mediafile) { 709%mediafile= %{$mediafile}; 710} 711 712my$title=$commit{title}; 713my$comment=$commit{comment}; 714my$content=$commit{content}; 715my$author=$commit{author}; 716my$date=$commit{date}; 717 718print STDOUT "commit refs/mediawiki/$remotename/master\n"; 719print STDOUT "mark :$n\n"; 720print STDOUT "committer$author<$author\@$wiki_name> ",$date->epoch," +0000\n"; 721 literal_data($comment); 722 723# If it's not a clone, we need to know where to start from 724if(!$full_import&&$n==1) { 725print STDOUT "from refs/mediawiki/$remotename/master^0\n"; 726} 727if($contentne DELETED_CONTENT) { 728print STDOUT "M 644 inline ". 729 fe_escape_path($title.".mw") ."\n"; 730 literal_data($content); 731if(%mediafile) { 732print STDOUT "M 644 inline " 733. fe_escape_path($mediafile{title}) ."\n"; 734 literal_data_raw($mediafile{content}); 735} 736print STDOUT "\n\n"; 737}else{ 738print STDOUT "D ". fe_escape_path($title.".mw") ."\n"; 739} 740 741# mediawiki revision number in the git note 742if($full_import&&$n==1) { 743print STDOUT "reset refs/notes/$remotename/mediawiki\n"; 744} 745print STDOUT "commit refs/notes/$remotename/mediawiki\n"; 746print STDOUT "committer$author<$author\@$wiki_name> ",$date->epoch," +0000\n"; 747 literal_data("Note added by git-mediawiki during import"); 748if(!$full_import&&$n==1) { 749print STDOUT "from refs/notes/$remotename/mediawiki^0\n"; 750} 751print STDOUT "N inline :$n\n"; 752 literal_data("mediawiki_revision: ".$commit{mw_revision}); 753print STDOUT "\n\n"; 754return; 755} 756 757# parse a sequence of 758# <cmd> <arg1> 759# <cmd> <arg2> 760# \n 761# (like batch sequence of import and sequence of push statements) 762sub get_more_refs { 763my$cmd=shift; 764my@refs; 765while(1) { 766my$line= <STDIN>; 767if($line=~/^$cmd (.*)$/) { 768push(@refs,$1); 769}elsif($lineeq"\n") { 770return@refs; 771}else{ 772die("Invalid command in a '$cmd' batch:$_\n"); 773} 774} 775return; 776} 777 778sub mw_import { 779# multiple import commands can follow each other. 780my@refs= (shift, get_more_refs("import")); 781foreachmy$ref(@refs) { 782 mw_import_ref($ref); 783} 784print STDOUT "done\n"; 785return; 786} 787 788sub mw_import_ref { 789my$ref=shift; 790# The remote helper will call "import HEAD" and 791# "import refs/heads/master". 792# Since HEAD is a symbolic ref to master (by convention, 793# followed by the output of the command "list" that we gave), 794# we don't need to do anything in this case. 795if($refeq"HEAD") { 796return; 797} 798 799 mw_connect_maybe(); 800 801print STDERR "Searching revisions...\n"; 802my$last_local= get_last_local_revision(); 803my$fetch_from=$last_local+1; 804if($fetch_from==1) { 805print STDERR ", fetching from beginning.\n"; 806}else{ 807print STDERR ", fetching from here.\n"; 808} 809 810my$n=0; 811if($fetch_strategyeq"by_rev") { 812print STDERR "Fetching & writing export data by revs...\n"; 813$n= mw_import_ref_by_revs($fetch_from); 814}elsif($fetch_strategyeq"by_page") { 815print STDERR "Fetching & writing export data by pages...\n"; 816$n= mw_import_ref_by_pages($fetch_from); 817}else{ 818print STDERR "fatal: invalid fetch strategy\"$fetch_strategy\".\n"; 819print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n"; 820exit1; 821} 822 823if($fetch_from==1&&$n==0) { 824print STDERR "You appear to have cloned an empty MediaWiki.\n"; 825# Something has to be done remote-helper side. If nothing is done, an error is 826# thrown saying that HEAD is referring to unknown object 0000000000000000000 827# and the clone fails. 828} 829return; 830} 831 832sub mw_import_ref_by_pages { 833 834my$fetch_from=shift; 835my%pages_hash= get_mw_pages(); 836my@pages=values(%pages_hash); 837 838my($n,@revisions) = fetch_mw_revisions(\@pages,$fetch_from); 839 840@revisions=sort{$a->{revid} <=>$b->{revid}}@revisions; 841my@revision_ids=map{$_->{revid} }@revisions; 842 843return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash); 844} 845 846sub mw_import_ref_by_revs { 847 848my$fetch_from=shift; 849my%pages_hash= get_mw_pages(); 850 851my$last_remote= get_last_global_remote_rev(); 852my@revision_ids=$fetch_from..$last_remote; 853return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash); 854} 855 856# Import revisions given in second argument (array of integers). 857# Only pages appearing in the third argument (hash indexed by page titles) 858# will be imported. 859sub mw_import_revids { 860my$fetch_from=shift; 861my$revision_ids=shift; 862my$pages=shift; 863 864my$n=0; 865my$n_actual=0; 866my$last_timestamp=0;# Placeholer in case $rev->timestamp is undefined 867 868foreachmy$pagerevid(@$revision_ids) { 869# Count page even if we skip it, since we display 870# $n/$total and $total includes skipped pages. 871$n++; 872 873# fetch the content of the pages 874my$query= { 875 action =>'query', 876 prop =>'revisions', 877 rvprop =>'content|timestamp|comment|user|ids', 878 revids =>$pagerevid, 879}; 880 881my$result=$mediawiki->api($query); 882 883if(!$result) { 884die"Failed to retrieve modified page for revision$pagerevid\n"; 885} 886 887if(defined($result->{query}->{badrevids}->{$pagerevid})) { 888# The revision id does not exist on the remote wiki. 889next; 890} 891 892if(!defined($result->{query}->{pages})) { 893die"Invalid revision$pagerevid.\n"; 894} 895 896my@result_pages=values(%{$result->{query}->{pages}}); 897my$result_page=$result_pages[0]; 898my$rev=$result_pages[0]->{revisions}->[0]; 899 900my$page_title=$result_page->{title}; 901 902if(!exists($pages->{$page_title})) { 903print STDERR "$n/",scalar(@$revision_ids), 904": Skipping revision #$rev->{revid} of$page_title\n"; 905next; 906} 907 908$n_actual++; 909 910my%commit; 911$commit{author} =$rev->{user} ||'Anonymous'; 912$commit{comment} =$rev->{comment} || EMPTY_MESSAGE; 913$commit{title} = mediawiki_smudge_filename($page_title); 914$commit{mw_revision} =$rev->{revid}; 915$commit{content} = mediawiki_smudge($rev->{'*'}); 916 917if(!defined($rev->{timestamp})) { 918$last_timestamp++; 919}else{ 920$last_timestamp=$rev->{timestamp}; 921} 922$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp); 923 924# Differentiates classic pages and media files. 925my($namespace,$filename) =$page_title=~/^([^:]*):(.*)$/; 926my%mediafile; 927if($namespace) { 928my$id= get_mw_namespace_id($namespace); 929if($id&&$id== get_mw_namespace_id("File")) { 930%mediafile= get_mw_mediafile_for_page_revision($filename,$rev->{timestamp}); 931} 932} 933# If this is a revision of the media page for new version 934# of a file do one common commit for both file and media page. 935# Else do commit only for that page. 936print STDERR "$n/",scalar(@$revision_ids),": Revision #$rev->{revid} of$commit{title}\n"; 937 import_file_revision(\%commit, ($fetch_from==1),$n_actual, \%mediafile); 938} 939 940return$n_actual; 941} 942 943sub error_non_fast_forward { 944my$advice= run_git("config --bool advice.pushNonFastForward"); 945chomp($advice); 946if($advicene"false") { 947# Native git-push would show this after the summary. 948# We can't ask it to display it cleanly, so print it 949# ourselves before. 950print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n"; 951print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n"; 952print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n"; 953} 954print STDOUT "error$_[0]\"non-fast-forward\"\n"; 955return0; 956} 957 958sub mw_upload_file { 959my$complete_file_name=shift; 960my$new_sha1=shift; 961my$extension=shift; 962my$file_deleted=shift; 963my$summary=shift; 964my$newrevid; 965my$path="File:".$complete_file_name; 966my%hashFiles= get_allowed_file_extensions(); 967if(!exists($hashFiles{$extension})) { 968print STDERR "$complete_file_nameis not a permitted file on this wiki.\n"; 969print STDERR "Check the configuration of file uploads in your mediawiki.\n"; 970return$newrevid; 971} 972# Deleting and uploading a file requires a priviledged user 973if($file_deleted) { 974 mw_connect_maybe(); 975my$query= { 976 action =>'delete', 977 title =>$path, 978 reason =>$summary 979}; 980if(!$mediawiki->edit($query)) { 981print STDERR "Failed to delete file on remote wiki\n"; 982print STDERR "Check your permissions on the remote site. Error code:\n"; 983print STDERR $mediawiki->{error}->{code} .':'.$mediawiki->{error}->{details}; 984exit1; 985} 986}else{ 987# Don't let perl try to interpret file content as UTF-8 => use "raw" 988my$content= run_git("cat-file blob$new_sha1","raw"); 989if($contentne"") { 990 mw_connect_maybe(); 991$mediawiki->{config}->{upload_url} = 992"$url/index.php/Special:Upload"; 993$mediawiki->edit({ 994 action =>'upload', 995 filename =>$complete_file_name, 996 comment =>$summary, 997 file => [undef, 998$complete_file_name, 999 Content =>$content],1000 ignorewarnings =>1,1001}, {1002 skip_encoding =>11003} ) ||die$mediawiki->{error}->{code} .':'1004.$mediawiki->{error}->{details} ."\n";1005my$last_file_page=$mediawiki->get_page({title =>$path});1006$newrevid=$last_file_page->{revid};1007print STDERR "Pushed file:$new_sha1-$complete_file_name.\n";1008}else{1009print STDERR "Empty file$complete_file_namenot pushed.\n";1010}1011}1012return$newrevid;1013}10141015sub mw_push_file {1016my$diff_info=shift;1017# $diff_info contains a string in this format:1018# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>1019my@diff_info_split=split(/[ \t]/,$diff_info);10201021# Filename, including .mw extension1022my$complete_file_name=shift;1023# Commit message1024my$summary=shift;1025# MediaWiki revision number. Keep the previous one by default,1026# in case there's no edit to perform.1027my$oldrevid=shift;1028my$newrevid;10291030if($summaryeq EMPTY_MESSAGE) {1031$summary='';1032}10331034my$new_sha1=$diff_info_split[3];1035my$old_sha1=$diff_info_split[2];1036my$page_created= ($old_sha1eq NULL_SHA1);1037my$page_deleted= ($new_sha1eq NULL_SHA1);1038$complete_file_name= mediawiki_clean_filename($complete_file_name);10391040my($title,$extension) =$complete_file_name=~/^(.*)\.([^\.]*)$/;1041if(!defined($extension)) {1042$extension="";1043}1044if($extensioneq"mw") {1045my$ns= get_mw_namespace_id_for_page($complete_file_name);1046if($ns&&$ns== get_mw_namespace_id("File") && (!$export_media)) {1047print STDERR "Ignoring media file related page:$complete_file_name\n";1048return($oldrevid,"ok");1049}1050my$file_content;1051if($page_deleted) {1052# Deleting a page usually requires1053# special privileges. A common1054# convention is to replace the page1055# with this content instead:1056$file_content= DELETED_CONTENT;1057}else{1058$file_content= run_git("cat-file blob$new_sha1");1059}10601061 mw_connect_maybe();10621063my$result=$mediawiki->edit( {1064 action =>'edit',1065 summary =>$summary,1066 title =>$title,1067 basetimestamp =>$basetimestamps{$oldrevid},1068 text => mediawiki_clean($file_content,$page_created),1069}, {1070 skip_encoding =>1# Helps with names with accentuated characters1071});1072if(!$result) {1073if($mediawiki->{error}->{code} ==3) {1074# edit conflicts, considered as non-fast-forward1075print STDERR 'Warning: Error '.1076$mediawiki->{error}->{code} .1077' from mediwiki: '.$mediawiki->{error}->{details} .1078".\n";1079return($oldrevid,"non-fast-forward");1080}else{1081# Other errors. Shouldn't happen => just die()1082die'Fatal: Error '.1083$mediawiki->{error}->{code} .1084' from mediwiki: '.$mediawiki->{error}->{details} ."\n";1085}1086}1087$newrevid=$result->{edit}->{newrevid};1088print STDERR "Pushed file:$new_sha1-$title\n";1089}elsif($export_media) {1090$newrevid= mw_upload_file($complete_file_name,$new_sha1,1091$extension,$page_deleted,1092$summary);1093}else{1094print STDERR "Ignoring media file$title\n";1095}1096$newrevid= ($newrevidor$oldrevid);1097return($newrevid,"ok");1098}10991100sub mw_push {1101# multiple push statements can follow each other1102my@refsspecs= (shift, get_more_refs("push"));1103my$pushed;1104formy$refspec(@refsspecs) {1105my($force,$local,$remote) =$refspec=~/^(\+)?([^:]*):([^:]*)$/1106or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");1107if($force) {1108print STDERR "Warning: forced push not allowed on a MediaWiki.\n";1109}1110if($localeq"") {1111print STDERR "Cannot delete remote branch on a MediaWiki\n";1112print STDOUT "error$remotecannot delete\n";1113next;1114}1115if($remotene"refs/heads/master") {1116print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";1117print STDOUT "error$remoteonly master allowed\n";1118next;1119}1120if(mw_push_revision($local,$remote)) {1121$pushed=1;1122}1123}11241125# Notify Git that the push is done1126print STDOUT "\n";11271128if($pushed&&$dumb_push) {1129print STDERR "Just pushed some revisions to MediaWiki.\n";1130print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";1131print STDERR "needs to be updated with these re-imported commits. You can do this with\n";1132print STDERR "\n";1133print STDERR " git pull --rebase\n";1134print STDERR "\n";1135}1136return;1137}11381139sub mw_push_revision {1140my$local=shift;1141my$remote=shift;# actually, this has to be "refs/heads/master" at this point.1142my$last_local_revid= get_last_local_revision();1143print STDERR ".\n";# Finish sentence started by get_last_local_revision()1144my$last_remote_revid= get_last_remote_revision();1145my$mw_revision=$last_remote_revid;11461147# Get sha1 of commit pointed by local HEAD1148my$HEAD_sha1= run_git("rev-parse$local2>/dev/null");chomp($HEAD_sha1);1149# Get sha1 of commit pointed by remotes/$remotename/master1150my$remoteorigin_sha1= run_git("rev-parse refs/remotes/$remotename/master2>/dev/null");1151chomp($remoteorigin_sha1);11521153if($last_local_revid>0&&1154$last_local_revid<$last_remote_revid) {1155return error_non_fast_forward($remote);1156}11571158if($HEAD_sha1eq$remoteorigin_sha1) {1159# nothing to push1160return0;1161}11621163# Get every commit in between HEAD and refs/remotes/origin/master,1164# including HEAD and refs/remotes/origin/master1165my@commit_pairs= ();1166if($last_local_revid>0) {1167my$parsed_sha1=$remoteorigin_sha1;1168# Find a path from last MediaWiki commit to pushed commit1169print STDERR "Computing path from local to remote ...\n";1170my@local_ancestry=split(/\n/, run_git("rev-list --boundary --parents$local^$parsed_sha1"));1171my%local_ancestry;1172foreachmy$line(@local_ancestry) {1173if(my($child,$parents) =$line=~/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {1174foreachmy$parent(split(/ /,$parents)) {1175$local_ancestry{$parent} =$child;1176}1177}elsif(!$line=~/^([a-f0-9]+)/) {1178die"Unexpected output from git rev-list:$line\n";1179}1180}1181while($parsed_sha1ne$HEAD_sha1) {1182my$child=$local_ancestry{$parsed_sha1};1183if(!$child) {1184printf STDERR "Cannot find a path in history from remote commit to last commit\n";1185return error_non_fast_forward($remote);1186}1187push(@commit_pairs, [$parsed_sha1,$child]);1188$parsed_sha1=$child;1189}1190}else{1191# No remote mediawiki revision. Export the whole1192# history (linearized with --first-parent)1193print STDERR "Warning: no common ancestor, pushing complete history\n";1194my$history= run_git("rev-list --first-parent --children$local");1195my@history=split(/\n/,$history);1196@history=@history[1..$#history];1197foreachmy$line(reverse@history) {1198my@commit_info_split=split(/[ \n]/,$line);1199push(@commit_pairs, \@commit_info_split);1200}1201}12021203foreachmy$commit_info_split(@commit_pairs) {1204my$sha1_child= @{$commit_info_split}[0];1205my$sha1_commit= @{$commit_info_split}[1];1206my$diff_infos= run_git("diff-tree -r --raw -z$sha1_child$sha1_commit");1207# TODO: we could detect rename, and encode them with a #redirect on the wiki.1208# TODO: for now, it's just a delete+add1209my@diff_info_list=split(/\0/,$diff_infos);1210# Keep the subject line of the commit message as mediawiki comment for the revision1211my$commit_msg= run_git("log --no-walk --format=\"%s\"$sha1_commit");1212chomp($commit_msg);1213# Push every blob1214while(@diff_info_list) {1215my$status;1216# git diff-tree -z gives an output like1217# <metadata>\0<filename1>\01218# <metadata>\0<filename2>\01219# and we've split on \0.1220my$info=shift(@diff_info_list);1221my$file=shift(@diff_info_list);1222($mw_revision,$status) = mw_push_file($info,$file,$commit_msg,$mw_revision);1223if($statuseq"non-fast-forward") {1224# we may already have sent part of the1225# commit to MediaWiki, but it's too1226# late to cancel it. Stop the push in1227# the middle, but still give an1228# accurate error message.1229return error_non_fast_forward($remote);1230}1231if($statusne"ok") {1232die("Unknown error from mw_push_file()\n");1233}1234}1235unless($dumb_push) {1236 run_git("notes --ref=$remotename/mediawikiadd -f -m\"mediawiki_revision:$mw_revision\"$sha1_commit");1237 run_git("update-ref -m\"Git-MediaWiki push\"refs/mediawiki/$remotename/master$sha1_commit$sha1_child");1238}1239}12401241print STDOUT "ok$remote\n";1242return1;1243}12441245sub get_allowed_file_extensions {1246 mw_connect_maybe();12471248my$query= {1249 action =>'query',1250 meta =>'siteinfo',1251 siprop =>'fileextensions'1252};1253my$result=$mediawiki->api($query);1254my@file_extensions=map{$_->{ext}} @{$result->{query}->{fileextensions}};1255my%hashFile=map{$_=>1}@file_extensions;12561257return%hashFile;1258}12591260# In memory cache for MediaWiki namespace ids.1261my%namespace_id;12621263# Namespaces whose id is cached in the configuration file1264# (to avoid duplicates)1265my%cached_mw_namespace_id;12661267# Return MediaWiki id for a canonical namespace name.1268# Ex.: "File", "Project".1269sub get_mw_namespace_id {1270 mw_connect_maybe();1271my$name=shift;12721273if(!exists$namespace_id{$name}) {1274# Look at configuration file, if the record for that namespace is1275# already cached. Namespaces are stored in form:1276# "Name_of_namespace:Id_namespace", ex.: "File:6".1277my@temp=split(/\n/, run_git("config --get-all remote."1278.$remotename.".namespaceCache"));1279chomp(@temp);1280foreachmy$ns(@temp) {1281my($n,$id) =split(/:/,$ns);1282if($ideq'notANameSpace') {1283$namespace_id{$n} = {is_namespace =>0};1284}else{1285$namespace_id{$n} = {is_namespace =>1, id =>$id};1286}1287$cached_mw_namespace_id{$n} =1;1288}1289}12901291if(!exists$namespace_id{$name}) {1292print STDERR "Namespace$namenot found in cache, querying the wiki ...\n";1293# NS not found => get namespace id from MW and store it in1294# configuration file.1295my$query= {1296 action =>'query',1297 meta =>'siteinfo',1298 siprop =>'namespaces'1299};1300my$result=$mediawiki->api($query);13011302while(my($id,$ns) =each(%{$result->{query}->{namespaces}})) {1303if(defined($ns->{id}) &&defined($ns->{canonical})) {1304$namespace_id{$ns->{canonical}} = {is_namespace =>1, id =>$ns->{id}};1305if($ns->{'*'}) {1306# alias (e.g. french Fichier: as alias for canonical File:)1307$namespace_id{$ns->{'*'}} = {is_namespace =>1, id =>$ns->{id}};1308}1309}1310}1311}13121313my$ns=$namespace_id{$name};1314my$id;13151316unless(defined$ns) {1317print STDERR "No such namespace$nameon MediaWiki.\n";1318$ns= {is_namespace =>0};1319$namespace_id{$name} =$ns;1320}13211322if($ns->{is_namespace}) {1323$id=$ns->{id};1324}13251326# Store "notANameSpace" as special value for inexisting namespaces1327my$store_id= ($id||'notANameSpace');13281329# Store explicitely requested namespaces on disk1330if(!exists$cached_mw_namespace_id{$name}) {1331 run_git("config --add remote.".$remotename1332.".namespaceCache\"".$name.":".$store_id."\"");1333$cached_mw_namespace_id{$name} =1;1334}1335return$id;1336}13371338sub get_mw_namespace_id_for_page {1339my$namespace=shift;1340if($namespace=~/^([^:]*):/) {1341return get_mw_namespace_id($namespace);1342}else{1343return;1344}1345}