#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# MediaWiki filenames can contain forward slashes. This variable decides
# by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

if (@ARGV != 2) {
	exit_error_usage();
}

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote." . $remotename . ".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");
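
# Example settings for the options read above (the remote name and page
# names are placeholders):
#
#   git config remote.origin.pages 'Main_Page Another_Page'
#   git config remote.origin.categories 'Some_Category'
#   git config --bool remote.origin.mediaimport true
#   git config --bool remote.origin.shallow true
#
# Page titles containing spaces are written with underscores, as noted above.
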
# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

## error handling
sub exit_error_usage {
	die "ERROR: git-remote-mediawiki module was not called with a correct number of\n" .
	    "parameters\n" .
	    "You may obtain this error because you attempted to run the git-remote-mediawiki\n" .
	    "module directly.\n" .
	    "This module can be used in the following way:\n" .
	    "\tgit clone mediawiki://<address of a mediawiki>\n" .
	    "Then, use git commit, push and pull as with every normal git repository.\n";
}
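
# When remote.<remotename>.mwLogin is set, mw_connect_maybe() below obtains
# the password through Git's credential API (Git::credential), so mwPassword
# does not need to be stored in the configuration. Example, with a
# hypothetical remote named "origin" and a placeholder user name:
#
#   git config remote.origin.mwLogin 'WikiUser'
#
# A configured git credential helper can then supply the password.
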
# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = (
			'url' => $url,
			'username' => $wiki_login,
			'password' => $wiki_passwd
		);
		Git::credential(\%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			Git::credential(\%credential, 'approve');
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR " (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			Git::credential(\%credential, 'reject');
			exit 1;
		}
	}
}

sub fatal_mw_error {
	my $action = shift;
	print STDERR "fatal: could not $action.\n";
	print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
	if ($url =~ /^https/) {
		print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
		print STDERR "fatal: and the SSL certificate is correct.\n";
	} else {
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
	}
	print STDERR "fatal: (error " .
		$mediawiki->{error}->{code} . ': ' .
		$mediawiki->{error}->{details} . ")\n";
	exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max'} )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("get the list of wiki pages");
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("query the list of wiki pages");
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print STDERR "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print STDERR (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Get the list of all media file pages from the API; they are
	# in a different namespace, and only one namespace can be
	# queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit of
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if a media file with the given timestamp exists on
	# MediaWiki. In that case download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# Mediawiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;
# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki,
# thus avoiding a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print STDERR "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page, and pages end with a single \n.
	# This function right-trims a string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to MediaWiki's way of handling the end of files.
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}
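
# The subs below implement the remote-helper commands advertised by
# mw_capabilities(): "list" exposes the wiki as a single "master" branch,
# "import" writes a git fast-import stream on STDOUT (import_file_revision,
# which uses the literal_data helpers above for its "data" blocks), and
# "push" turns Git commits back into MediaWiki edits (mw_push,
# mw_push_revision).
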
sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, with HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR " Found 1 revision (shallow import).\n";
		@page_revs = sort { $b->{revid} <=> $a->{revid} } (@page_revs);
		return $page_revs[0];
	}
	print STDERR " Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub fe_escape_path {
	my $path = shift;
	$path =~ s/\\/\\\\/g;
	$path =~ s/"/\\"/g;
	$path =~ s/\n/\\n/g;
	return '"' . $path . '"';
}

sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline " .
		    fe_escape_path($title . ".mw") . "\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline "
			    . fe_escape_path($mediafile{title}) . "\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D " .
		    fe_escape_path($title . ".mw") . "\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $line);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort { $a->{revid} <=> $b->{revid} } @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}
# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id("File")) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both file and media page.
		# Else do a commit only for that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
			print STDERR "Ignoring media file related page: $complete_file_name\n";
			return ($oldrevid, "ok");
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges.
			# A common convention is to replace
			# the page with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print STDERR "Ignoring media file $title\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR "  git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print STDERR "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(' ', $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif ($line !~ m/^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: $line";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				print STDERR "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect rename, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it.
				# Stop the push in the middle, but
				# still give an accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map { $_ => 1 } @file_extensions;

	return %hashFile;
}

# In memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/[\n]/, run_git("config --get-all remote."
						 . $remotename . ".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. french Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	unless (defined $ns) {
		print STDERR "No such namespace $name on MediaWiki.\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as special value for nonexistent namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git("config --add remote." . $remotename
			. ".namespaceCache \"" . $name . ":" . $store_id . "\"");
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	if (my ($namespace) = $_[0] =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}