#! /usr/bin/perl

# Copyright (C) 2011
#     Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
#     Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
#     Claire Fousse <claire.fousse@ensimag.imag.fr>
#     David Amouyal <david.amouyal@ensimag.imag.fr>
#     Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# MediaWiki filenames can contain forward slashes. This variable decides
# by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote." . $remotename . ".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only the last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");
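
# Illustrative sketch only: the configuration keys read above are plain
# "git config" entries. Assuming a hypothetical remote named "origin"
# cloned from a hypothetical wiki URL, a setup might look like:
#
#   git clone mediawiki::http://example.com/wiki repo
#   git config remote.origin.pages "Main_Page Another_Page"
#   git config remote.origin.categories "SomeCategory"
#   git config --bool remote.origin.mediaimport true
#   git config --bool remote.origin.mediaexport true
#   git config remote.origin.mwLogin "WikiUser"
#   git config --bool remote.origin.shallow false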

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often, so that they concern only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
    $fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
    $fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
    $dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
    chomp;
    @cmd = split(/ /);
    if (defined($cmd[0])) {
        # Line not blank
        if ($cmd[0] eq "capabilities") {
            die("Too many arguments for capabilities") unless (!defined($cmd[1]));
            mw_capabilities();
        } elsif ($cmd[0] eq "list") {
            die("Too many arguments for list") unless (!defined($cmd[2]));
            mw_list($cmd[1]);
        } elsif ($cmd[0] eq "import") {
            die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
            mw_import($cmd[1]);
        } elsif ($cmd[0] eq "option") {
            die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
            mw_option($cmd[1], $cmd[2]);
        } elsif ($cmd[0] eq "push") {
            mw_push($cmd[1]);
        } else {
            print STDERR "Unknown command. Aborting...\n";
            last;
        }
    } else {
        # Blank line: we should terminate
        last;
    }

    BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
                     # command is fully processed.
}
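
# Rough sketch of a session as seen by the parser above (the exact
# sequence is chosen by Git, not by us): Git writes one command per
# line on stdin, for instance
#
#   capabilities              -> answered by mw_capabilities()
#   list                      -> answered by mw_list()
#   import refs/heads/master  -> answered by mw_import()
#
# and a blank line ends the loop.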

########################## Functions ##############################

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
    if ($mediawiki) {
        return;
    }
    $mediawiki = MediaWiki::API->new;
    $mediawiki->{config}->{api_url} = "$url/api.php";
    if ($wiki_login) {
        my %credential = (
            'url' => $url,
            'username' => $wiki_login,
            'password' => $wiki_passwd
        );
        Git::credential(\%credential);
        my $request = {lgname => $credential{username},
                       lgpassword => $credential{password},
                       lgdomain => $wiki_domain};
        if ($mediawiki->login($request)) {
            Git::credential(\%credential, 'approve');
            print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
        } else {
            print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
            print STDERR " (error " .
                $mediawiki->{error}->{code} . ': ' .
                $mediawiki->{error}->{details} . ")\n";
            Git::credential(\%credential, 'reject');
            exit 1;
        }
    }
}

sub fatal_mw_error {
    my $action = shift;
    print STDERR "fatal: could not $action.\n";
    print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
    if ($url =~ /^https/) {
        print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
        print STDERR "fatal: and the SSL certificate is correct.\n";
    } else {
        print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
    }
    print STDERR "fatal: (error " .
        $mediawiki->{error}->{code} . ': ' .
        $mediawiki->{error}->{details} . ")\n";
    exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
    my $pages = shift;
    get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
    my $page_list = shift;
    my $pages = shift;
    my @some_pages = @$page_list;
    while (@some_pages) {
        my $last = 50;
        if ($#some_pages < $last) {
            $last = $#some_pages;
        }
        my @slice = @some_pages[0..$last];
        get_mw_first_pages(\@slice, $pages);
        @some_pages = @some_pages[51..$#some_pages];
    }
}

sub get_mw_tracked_categories {
    my $pages = shift;
    foreach my $category (@tracked_categories) {
        if (index($category, ':') < 0) {
            # MediaWiki requires the Category
            # prefix, but let's not force the user
            # to specify it.
            $category = "Category:" . $category;
        }
        my $mw_pages = $mediawiki->list( {
            action => 'query',
            list => 'categorymembers',
            cmtitle => $category,
            cmlimit => 'max' } )
            || die $mediawiki->{error}->{code} . ': '
                . $mediawiki->{error}->{details};
        foreach my $page (@{$mw_pages}) {
            $pages->{$page->{title}} = $page;
        }
    }
}

sub get_mw_all_pages {
    my $pages = shift;
    # No user-provided list, get the list of pages from the API.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("get the list of wiki pages");
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
}

# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
    my $some_pages = shift;
    my @some_pages = @{$some_pages};

    my $pages = shift;

    # pattern 'page1|page2|...' required by the API
    my $titles = join('|', @some_pages);

    my $mw_pages = $mediawiki->api({
        action => 'query',
        titles => $titles,
    });
    if (!defined($mw_pages)) {
        fatal_mw_error("query the list of wiki pages");
    }
    while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
        if ($id < 0) {
            print STDERR "Warning: page $page->{title} not found on wiki\n";
        } else {
            $pages->{$page->{title}} = $page;
        }
    }
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
    mw_connect_maybe();

    print STDERR "Listing pages on remote wiki...\n";

    my %pages; # hash on page titles to avoid duplicates
    my $user_defined;
    if (@tracked_pages) {
        $user_defined = 1;
        # The user provided a list of page titles, but we
        # still need to query the API to get the page IDs.
        get_mw_tracked_pages(\%pages);
    }
    if (@tracked_categories) {
        $user_defined = 1;
        get_mw_tracked_categories(\%pages);
    }
    if (!$user_defined) {
        get_mw_all_pages(\%pages);
    }
    if ($import_media) {
        print STDERR "Getting media files for selected pages...\n";
        if ($user_defined) {
            get_linked_mediafiles(\%pages);
        } else {
            get_all_mediafiles(\%pages);
        }
    }
    print STDERR (scalar keys %pages) . " pages found.\n";
    return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
    my $args = shift;
    my $encoding = (shift || "encoding(UTF-8)");
    open(my $git, "-|:$encoding", "git " . $args);
    my $res = do { local $/; <$git> };
    close($git);

    return $res;
}


sub get_all_mediafiles {
    my $pages = shift;
    # Attach list of all pages for media files from the API;
    # they are in a different namespace, and only one namespace
    # can be queried at a time.
    my $mw_pages = $mediawiki->list({
        action => 'query',
        list => 'allpages',
        apnamespace => get_mw_namespace_id("File"),
        aplimit => 'max'
    });
    if (!defined($mw_pages)) {
        print STDERR "fatal: could not get the list of pages for media files.\n";
        print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
        print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
        exit 1;
    }
    foreach my $page (@{$mw_pages}) {
        $pages->{$page->{title}} = $page;
    }
}

sub get_linked_mediafiles {
    my $pages = shift;
    my @titles = map $_->{title}, values(%{$pages});

    # The query is split in small batches because of the MW API limit on
    # the number of links to be returned (500 links max).
    my $batch = 10;
    while (@titles) {
        if ($#titles < $batch) {
            $batch = $#titles;
        }
        my @slice = @titles[0..$batch];

        # pattern 'page1|page2|...' required by the API
        my $mw_titles = join('|', @slice);

        # Media files could be included or linked from
        # a page, get all related
        my $query = {
            action => 'query',
            prop => 'links|images',
            titles => $mw_titles,
            plnamespace => get_mw_namespace_id("File"),
            pllimit => 'max'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
            my @media_titles;
            if (defined($page->{links})) {
                my @link_titles = map $_->{title}, @{$page->{links}};
                push(@media_titles, @link_titles);
            }
            if (defined($page->{images})) {
                my @image_titles = map $_->{title}, @{$page->{images}};
                push(@media_titles, @image_titles);
            }
            if (@media_titles) {
                get_mw_page_list(\@media_titles, $pages);
            }
        }

        @titles = @titles[($batch+1)..$#titles];
    }
}

sub get_mw_mediafile_for_page_revision {
    # Name of the file on the wiki, with the prefix.
    my $filename = shift;
    my $timestamp = shift;
    my %mediafile;

    # Search if a media file with the given timestamp exists on
    # MediaWiki. In that case, download the file.
    my $query = {
        action => 'query',
        prop => 'imageinfo',
        titles => "File:" . $filename,
        iistart => $timestamp,
        iiend => $timestamp,
        iiprop => 'timestamp|archivename|url',
        iilimit => 1
    };
    my $result = $mediawiki->api($query);

    my ($fileid, $file) = each( %{$result->{query}->{pages}} );
    # If not defined, it means there is no revision of the file for
    # the given timestamp.
    if (defined($file->{imageinfo})) {
        $mediafile{title} = $filename;

        my $fileinfo = pop(@{$file->{imageinfo}});
        $mediafile{timestamp} = $fileinfo->{timestamp};
        # Mediawiki::API's download function doesn't support https URLs
        # and can't download old versions of files.
        print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
        $mediafile{content} = download_mw_mediafile($fileinfo->{url});
    }
    return %mediafile;
}

sub download_mw_mediafile {
    my $url = shift;

    my $response = $mediawiki->{ua}->get($url);
    if ($response->code == 200) {
        return $response->decoded_content;
    } else {
        print STDERR "Error downloading mediafile from:\n";
        print STDERR "URL: $url\n";
        print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
        exit 1;
    }
}

sub get_last_local_revision {
    # Get note regarding last mediawiki revision
    my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
    my @note_info = split(/ /, $note);

    my $lastrevision_number;
    if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
        print STDERR "No previous mediawiki revision found";
        $lastrevision_number = 0;
    } else {
        # Notes are formatted: mediawiki_revision: #number
        $lastrevision_number = $note_info[1];
        chomp($lastrevision_number);
        print STDERR "Last local mediawiki revision found is $lastrevision_number";
    }
    return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages are
# tracked or not. This function makes a single request to the wiki, thus
# avoiding a loop over all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        list => 'recentchanges',
        prop => 'revisions',
        rclimit => '1',
        rcdir => 'older',
    };
    my $result = $mediawiki->api($query);
    return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
    mw_connect_maybe();

    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my $max_rev_num = 0;

    print STDERR "Getting last revision id on tracked pages...\n";

    foreach my $page (@pages) {
        my $id = $page->{pageid};

        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'ids|timestamp',
            pageids => $id,
        };

        my $result = $mediawiki->api($query);

        my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

        $basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

        $max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
    }

    print STDERR "Last remote revision found is $max_rev_num.\n";
    return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
    my $string = shift;
    my $page_created = shift;
    # MediaWiki does not allow blank space at the end of a page, and pages
    # end with a single \n. This function right-trims a string and adds a
    # \n at the end to follow this rule.
    $string =~ s/\s+$//;
    if ($string eq "" && $page_created) {
        # Creating empty pages is forbidden.
        $string = EMPTY_CONTENT;
    }
    return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
    my $string = shift;
    if ($string eq EMPTY_CONTENT) {
        $string = "";
    }
    # This \n is important. This is due to mediawiki's way to handle end of files.
    return $string . "\n";
}

sub mediawiki_clean_filename {
    my $filename = shift;
    $filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
    # [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
    # Do a variant of URL-encoding, i.e. it looks like URL-encoding,
    # but with _ added to prevent MediaWiki from thinking this is
    # an actual special character.
    $filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
    # If we use URI escaping beforehand,
    # we should unescape here, before anything else.

    return $filename;
}

sub mediawiki_smudge_filename {
    my $filename = shift;
    $filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
    $filename =~ s/ /_/g;
    # Decode forbidden characters encoded in mediawiki_clean_filename
    $filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
    return $filename;
}

sub literal_data {
    my ($content) = @_;
    print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
    # Output possibly binary content.
    my ($content) = @_;
    # Avoid confusion between size in bytes and in characters
    utf8::downgrade($content);
    binmode STDOUT, ":raw";
    print STDOUT "data ", bytes::length($content), "\n", $content;
    binmode STDOUT, ":utf8";
}

sub mw_capabilities {
    # Revisions are imported to the private namespace
    # refs/mediawiki/$remotename/ by the helper and fetched into
    # refs/remotes/$remotename later by fetch.
    print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
    print STDOUT "import\n";
    print STDOUT "list\n";
    print STDOUT "push\n";
    print STDOUT "\n";
}

sub mw_list {
    # MediaWiki does not have branches; we consider one branch arbitrarily
    # called master, and HEAD pointing to it.
    print STDOUT "? refs/heads/master\n";
    print STDOUT "\@refs/heads/master HEAD\n";
    print STDOUT "\n";
}

sub mw_option {
    print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
    print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
    my $page = shift;
    my $id = shift;
    my $fetch_from = shift;
    my @page_revs = ();
    my $query = {
        action => 'query',
        prop => 'revisions',
        rvprop => 'ids',
        rvdir => 'newer',
        rvstartid => $fetch_from,
        rvlimit => 500,
        pageids => $id,
    };

    my $revnum = 0;
    # Get 500 revisions at a time due to the MediaWiki API limit
    while (1) {
        my $result = $mediawiki->api($query);

        # Parse each of those 500 revisions
        foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
            my $page_rev_ids;
            $page_rev_ids->{pageid} = $page->{pageid};
            $page_rev_ids->{revid} = $revision->{revid};
            push(@page_revs, $page_rev_ids);
            $revnum++;
        }
        last unless $result->{'query-continue'};
        $query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
    }
    if ($shallow_import && @page_revs) {
        print STDERR "  Found 1 revision (shallow import).\n";
        @page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
        return $page_revs[0];
    }
    print STDERR "  Found ", $revnum, " revision(s).\n";
    return @page_revs;
}

sub fetch_mw_revisions {
    my $pages = shift; my @pages = @{$pages};
    my $fetch_from = shift;

    my @revisions = ();
    my $n = 1;
    foreach my $page (@pages) {
        my $id = $page->{pageid};

        print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
        $n++;
        my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
        @revisions = (@page_revs, @revisions);
    }

    return ($n, @revisions);
}

sub fe_escape_path {
    my $path = shift;
    $path =~ s/\\/\\\\/g;
    $path =~ s/"/\\"/g;
    $path =~ s/\n/\\n/g;
    return '"' . $path . '"';
}
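
# For example, given the substitutions above, a path such as
#   My "quoted" page.mw
# is emitted to fast-import as
#   "My \"quoted\" page.mw"
# (backslashes, double quotes and newlines escaped, the whole path
# wrapped in double quotes, as expected by fast-import's quoted-path
# syntax).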

sub import_file_revision {
    my $commit = shift;
    my %commit = %{$commit};
    my $full_import = shift;
    my $n = shift;
    my $mediafile = shift;
    my %mediafile;
    if ($mediafile) {
        %mediafile = %{$mediafile};
    }

    my $title = $commit{title};
    my $comment = $commit{comment};
    my $content = $commit{content};
    my $author = $commit{author};
    my $date = $commit{date};

    print STDOUT "commit refs/mediawiki/$remotename/master\n";
    print STDOUT "mark :$n\n";
    print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
    literal_data($comment);

    # If it's not a clone, we need to know where to start from
    if (!$full_import && $n == 1) {
        print STDOUT "from refs/mediawiki/$remotename/master^0\n";
    }
    if ($content ne DELETED_CONTENT) {
        print STDOUT "M 644 inline " .
            fe_escape_path($title . ".mw") . "\n";
        literal_data($content);
        if (%mediafile) {
            print STDOUT "M 644 inline "
                . fe_escape_path($mediafile{title}) . "\n";
            literal_data_raw($mediafile{content});
        }
        print STDOUT "\n\n";
    } else {
        print STDOUT "D " .
            fe_escape_path($title . ".mw") . "\n";
    }

    # mediawiki revision number in the git note
    if ($full_import && $n == 1) {
        print STDOUT "reset refs/notes/$remotename/mediawiki\n";
    }
    print STDOUT "commit refs/notes/$remotename/mediawiki\n";
    print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
    literal_data("Note added by git-mediawiki during import");
    if (!$full_import && $n == 1) {
        print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
    }
    print STDOUT "N inline :$n\n";
    literal_data("mediawiki_revision: " . $commit{mw_revision});
    print STDOUT "\n\n";
}

# Parse a sequence of
#     <cmd> <arg1>
#     <cmd> <arg2>
#     \n
# (like a batch sequence of import commands or a sequence of push statements)
sub get_more_refs {
    my $cmd = shift;
    my @refs;
    while (1) {
        my $line = <STDIN>;
        if ($line =~ m/^$cmd (.*)$/) {
            push(@refs, $1);
        } elsif ($line eq "\n") {
            return @refs;
        } else {
            die("Invalid command in a '$cmd' batch: " . $_);
        }
    }
}

sub mw_import {
    # Multiple import commands can follow each other.
    my @refs = (shift, get_more_refs("import"));
    foreach my $ref (@refs) {
        mw_import_ref($ref);
    }
    print STDOUT "done\n";
}

sub mw_import_ref {
    my $ref = shift;
    # The remote helper will call "import HEAD" and
    # "import refs/heads/master".
    # Since HEAD is a symbolic ref to master (by convention,
    # followed by the output of the command "list" that we gave),
    # we don't need to do anything in this case.
    if ($ref eq "HEAD") {
        return;
    }

    mw_connect_maybe();

    print STDERR "Searching revisions...\n";
    my $last_local = get_last_local_revision();
    my $fetch_from = $last_local + 1;
    if ($fetch_from == 1) {
        print STDERR ", fetching from beginning.\n";
    } else {
        print STDERR ", fetching from here.\n";
    }

    my $n = 0;
    if ($fetch_strategy eq "by_rev") {
        print STDERR "Fetching & writing export data by revs...\n";
        $n = mw_import_ref_by_revs($fetch_from);
    } elsif ($fetch_strategy eq "by_page") {
        print STDERR "Fetching & writing export data by pages...\n";
        $n = mw_import_ref_by_pages($fetch_from);
    } else {
        print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
        print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
        exit 1;
    }

    if ($fetch_from == 1 && $n == 0) {
        print STDERR "You appear to have cloned an empty MediaWiki.\n";
        # Something has to be done remote-helper side. If nothing is done, an error is
        # thrown saying that HEAD is referring to unknown object 0000000000000000000
        # and the clone fails.
    }
}

sub mw_import_ref_by_pages {

    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();
    my @pages = values(%pages_hash);

    my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

    @revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
    my @revision_ids = map $_->{revid}, @revisions;

    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

    my $fetch_from = shift;
    my %pages_hash = get_mw_pages();

    my $last_remote = get_last_global_remote_rev();
    my @revision_ids = $fetch_from..$last_remote;
    return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in the second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
    my $fetch_from = shift;
    my $revision_ids = shift;
    my $pages = shift;

    my $n = 0;
    my $n_actual = 0;
    my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

    foreach my $pagerevid (@$revision_ids) {
        # Count page even if we skip it, since we display
        # $n/$total and $total includes skipped pages.
        $n++;

        # Fetch the content of the pages
        my $query = {
            action => 'query',
            prop => 'revisions',
            rvprop => 'content|timestamp|comment|user|ids',
            revids => $pagerevid,
        };

        my $result = $mediawiki->api($query);

        if (!$result) {
            die "Failed to retrieve modified page for revision $pagerevid";
        }

        if (defined($result->{query}->{badrevids}->{$pagerevid})) {
            # The revision id does not exist on the remote wiki.
            next;
        }

        if (!defined($result->{query}->{pages})) {
            die "Invalid revision $pagerevid.";
        }

        my @result_pages = values(%{$result->{query}->{pages}});
        my $result_page = $result_pages[0];
        my $rev = $result_pages[0]->{revisions}->[0];

        my $page_title = $result_page->{title};

        if (!exists($pages->{$page_title})) {
            print STDERR "$n/", scalar(@$revision_ids),
                ": Skipping revision #$rev->{revid} of $page_title\n";
            next;
        }

        $n_actual++;

        my %commit;
        $commit{author} = $rev->{user} || 'Anonymous';
        $commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
        $commit{title} = mediawiki_smudge_filename($page_title);
        $commit{mw_revision} = $rev->{revid};
        $commit{content} = mediawiki_smudge($rev->{'*'});

        if (!defined($rev->{timestamp})) {
            $last_timestamp++;
        } else {
            $last_timestamp = $rev->{timestamp};
        }
        $commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

        # Differentiate classic pages and media files.
        my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
        my %mediafile;
        if ($namespace) {
            my $id = get_mw_namespace_id($namespace);
            if ($id && $id == get_mw_namespace_id("File")) {
                %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
            }
        }
        # If this is a revision of the media page for a new version
        # of a file, do one common commit for both the file and the media
        # page. Otherwise, commit only that page.
        print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
        import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
    }

    return $n_actual;
}

sub error_non_fast_forward {
    my $advice = run_git("config --bool advice.pushNonFastForward");
    chomp($advice);
    if ($advice ne "false") {
        # Native git-push would show this after the summary.
        # We can't ask it to display it cleanly, so print it
        # ourselves before.
        print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
        print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
        print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
    }
    print STDOUT "error $_[0] \"non-fast-forward\"\n";
    return 0;
}

sub mw_upload_file {
    my $complete_file_name = shift;
    my $new_sha1 = shift;
    my $extension = shift;
    my $file_deleted = shift;
    my $summary = shift;
    my $newrevid;
    my $path = "File:" . $complete_file_name;
    my %hashFiles = get_allowed_file_extensions();
    if (!exists($hashFiles{$extension})) {
        print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
        print STDERR "Check the configuration of file uploads in your mediawiki.\n";
        return $newrevid;
    }
    # Deleting and uploading a file requires a privileged user
    if ($file_deleted) {
        mw_connect_maybe();
        my $query = {
            action => 'delete',
            title => $path,
            reason => $summary
        };
        if (!$mediawiki->edit($query)) {
            print STDERR "Failed to delete file on remote wiki\n";
            print STDERR "Check your permissions on the remote site. Error code:\n";
            print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
            exit 1;
        }
    } else {
        # Don't let perl try to interpret file content as UTF-8 => use "raw"
        my $content = run_git("cat-file blob $new_sha1", "raw");
        if ($content ne "") {
            mw_connect_maybe();
            $mediawiki->{config}->{upload_url} =
                "$url/index.php/Special:Upload";
            $mediawiki->edit({
                action => 'upload',
                filename => $complete_file_name,
                comment => $summary,
                file => [undef,
                         $complete_file_name,
                         Content => $content],
                ignorewarnings => 1,
            }, {
                skip_encoding => 1
            } ) || die $mediawiki->{error}->{code} . ':'
                . $mediawiki->{error}->{details};
            my $last_file_page = $mediawiki->get_page({title => $path});
            $newrevid = $last_file_page->{revid};
            print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
        } else {
            print STDERR "Empty file $complete_file_name not pushed.\n";
        }
    }
    return $newrevid;
}

sub mw_push_file {
    my $diff_info = shift;
    # $diff_info contains a string in this format:
    # 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
    my @diff_info_split = split(/[ \t]/, $diff_info);

    # Filename, including .mw extension
    my $complete_file_name = shift;
    # Commit message
    my $summary = shift;
    # MediaWiki revision number. Keep the previous one by default,
    # in case there's no edit to perform.
    my $oldrevid = shift;
    my $newrevid;

    if ($summary eq EMPTY_MESSAGE) {
        $summary = '';
    }

    my $new_sha1 = $diff_info_split[3];
    my $old_sha1 = $diff_info_split[2];
    my $page_created = ($old_sha1 eq NULL_SHA1);
    my $page_deleted = ($new_sha1 eq NULL_SHA1);
    $complete_file_name = mediawiki_clean_filename($complete_file_name);

    my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
    if (!defined($extension)) {
        $extension = "";
    }
    if ($extension eq "mw") {
        my $ns = get_mw_namespace_id_for_page($complete_file_name);
        if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
            print STDERR "Ignoring media file related page: $complete_file_name\n";
            return ($oldrevid, "ok");
        }
        my $file_content;
        if ($page_deleted) {
            # Deleting a page usually requires
            # special privileges. A common
            # convention is to replace the page
            # with this content instead:
            $file_content = DELETED_CONTENT;
        } else {
            $file_content = run_git("cat-file blob $new_sha1");
        }

        mw_connect_maybe();

        my $result = $mediawiki->edit( {
            action => 'edit',
            summary => $summary,
            title => $title,
            basetimestamp => $basetimestamps{$oldrevid},
            text => mediawiki_clean($file_content, $page_created),
        }, {
            skip_encoding => 1 # Helps with names with accented characters
        });
        if (!$result) {
            if ($mediawiki->{error}->{code} == 3) {
                # edit conflicts, considered as non-fast-forward
                print STDERR 'Warning: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details} .
                    ".\n";
                return ($oldrevid, "non-fast-forward");
            } else {
                # Other errors. Shouldn't happen => just die()
                die 'Fatal: Error ' .
                    $mediawiki->{error}->{code} .
                    ' from mediawiki: ' . $mediawiki->{error}->{details};
            }
        }
        $newrevid = $result->{edit}->{newrevid};
        print STDERR "Pushed file: $new_sha1 - $title\n";
    } elsif ($export_media) {
        $newrevid = mw_upload_file($complete_file_name, $new_sha1,
                                   $extension, $page_deleted,
                                   $summary);
    } else {
        print STDERR "Ignoring media file $title\n";
    }
    $newrevid = ($newrevid or $oldrevid);
    return ($newrevid, "ok");
}

sub mw_push {
    # Multiple push statements can follow each other
    my @refsspecs = (shift, get_more_refs("push"));
    my $pushed;
    for my $refspec (@refsspecs) {
        my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
            or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
        if ($force) {
            print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
        }
        if ($local eq "") {
            print STDERR "Cannot delete remote branch on a MediaWiki\n";
            print STDOUT "error $remote cannot delete\n";
            next;
        }
        if ($remote ne "refs/heads/master") {
            print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
            print STDOUT "error $remote only master allowed\n";
            next;
        }
        if (mw_push_revision($local, $remote)) {
            $pushed = 1;
        }
    }

    # Notify Git that the push is done
    print STDOUT "\n";

    if ($pushed && $dumb_push) {
        print STDERR "Just pushed some revisions to MediaWiki.\n";
        print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
        print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
        print STDERR "\n";
        print STDERR "  git pull --rebase\n";
        print STDERR "\n";
    }
}

sub mw_push_revision {
    my $local = shift;
    my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
    my $last_local_revid = get_last_local_revision();
    print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
    my $last_remote_revid = get_last_remote_revision();
    my $mw_revision = $last_remote_revid;

    # Get sha1 of commit pointed by local HEAD
    my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
    # Get sha1 of commit pointed by remotes/$remotename/master
    my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
    chomp($remoteorigin_sha1);

    if ($last_local_revid > 0 &&
        $last_local_revid < $last_remote_revid) {
        return error_non_fast_forward($remote);
    }

    if ($HEAD_sha1 eq $remoteorigin_sha1) {
        # nothing to push
        return 0;
    }

    # Get every commit in between HEAD and refs/remotes/origin/master,
    # including HEAD and refs/remotes/origin/master
    my @commit_pairs = ();
    if ($last_local_revid > 0) {
        my $parsed_sha1 = $remoteorigin_sha1;
        # Find a path from last MediaWiki commit to pushed commit
        print STDERR "Computing path from local to remote ...\n";
        my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
        my %local_ancestry;
        foreach my $line (@local_ancestry) {
            if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
                foreach my $parent (split(' ', $parents)) {
                    $local_ancestry{$parent} = $child;
                }
            } elsif (!$line =~ m/^([a-f0-9]+)/) {
                die "Unexpected output from git rev-list: $line";
            }
        }
        while ($parsed_sha1 ne $HEAD_sha1) {
            my $child = $local_ancestry{$parsed_sha1};
            if (!$child) {
                printf STDERR "Cannot find a path in history from remote commit to last commit\n";
                return error_non_fast_forward($remote);
            }
            push(@commit_pairs, [$parsed_sha1, $child]);
            $parsed_sha1 = $child;
        }
    } else {
        # No remote mediawiki revision. Export the whole
        # history (linearized with --first-parent)
        print STDERR "Warning: no common ancestor, pushing complete history\n";
        my $history = run_git("rev-list --first-parent --children $local");
        my @history = split('\n', $history);
        @history = @history[1..$#history];
        foreach my $line (reverse @history) {
            my @commit_info_split = split(/ |\n/, $line);
            push(@commit_pairs, \@commit_info_split);
        }
    }

    foreach my $commit_info_split (@commit_pairs) {
        my $sha1_child = @{$commit_info_split}[0];
        my $sha1_commit = @{$commit_info_split}[1];
        my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
        # TODO: we could detect renames, and encode them with a #redirect on the wiki.
        # TODO: for now, it's just a delete+add
        my @diff_info_list = split(/\0/, $diff_infos);
        # Keep the subject line of the commit message as mediawiki comment for the revision
        my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
        chomp($commit_msg);
        # Push every blob
        while (@diff_info_list) {
            my $status;
            # git diff-tree -z gives an output like
            # <metadata>\0<filename1>\0
            # <metadata>\0<filename2>\0
            # and we've split on \0.
            my $info = shift(@diff_info_list);
            my $file = shift(@diff_info_list);
            ($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
            if ($status eq "non-fast-forward") {
                # we may already have sent part of the
                # commit to MediaWiki, but it's too
                # late to cancel it. Stop the push in
                # the middle, but still give an
                # accurate error message.
                return error_non_fast_forward($remote);
            }
            if ($status ne "ok") {
                die("Unknown error from mw_push_file()");
            }
        }
        unless ($dumb_push) {
            run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
            run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
        }
    }

    print STDOUT "ok $remote\n";
    return 1;
}

sub get_allowed_file_extensions {
    mw_connect_maybe();

    my $query = {
        action => 'query',
        meta => 'siteinfo',
        siprop => 'fileextensions'
    };
    my $result = $mediawiki->api($query);
    my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
    my %hashFile = map { $_ => 1 } @file_extensions;

    return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
    mw_connect_maybe();
    my $name = shift;

    if (!exists $namespace_id{$name}) {
        # Look at the configuration file to see if the record for that
        # namespace is already cached. Namespaces are stored in the form
        # "Name_of_namespace:Id_namespace", e.g. "File:6".
        my @temp = split(/[\n]/, run_git("config --get-all remote."
                                         . $remotename . ".namespaceCache"));
        chomp(@temp);
        foreach my $ns (@temp) {
            my ($n, $id) = split(/:/, $ns);
            if ($id eq 'notANameSpace') {
                $namespace_id{$n} = {is_namespace => 0};
            } else {
                $namespace_id{$n} = {is_namespace => 1, id => $id};
            }
            $cached_mw_namespace_id{$n} = 1;
        }
    }

    if (!exists $namespace_id{$name}) {
        print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
        # NS not found => get namespace id from MW and store it in
        # the configuration file.
        my $query = {
            action => 'query',
            meta => 'siteinfo',
            siprop => 'namespaces'
        };
        my $result = $mediawiki->api($query);

        while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
            if (defined($ns->{id}) && defined($ns->{canonical})) {
                $namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
                if ($ns->{'*'}) {
                    # alias (e.g. French Fichier: as alias for canonical File:)
                    $namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
                }
            }
        }
    }

    my $ns = $namespace_id{$name};
    my $id;

    unless (defined $ns) {
        print STDERR "No such namespace $name on MediaWiki.\n";
        $ns = {is_namespace => 0};
        $namespace_id{$name} = $ns;
    }

    if ($ns->{is_namespace}) {
        $id = $ns->{id};
    }

    # Store "notANameSpace" as a special value for nonexistent namespaces
    my $store_id = ($id || 'notANameSpace');

    # Store explicitly requested namespaces on disk
    if (!exists $cached_mw_namespace_id{$name}) {
        run_git("config --add remote." . $remotename
                . ".namespaceCache \"" . $name . ":" . $store_id . "\"");
        $cached_mw_namespace_id{$name} = 1;
    }
    return $id;
}

sub get_mw_namespace_id_for_page {
    if (my ($namespace) = $_[0] =~ /^([^:]*):/) {
        return get_mw_namespace_id($namespace);
    } else {
        return;
    }
}