+sub get_all_mediafiles {
+	my $pages = shift;
+	# Media files live in their own "File" namespace, and a single
+	# 'allpages' API request can only cover one namespace, so they
+	# need a dedicated query.  Fills $pages keyed by page title.
+	my $mw_pages = $mediawiki->list({
+		action => 'query',
+		list => 'allpages',
+		apnamespace => get_mw_namespace_id("File"),
+		aplimit => 'max'
+	});
+	if (!defined($mw_pages)) {
+		print STDERR "fatal: could not get the list of pages for media files.\n";
+		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
+		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
+		exit 1;
+	}
+	# Index every returned page record by its title.
+	$pages->{$_->{title}} = $_ for @{$mw_pages};
+}
+
+sub get_linked_mediafiles {
+	my $pages = shift;
+	my @titles = map $_->{title}, values(%{$pages});
+
+	# The query is split in small batches because of the MW API limit of
+	# the number of links to be returned (500 links max).
+	my $batch = 10;
+	while (@titles) {
+		# Consume at most $batch titles per request.  (The previous
+		# slice 0..$batch actually took $batch+1 elements; splice
+		# keeps the batch size honest and shortens @titles itself.)
+		my @slice = splice(@titles, 0, $batch);
+
+		# pattern 'page1|page2|...' required by the API
+		my $mw_titles = join('|', @slice);
+
+		# Media files could be included or linked from
+		# a page, get all related
+		my $query = {
+			action => 'query',
+			prop => 'links|images',
+			titles => $mw_titles,
+			plnamespace => get_mw_namespace_id("File"),
+			pllimit => 'max'
+		};
+		my $result = $mediawiki->api($query);
+
+		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
+			# Collect media titles for this page; named so it does
+			# not shadow the outer @titles work list.
+			my @media_titles;
+			if (defined($page->{links})) {
+				my @link_titles = map $_->{title}, @{$page->{links}};
+				push(@media_titles, @link_titles);
+			}
+			if (defined($page->{images})) {
+				my @image_titles = map $_->{title}, @{$page->{images}};
+				push(@media_titles, @image_titles);
+			}
+			if (@media_titles) {
+				get_mw_first_pages(\@media_titles, \%{$pages});
+			}
+		}
+	}
+}
+
+sub get_mw_mediafile_for_page_revision {
+	# Name of the file on Wiki, with the prefix.
+	my $mw_filename = shift;
+	my $timestamp = shift;
+	my %mediafile;
+
+	# Search if on MediaWiki exists a media file with given
+	# timestamp. In that case download the file.
+	my $query = {
+		action => 'query',
+		prop => 'imageinfo',
+		titles => $mw_filename,
+		iistart => $timestamp,
+		iiend => $timestamp,
+		iiprop => 'timestamp|archivename|url',
+		iilimit => 1
+	};
+	my $result = $mediawiki->api($query);
+
+	my ($fileid, $file) = each ( %{$result->{query}->{pages}} );
+	# If not defined it means there is no revision of the file for
+	# given timestamp.
+	if (defined($file->{imageinfo})) {
+		# Get real name of media file by stripping the namespace
+		# prefix ("File:", "Image:", or a localized equivalent) up
+		# to the first colon.  The old code assumed the prefix was
+		# exactly 5 or 6 characters, mangling any other prefix.
+		my $filename = $mw_filename;
+		$filename =~ s/^[^:]*://;
+		$mediafile{title} = $filename;
+
+		my $fileinfo = pop(@{$file->{imageinfo}});
+		$mediafile{timestamp} = $fileinfo->{timestamp};
+		# If this is an old version of the file, the file has to be
+		# obtained from the archive. Otherwise it can be downloaded
+		# by MediaWiki API download() function.
+		if (defined($fileinfo->{archivename})) {
+			$mediafile{content} = download_mw_mediafile_from_archive($fileinfo->{url});
+		} else {
+			$mediafile{content} = download_mw_mediafile($mw_filename);
+		}
+	}
+	return %mediafile;
+}
+
+sub download_mw_mediafile_from_archive {
+	my $url = shift;
+	my $file;
+
+	# Fetch an archived (old) revision of a media file directly by
+	# URL, since the MediaWiki API download() only serves the
+	# current revision.
+	my $ua = LWP::UserAgent->new;
+	my $response = $ua->get($url);
+	# Check is_success, not code: code() is non-zero for *every*
+	# HTTP status (including 404/500), so the old test treated
+	# error pages as valid file content.
+	if ($response->is_success) {
+		$file = $response->decoded_content;
+	} else {
+		print STDERR "Error downloading a file from archive.\n";
+	}
+
+	return $file;
+}
+
+sub download_mw_mediafile {
+	my $filename = shift;
+
+	# Point the API object at the wiki so download() knows where to
+	# fetch from.
+	$mediawiki->{config}->{files_url} = $url;
+
+	my $file_content = $mediawiki->download( { title => $filename } );
+	# Guard clauses: undef means the download itself failed, an
+	# empty string means the wiki has no such file; both are fatal.
+	if (!defined($file_content)) {
+		print STDERR "\tFile \'$filename\' could not be downloaded.\n";
+		exit 1;
+	}
+	if ($file_content eq "") {
+		print STDERR "\tFile \'$filename\' does not exist on the wiki.\n";
+		exit 1;
+	}
+	return $file_content;
+}
+