contrib / stats / packinfo.plon commit split-index: convert struct split_index to object_id (2182abd)
   1#!/usr/bin/perl
   2#
   3# This tool will print vaguely pretty information about a pack.  It
   4# expects the output of "git verify-pack -v" as input on stdin.
   5#
   6# $ git verify-pack -v | packinfo.pl
   7#
   8# This prints some full-pack statistics; currently "all sizes", "all
   9# path sizes", "tree sizes", "tree path sizes", and "depths".
  10#
  11# * "all sizes" stats are across every object size in the file;
  12#   full sizes for base objects, and delta size for deltas.
   13# * "all path sizes" stats are across all objects' "path sizes".
  14#   A path size is the sum of the size of the delta chain, including the
  15#   base object.  In other words, it's how many bytes need be read to
  16#   reassemble the file from deltas.
  17# * "tree sizes" are object sizes grouped into delta trees.
  18# * "tree path sizes" are path sizes grouped into delta trees.
  19# * "depths" should be obvious.
  20#
  21# When run as:
  22#
  23# $ git verify-pack -v | packinfo.pl -tree
  24#
  25# the trees of objects are output along with the stats.  This looks
  26# like:
  27#
  28#   0 commit 031321c6...      803      803
  29#
  30#   0   blob 03156f21...     1767     1767
  31#   1    blob f52a9d7f...       10     1777
  32#   2     blob a8cc5739...       51     1828
  33#   3      blob 660e90b1...       15     1843
  34#   4       blob 0cb8e3bb...       33     1876
  35#   2     blob e48607f0...      311     2088
  36#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
  37# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
  38#
  39# The first number after the sha1 is the object size, the second
  40# number is the path size.  The statistics are across all objects in
  41# the previous delta tree.  Obviously they are omitted for trees of
  42# one object.
  43#
  44# When run as:
  45#
  46# $ git verify-pack -v | packinfo.pl -tree -filenames
  47#
  48# it adds filenames to the tree.  Getting this information is slow:
  49#
  50#   0   blob 03156f21...     1767     1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142
  51#   1    blob f52a9d7f...       10     1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74
  52#   2     blob a8cc5739...       51     1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0
  53#   3      blob 660e90b1...       15     1843 Documentation/git-lost+found.txt @ master~3222^2~2
  54#   4       blob 0cb8e3bb...       33     1876 Documentation/git-lost+found.txt @ master~3222^2~3
  55#   2     blob e48607f0...      311     2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4
  56#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
  57# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
  58#
  59# When run as:
  60#
  61# $ git verify-pack -v | packinfo.pl -dump
  62#
  63# it prints out "sha1 size pathsize depth" for each sha1 in lexical
  64# order.
  65#
  66# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7
  67# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4
  68# 000182eacf99cde27d5916aa415921924b82972c 499 499 0
  69# ...
  70#
  71# This is handy for comparing two packs.  Adding "-filenames" will add
  72# filenames, as per "-tree -filenames" above.
  73
  74use strict;
  75use Getopt::Long;
  76
  77my $filenames = 0;
  78my $tree = 0;
  79my $dump = 0;
  80GetOptions("tree" => \$tree,
  81           "filenames" => \$filenames,
  82           "dump" => \$dump);
  83
  84my %parents;
  85my %children;
  86my %sizes;
  87my @roots;
  88my %paths;
  89my %types;
  90my @commits;
  91my %names;
  92my %depths;
  93my @depths;
  94
  95while (<STDIN>) {
  96    my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $_);
  97    next unless ($sha1 =~ /^[0-9a-f]{40}$/);
  98    $depths{$sha1} = $depth || 0;
  99    push(@depths, $depth || 0);
 100    push(@commits, $sha1) if ($type eq 'commit');
 101    push(@roots, $sha1) unless $parent;
 102    $parents{$sha1} = $parent;
 103    $types{$sha1} = $type;
 104    push(@{$children{$parent}}, $sha1);
 105    $sizes{$sha1} = $size;
 106}
 107
 108if ($filenames && ($tree || $dump)) {
 109    open(NAMES, "git name-rev --all|");
 110    while (<NAMES>) {
 111        if (/^(\S+)\s+(.*)$/) {
 112            my ($sha1, $name) = ($1, $2);
 113            $names{$sha1} = $name;
 114        }
 115    }
 116    close NAMES;
 117
 118    for my $commit (@commits) {
 119        my $name = $names{$commit};
 120        open(TREE, "git ls-tree -t -r $commit|");
 121        print STDERR "Plumbing tree $name\n";
 122        while (<TREE>) {
 123            if (/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) {
 124                my ($mode, $type, $sha1, $path) = ($1, $2, $3, $4);
 125                $paths{$sha1} = "$path @ $name";
 126            }
 127        }
 128        close TREE;
 129    }
 130}
 131
 132sub stats {
 133    my @data = sort {$a <=> $b} @_;
 134    my $min = $data[0];
 135    my $max = $data[$#data];
 136    my $total = 0;
 137    my $count = scalar @data;
 138    for my $datum (@data) {
 139        $total += $datum;
 140    }
 141    my $mean = $total / $count;
 142    my $median = $data[int(@data / 2)];
 143    my $diff_sum = 0;
 144    for my $datum (@data) {
 145        $diff_sum += ($datum - $mean)**2;
 146    }
 147    my $std_dev = sqrt($diff_sum / $count);
 148    return ($count, $total, $min, $max, $mean, $median, $std_dev);
 149}
 150
 151sub print_stats {
 152    my $name = shift;
 153    my ($count, $total, $min, $max, $mean, $median, $std_dev) = stats(@_);
 154    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
 155           $name, $count, $total, $min, $max, $mean, $median, $std_dev);
 156}
 157
 158my @sizes;
 159my @path_sizes;
 160my @all_sizes;
 161my @all_path_sizes;
 162my %path_sizes;
 163
 164sub dig {
 165    my ($sha1, $depth, $path_size) = @_;
 166    $path_size += $sizes{$sha1};
 167    push(@sizes, $sizes{$sha1});
 168    push(@all_sizes, $sizes{$sha1});
 169    push(@path_sizes, $path_size);
 170    push(@all_path_sizes, $path_size);
 171    $path_sizes{$sha1} = $path_size;
 172    if ($tree) {
 173        printf("%3d%s %6s %s %8d %8d %s\n",
 174               $depth, (" " x $depth), $types{$sha1},
 175               $sha1, $sizes{$sha1}, $path_size, $paths{$sha1});
 176    }
 177    for my $child (@{$children{$sha1}}) {
 178        dig($child, $depth + 1, $path_size);
 179    }
 180}
 181
 182my @tree_sizes;
 183my @tree_path_sizes;
 184
 185for my $root (@roots) {
 186    undef @sizes;
 187    undef @path_sizes;
 188    dig($root, 0, 0);
 189    my ($aa, $sz_total) = stats(@sizes);
 190    my ($bb, $psz_total) = stats(@path_sizes);
 191    push(@tree_sizes, $sz_total);
 192    push(@tree_path_sizes, $psz_total);
 193    if ($tree) {
 194        if (@sizes > 1) {
 195            print_stats("     size", @sizes);
 196            print_stats("path size", @path_sizes);
 197        }
 198        print "\n";
 199    }
 200}
 201
 202if ($dump) {
 203    for my $sha1 (sort keys %sizes) {
 204        print "$sha1 $sizes{$sha1} $path_sizes{$sha1} $depths{$sha1} $paths{$sha1}\n";
 205    }
 206} else {
 207    print_stats("      all sizes", @all_sizes);
 208    print_stats(" all path sizes", @all_path_sizes);
 209    print_stats("     tree sizes", @tree_sizes);
 210    print_stats("tree path sizes", @tree_path_sizes);
 211    print_stats("         depths", @depths);
 212}