[PATCH] fix scalability problems with git-deltafy-script
[gitweb.git] / git-deltafy-script
index 21a95692ff3270c35a230c3046dee018109aff79..476d8796ecbb60688a91888f311ac6074eead45b 100755 (executable)
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Example script to deltafy an entire GIT repository based on the commit list.
+# Example script to deltify an entire GIT repository based on the commit list.
 # The most recent version of a file is the reference and previous versions
 # are made delta against the best earlier version available. And so on for
 # successive versions going back in time.  This way the increasing delta
 
 set -e
 
-depth=
-[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2
+max_depth=
+[ "$1" == "-d" ] && max_depth="--max-depth=$2" && shift 2
+
+overlap=30
+max_behind="--max-behind=$overlap"
 
 function process_list() {
        if [ "$list" ]; then
                echo "Processing $curr_file"
-               echo "$head $list" | xargs git-mkdelta $depth --max-behind=30 -v
+               echo "$list" | xargs git-mkdelta $max_depth $max_behind -v
        fi
 }
 
+rev_list=""
 curr_file=""
 
 git-rev-list HEAD |
-git-diff-tree -r -t --stdin |
-awk '/^:/ { if ($5 == "M" || $5 == "N") print $4, $6;
-            if ($5 == "M") print $3, $6 }' |
-LC_ALL=C sort -s -k 2 | uniq |
-while read sha1 file; do
-       if [ "$file" == "$curr_file" ]; then
-               list="$list $sha1"
-       else
-               process_list
-               curr_file="$file"
-               list=""
-               head="$sha1"
-       fi
+while true; do
+       # Let's batch revisions into groups of 1000 to give it a chance to
+       # scale with repositories containing long revision lists.  We also
+       # overlap with the previous batch the size of mkdelta's look behind
+       # value in order to account for the processing discontinuity.
+       rev_list="$(echo -e -n "$rev_list" | tail --lines=$overlap)"
+       for i in $(seq 1000); do
+               read rev || break
+               rev_list="$rev_list$rev\n"
+       done
+       echo -e -n "$rev_list" |
+       git-diff-tree -r -t --stdin |
+       awk '/^:/ { if ($5 == "M") printf "%s %s\n%s %s\n", $4, $6, $3, $6 }' |
+       LC_ALL=C sort -s -k 2 | uniq |
+       while read sha1 file; do
+               if [ "$file" == "$curr_file" ]; then
+                       list="$list $sha1"
+               else
+                       process_list
+                       curr_file="$file"
+                       list="$sha1"
+               fi
+       done
+       [ "$rev" ] || break
 done
 process_list
 
 curr_file="root directory"
-head=""
 list="$(
        git-rev-list HEAD |
        while read commit; do