[PATCH] fix scalability problems with git-deltafy-script

diff --git a/read-tree.c b/read-tree.c

index ecd40cc393fad70a15ca4efbdab4a2049ea8aefe..0d5ded5026c0a87987938c498e271c1d8cce122c 100644 (file)
--- a/read-tree.c
+++ b/read-tree.c
@@ -143,7 +143,72 @@ static int deleted_entry(struct cache_entry *ce, struct cache_entry *old, struct
         return 1;
  }
  
-static int threeway_merge(struct cache_entry *stages[4], struct cache_entry **dst)
+static int causes_df_conflict(struct cache_entry *ce, int stage,
+                             struct cache_entry **dst_,
+                             struct cache_entry **next_,
+                             int tail)
+{
+       /* Return 1 if an entry at the given stage would collide with
+        * ce as a directory/file pair (one path is a leading directory
+        * of the other), 0 otherwise.  This runs during the merge, when
+        * active_cache[] is in the middle of an overlapping copy, so:
+        * (1) active_cache points at the first (zeroth) entry.
+        * (2) entries before the dst pointer are resolved entries.
+        * (3) entries from the next pointer (inclusive) up to tail
+        *     are the remaining paths to be processed.  There can
+        *     be a gap between dst and next.  Note that next is
+        *     called "src" in the merge_cache() function, and tail
+        *     is the original end of the active_cache array when
+        *     merge_cache() started.
+        * (4) the path corresponding to *ce is not found in (2)
+        *     or (3).  It is in the gap.
+        *
+        *  active_cache -----......+++++++++++++.
+        *                    ^dst  ^next        ^tail
+        */
+       int i, next, dst;
+       const char *path = ce->name;    /* saved now: ce is reused as the loop cursor below */
+       int namelen = ce_namelen(ce);
+
+       next = next_ - active_cache;    /* turn both pointers into array indices */
+       dst = dst_ - active_cache;
+
+       for (i = 0; i < tail; i++) {
+               int entlen, len;
+               const char *one, *two;
+               if (dst <= i && i < next)
+                       continue;       /* inside the gap between dst and next */
+               ce = active_cache[i];
+               if (ce_stage(ce) != stage)
+                       continue;       /* only entries of the requested stage matter */
+               /* If ce->name is a prefix of path, then path is a file
+                * that hangs underneath ce->name, which is bad.
+                * If path is a prefix of ce->name, then it is the
+                * other way around which also is bad.
+                */
+               entlen = ce_namelen(ce);
+               if (namelen == entlen)
+                       continue;       /* equal lengths: neither is a proper prefix */
+               if (namelen < entlen) {
+                       len = namelen;  /* one = shorter name, two = longer name */
+                       one = path;
+                       two = ce->name;
+               } else {
+                       len = entlen;
+                       one = ce->name;
+                       two = path;
+               }
+               if (memcmp(one, two, len))
+                       continue;       /* shorter name is not a prefix of the longer */
+               if (two[len] == '/')    /* prefix ends exactly at a path separator */
+                       return 1;
+       }
+       return 0;
+}
+
+static int threeway_merge(struct cache_entry *stages[4],
+                         struct cache_entry **dst,
+                         struct cache_entry **next, int tail)
  {
         struct cache_entry *old = stages[0];
         struct cache_entry *a = stages[1], *b = stages[2], *c = stages[3];
@@ -156,6 +221,71 @@ static int threeway_merge(struct cache_entry *stages[4], struct cache_entry **ds
                         return -1;
                 return merged_entry_allow_dirty(b, old, dst);
         }
+       /* #2ALT and #3ALT */
+       if (!a && (!!b != !!c)) {
+               /*
+                * The reason we need to worry about directory/file
+                * conflicts only in #2ALT and #3ALT case is this:
+                *
+                * (1) In all other cases where read-tree internally
+                *     resolves a path, we always have such a path in
+                *     *both* stage2 and stage3 when we begin.
+                *     Traditionally, the behaviour has been even
+                *     stricter and we did not resolve a path without
+                *     it initially being in all of stage1, 2, and 3.
+                *
+                * (2) When read-tree finishes, all resolved paths (i.e.
+                *     the paths that are in stage0) must have come from
+                *     either stage2 or stage3.  It is not possible to
+                *     have a stage0 path as a result of a merge if
+                *     neither stage2 nor stage3 had that path.
+                *
+                * (3) It is guaranteed that just after reading the
+                *     stages, each stage cannot have directory/file
+                *     conflicts on its own, because they are populated
+                *     by reading hierarchy of a tree.  Combined with
+                *     (1) and (2) above, this means that no matter what
+                *     combination of paths we take from stage2 and
+                *     stage3 as a result of a merge, they cannot cause
+                *     a directory/file conflict situation (otherwise
+                *     the "guilty" path would have already had such a
+                *     conflict in the original stage, either stage2
+                *     or stage3).  Although its stage2 is synthesized
+                *     by overlaying the current index on top of "our
+                *     head" tree, --emu23 case also has this guarantee,
+                *     by calling add_cache_entry() to create such stage2
+                *     entries.
+                *
+                * (4) Only #2ALT and #3ALT lack the guarantee (1).
+                *     They resolve paths that exist only in stage2
+                *     or stage3.  The stage2 tree may have a file DF
+                *     while stage3 tree may have a file DF/DF.  If
+                *     #2ALT and #3ALT rules happen to apply to both
+                *     of them, we would end up having DF (coming from
+                *     stage2) and DF/DF (from stage3) in the result.
+                *     When we attempt to resolve a path that exists
+                *     only in stage2, we need to make sure there is
+                *     no path that would conflict with it in stage3
+                *     and vice versa.
+                */
+               if (c) { /* #2ALT */
+                       if (!causes_df_conflict(c, 2, dst, next, tail) &&
+                           (!old || same(c, old)))
+                               return merged_entry_allow_dirty(c, old, dst);
+               }
+               else { /* #3ALT */
+                       if (!causes_df_conflict(b, 3, dst, next, tail) &&
+                           (!old || same(b, old)))
+                               return merged_entry_allow_dirty(b, old, dst);
+               }
+               /* otherwise we will apply the original rule */
+       }
+       /* #14ALT */
+       if (a && b && c && same(a, b) && !same(a, c)) {
+               if (old && same(old, c))
+                       return merged_entry_allow_dirty(c, old, dst);
+               /* otherwise the regular rule applies */
+       }
         /*
          * If we have an entry in the index cache ("old"), then we want
          * to make sure that it matches any entries in stage 2 ("first
@@ -186,7 +316,8 @@ static int threeway_merge(struct cache_entry *stages[4], struct cache_entry **ds
   * "carry forward" rule, please see <Documentation/git-read-tree.txt>.
   *
   */
-static int twoway_merge(struct cache_entry **src, struct cache_entry **dst)
+static int twoway_merge(struct cache_entry **src, struct cache_entry **dst,
+                       struct cache_entry **next, int tail)
  {
         struct cache_entry *current = src[0];
         struct cache_entry *oldtree = src[1], *newtree = src[2];
@@ -283,7 +414,8 @@ static void setup_emu23(void)
   * The rule is:
   * - take the stat information from stage0, take the data from stage1
   */
-static int oneway_merge(struct cache_entry **src, struct cache_entry **dst)
+static int oneway_merge(struct cache_entry **src, struct cache_entry **dst,
+                       struct cache_entry **next, int tail)
  {
         struct cache_entry *old = src[0];
         struct cache_entry *a = src[1];
@@ -324,11 +456,12 @@ static void check_updates(struct cache_entry **src, int nr)
         }
  }
  
-typedef int (*merge_fn_t)(struct cache_entry **, struct cache_entry **);
+typedef int (*merge_fn_t)(struct cache_entry **, struct cache_entry **, struct cache_entry **, int);
  
  static void merge_cache(struct cache_entry **src, int nr, merge_fn_t fn)
  {
         struct cache_entry **dst = src;
+       int tail = nr;
  
         while (nr) {
                 int entries;
@@ -346,7 +479,7 @@ static void merge_cache(struct cache_entry **src, int nr, merge_fn_t fn)
                                 break;
                 }
  
-               entries = fn(stages, dst);
+               entries = fn(stages, dst, src, tail);
                 if (entries < 0)
                         reject_merge(name);
                 dst += entries;
@@ -410,6 +543,7 @@ int main(int argc, char **argv)
                         merge = 1;
                         stage = 1;
                         read_cache_unmerged();
+                       continue;
                 }
  
                 /* "-m" stands for "merge", meaning we start in stage 1 */