git-pickaxe: get rid of wasteful find_origin().
authorJunio C Hamano <junkio@cox.net>
Sat, 21 Oct 2006 09:56:33 +0000 (02:56 -0700)
committerJunio C Hamano <junkio@cox.net>
Sat, 21 Oct 2006 09:56:33 +0000 (02:56 -0700)
After finding out which path in the parent to scan to pass
blames, using get_tree_entry() to extract the blob information
again was quite wasteful, since diff-tree already gave us that
information. Separate the function to create an origin out as
get_origin().

You'll never know what is more efficient unless you try and/or
think hard. I somehow thought that extracting one known path
out of commit's tree is cheaper than running a diff-tree for the
current path between the commit and its parent, but it is not
the case. In real, non-toy projects, most commits do not touch
the path you are interested in, and if the path is a few levels
away from the toplevel, whole-subdirectory comparison logic
diff-tree allows us to skip opening lower subdirectories.

This commit rewrites find_origin() function to use a single-path
diff-tree to see if the parent has the same blob as the current
suspect, which is cheaper than extracting the blob information
using get_tree_entry() and comparing it with what the current
suspect has. This shaves about 6% overhead when annotating
kernel/sched.c in the Linux kernel repository on my machine.
The saving rises to 25% for arch/i386/kernel/Makefile.

Signed-off-by: Junio C Hamano <junkio@cox.net>
builtin-pickaxe.c
index 3ab87efac7171e690f227ed7315fb0c22c2b7357..cf474b0d11e1508b7f885f3886abfdeaeee30b0c 100644 (file)
@@ -140,48 +140,103 @@ static void coalesce(struct scoreboard *sb)
        }
 }
 
-static void free_origin(struct origin *o)
+static struct origin *get_origin(struct scoreboard *sb,
+                                struct commit *commit,
+                                const char *path)
 {
-       free(o);
-}
-
-static struct origin *find_origin(struct scoreboard *sb,
-                                 struct commit *commit,
-                                 const char *path)
-{
-       struct blame_entry *ent;
+       struct blame_entry *e;
        struct origin *o;
-       unsigned mode;
-       char type[10];
 
-       for (ent = sb->ent; ent; ent = ent->next) {
-               if (ent->suspect->commit == commit &&
-                   !strcmp(ent->suspect->path, path))
-                       return ent->suspect;
+       for (e = sb->ent; e; e = e->next) {
+               if (e->suspect->commit == commit &&
+                   !strcmp(e->suspect->path, path))
+                       return e->suspect;
        }
-
        o = xcalloc(1, sizeof(*o) + strlen(path) + 1);
        o->commit = commit;
        strcpy(o->path, path);
-       if (get_tree_entry(commit->object.sha1, path, o->blob_sha1, &mode))
-               goto err_out;
-       if (sha1_object_info(o->blob_sha1, type, NULL) ||
-           strcmp(type, blob_type))
-               goto err_out;
        return o;
- err_out:
-       free_origin(o);
-       return NULL;
 }
 
-static struct origin *find_rename(struct scoreboard *sb,
+static int fill_blob_sha1(struct origin *origin)
+{
+       unsigned mode;
+       char type[10];
+
+       if (!is_null_sha1(origin->blob_sha1))
+               return 0;
+       if (get_tree_entry(origin->commit->object.sha1,
+                          origin->path,
+                          origin->blob_sha1, &mode))
+               goto error_out;
+       if (sha1_object_info(origin->blob_sha1, type, NULL) ||
+           strcmp(type, blob_type))
+               goto error_out;
+       return 0;
+ error_out:
+       hashclr(origin->blob_sha1);
+       return -1;
+}
+
+static struct origin *find_origin(struct scoreboard *sb,
                                  struct commit *parent,
                                  struct origin *origin)
 {
        struct origin *porigin = NULL;
        struct diff_options diff_opts;
        int i;
-       const char *paths[1];
+       const char *paths[2];
+
+       /* See if the origin->path is different between parent
+        * and origin first.  Most of the time they are the
+        * same and diff-tree is fairly efficient about this.
+        */
+       diff_setup(&diff_opts);
+       diff_opts.recursive = 1;
+       diff_opts.detect_rename = 0;
+       diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
+       paths[0] = origin->path;
+       paths[1] = NULL;
+
+       diff_tree_setup_paths(paths, &diff_opts);
+       if (diff_setup_done(&diff_opts) < 0)
+               die("diff-setup");
+       diff_tree_sha1(parent->tree->object.sha1,
+                      origin->commit->tree->object.sha1,
+                      "", &diff_opts);
+       diffcore_std(&diff_opts);
+
+       /* It is either one entry that says "modified", or "created",
+        * or nothing.
+        */
+       if (!diff_queued_diff.nr) {
+               /* The path is the same as parent */
+               porigin = get_origin(sb, parent, origin->path);
+               hashcpy(porigin->blob_sha1, origin->blob_sha1);
+       }
+       else if (diff_queued_diff.nr != 1)
+               die("internal error in pickaxe::find_origin");
+       else {
+               struct diff_filepair *p = diff_queued_diff.queue[0];
+               switch (p->status) {
+               default:
+                       die("internal error in pickaxe::find_origin (%c)",
+                           p->status);
+               case 'M':
+                       porigin = get_origin(sb, parent, origin->path);
+                       hashcpy(porigin->blob_sha1, p->one->sha1);
+                       break;
+               case 'A':
+               case 'T':
+                       /* Did not exist in parent, or type changed */
+                       break;
+               }
+       }
+       diff_flush(&diff_opts);
+       if (porigin)
+               return porigin;
+
+       /* Otherwise we would look for a rename */
 
        diff_setup(&diff_opts);
        diff_opts.recursive = 1;
@@ -191,16 +246,17 @@ static struct origin *find_rename(struct scoreboard *sb,
        diff_tree_setup_paths(paths, &diff_opts);
        if (diff_setup_done(&diff_opts) < 0)
                die("diff-setup");
-       diff_tree_sha1(origin->commit->tree->object.sha1,
-                      parent->tree->object.sha1,
+       diff_tree_sha1(parent->tree->object.sha1,
+                      origin->commit->tree->object.sha1,
                       "", &diff_opts);
        diffcore_std(&diff_opts);
 
        for (i = 0; i < diff_queued_diff.nr; i++) {
                struct diff_filepair *p = diff_queued_diff.queue[i];
                if ((p->status == 'R' || p->status == 'C') &&
-                   !strcmp(p->one->path, origin->path)) {
-                       porigin = find_origin(sb, parent, p->two->path);
+                   !strcmp(p->two->path, origin->path)) {
+                       porigin = get_origin(sb, parent, p->one->path);
+                       hashcpy(porigin->blob_sha1, p->one->sha1);
                        break;
                }
        }
@@ -705,6 +761,15 @@ static int find_copy_in_parent(struct scoreboard *sb,
                       "", &diff_opts);
        diffcore_std(&diff_opts);
 
+
+       /*
+        * NEEDSWORK: This loop is wasteful in that it opens the same
+        * blob in the parent number of times.  We should swap the
+        * loop inside out, which would require keeping track of
+        * "best blame so far" for blame entries that the current
+        * "target" is being suspected.
+        */
+
        for (e = sb->ent; e; e = e->next) {
                struct blame_entry split[3];
                if (e->guilty || cmp_suspect(e->suspect, target))
@@ -724,8 +789,8 @@ static int find_copy_in_parent(struct scoreboard *sb,
                        if (porigin && !strcmp(p->one->path, porigin->path))
                                /* find_move already dealt with this path */
                                continue;
-                       norigin = find_origin(sb, parent, p->one->path);
-
+                       norigin = get_origin(sb, parent, p->one->path);
+                       hashcpy(norigin->blob_sha1, p->one->sha1);
                        blob = read_sha1_file(norigin->blob_sha1, type,
                                              (unsigned long *) &file_p.size);
                        file_p.ptr = blob;
@@ -735,6 +800,7 @@ static int find_copy_in_parent(struct scoreboard *sb,
                        }
                        find_copy_in_blob(sb, e, norigin, this, &file_p);
                        copy_split_if_better(sb, split, this);
+                       free(blob);
                }
                if (split[1].suspect &&
                    blame_copy_score < ent_score(sb, &split[1]))
@@ -762,9 +828,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt)
 
                if (parse_commit(p))
                        continue;
-               porigin = find_origin(sb, parent->item, origin->path);
-               if (!porigin)
-                       porigin = find_rename(sb, parent->item, origin);
+               porigin = find_origin(sb, parent->item, origin);
                if (!porigin)
                        continue;
                if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) {
@@ -1423,8 +1487,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix)
         */
        prepare_revision_walk(&revs);
 
-       o = find_origin(&sb, sb.final, path);
-       if (!o)
+       o = get_origin(&sb, sb.final, path);
+       if (fill_blob_sha1(o))
                die("no such path %s in %s", path, final_commit_name);
 
        sb.final_buf = read_sha1_file(o->blob_sha1, type, &sb.final_buf_size);