Merge branch 'sp/stream-clean-filter'
authorJunio C Hamano <gitster@pobox.com>
Wed, 8 Oct 2014 20:05:32 +0000 (13:05 -0700)
committerJunio C Hamano <gitster@pobox.com>
Wed, 8 Oct 2014 20:05:32 +0000 (13:05 -0700)
When running a required clean filter, we do not have to mmap the
original before feeding the filter. Instead, stream the file
contents directly to the filter and process its output.

* sp/stream-clean-filter:
sha1_file: don't convert off_t to size_t too early to avoid potential die()
convert: stream from fd to required clean filter to reduce used address space
copy_fd(): do not close the input file descriptor
mmap_limit: introduce GIT_MMAP_LIMIT to allow testing expected mmap size
memory_limit: use git_env_ulong() to parse GIT_ALLOC_LIMIT
config.c: add git_env_ulong() to parse environment variable
convert: drop arguments other than 'path' from would_convert_to_git()

cache.h
config.c
convert.c
convert.h
copy.c
lockfile.c
sha1_file.c
t/t0021-conversion.sh
t/t1050-large.sh
wrapper.c
diff --git a/cache.h b/cache.h
index 8206039cfe9545b4f6eb6757dfd62bf468b3be52..3e6a914dba74419c131f75b86459b4b7cea560ea 100644 (file)
--- a/cache.h
+++ b/cache.h
@@ -1324,6 +1324,7 @@ extern int git_config_rename_section_in_file(const char *, const char *, const c
 extern const char *git_etc_gitconfig(void);
 extern int check_repository_format_version(const char *var, const char *value, void *cb);
 extern int git_env_bool(const char *, int);
+extern unsigned long git_env_ulong(const char *, unsigned long);
 extern int git_config_system(void);
 extern int config_error_nonbool(const char *);
 #if defined(__GNUC__)
index a677eb6c599ddb39f1b4bb7ac44a59bc167fb3a7..039647d247e0ac1e3a523800bbb90c86d89354de 100644 (file)
--- a/config.c
+++ b/config.c
@@ -1139,12 +1139,28 @@ const char *git_etc_gitconfig(void)
        return system_wide;
 }
 
+/*
+ * Parse environment variable 'k' as a boolean (in various
+ * possible spellings); if missing, use the default value 'def'.
+ */
 int git_env_bool(const char *k, int def)
 {
        const char *v = getenv(k);
        return v ? git_config_bool(k, v) : def;
 }
 
+/*
+ * Parse environment variable 'k' as ulong with possibly a unit
+ * suffix; if missing, use the default value 'val'.
+ */
+unsigned long git_env_ulong(const char *k, unsigned long val)
+{
+       const char *v = getenv(k);
+       if (v && !git_parse_ulong(v, &val))
+               die("failed to parse %s", k);
+       return val;
+}
+
 int git_config_system(void)
 {
        return !git_env_bool("GIT_CONFIG_NOSYSTEM", 0);
index aa7a139bcf7f3d33511e106545e6d8989a0ffe35..9a5612e93da2058f64bcec20cfd474a8b6b2d223 100644 (file)
--- a/convert.c
+++ b/convert.c
@@ -312,11 +312,12 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
 struct filter_params {
        const char *src;
        unsigned long size;
+       int fd;
        const char *cmd;
        const char *path;
 };
 
-static int filter_buffer(int in, int out, void *data)
+static int filter_buffer_or_fd(int in, int out, void *data)
 {
        /*
         * Spawn cmd and feed the buffer contents through its stdin.
@@ -354,7 +355,12 @@ static int filter_buffer(int in, int out, void *data)
 
        sigchain_push(SIGPIPE, SIG_IGN);
 
-       write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
+       if (params->src) {
+               write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
+       } else {
+               write_err = copy_fd(params->fd, child_process.in);
+       }
+
        if (close(child_process.in))
                write_err = 1;
        if (write_err)
@@ -370,7 +376,7 @@ static int filter_buffer(int in, int out, void *data)
        return (write_err || status);
 }
 
-static int apply_filter(const char *path, const char *src, size_t len,
+static int apply_filter(const char *path, const char *src, size_t len, int fd,
                         struct strbuf *dst, const char *cmd)
 {
        /*
@@ -391,11 +397,12 @@ static int apply_filter(const char *path, const char *src, size_t len,
                return 1;
 
        memset(&async, 0, sizeof(async));
-       async.proc = filter_buffer;
+       async.proc = filter_buffer_or_fd;
        async.data = &params;
        async.out = -1;
        params.src = src;
        params.size = len;
+       params.fd = fd;
        params.cmd = cmd;
        params.path = path;
 
@@ -746,6 +753,25 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
        }
 }
 
+int would_convert_to_git_filter_fd(const char *path)
+{
+       struct conv_attrs ca;
+
+       convert_attrs(&ca, path);
+       if (!ca.drv)
+               return 0;
+
+       /*
+        * Apply a filter to an fd only if the filter is required to succeed.
+        * We must die if the filter fails, because the original data before
+        * filtering is not available.
+        */
+       if (!ca.drv->required)
+               return 0;
+
+       return apply_filter(path, NULL, 0, -1, NULL, ca.drv->clean);
+}
+
 int convert_to_git(const char *path, const char *src, size_t len,
                    struct strbuf *dst, enum safe_crlf checksafe)
 {
@@ -760,7 +786,7 @@ int convert_to_git(const char *path, const char *src, size_t len,
                required = ca.drv->required;
        }
 
-       ret |= apply_filter(path, src, len, dst, filter);
+       ret |= apply_filter(path, src, len, -1, dst, filter);
        if (!ret && required)
                die("%s: clean filter '%s' failed", path, ca.drv->name);
 
@@ -777,6 +803,23 @@ int convert_to_git(const char *path, const char *src, size_t len,
        return ret | ident_to_git(path, src, len, dst, ca.ident);
 }
 
+void convert_to_git_filter_fd(const char *path, int fd, struct strbuf *dst,
+                             enum safe_crlf checksafe)
+{
+       struct conv_attrs ca;
+       convert_attrs(&ca, path);
+
+       assert(ca.drv);
+       assert(ca.drv->clean);
+
+       if (!apply_filter(path, NULL, 0, fd, dst, ca.drv->clean))
+               die("%s: clean filter '%s' failed", path, ca.drv->name);
+
+       ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr);
+       crlf_to_git(path, dst->buf, dst->len, dst, ca.crlf_action, checksafe);
+       ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
+}
+
 static int convert_to_working_tree_internal(const char *path, const char *src,
                                            size_t len, struct strbuf *dst,
                                            int normalizing)
@@ -810,7 +853,7 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
                }
        }
 
-       ret_filter = apply_filter(path, src, len, dst, filter);
+       ret_filter = apply_filter(path, src, len, -1, dst, filter);
        if (!ret_filter && required)
                die("%s: smudge filter %s failed", path, ca.drv->name);
 
index 0c2143c152734feb3bdc7ba44b61b8327f02881c..d9d853cd3d2f6d94965a2c2fc8e587386402a83b 100644 (file)
--- a/convert.h
+++ b/convert.h
@@ -40,11 +40,15 @@ extern int convert_to_working_tree(const char *path, const char *src,
                                   size_t len, struct strbuf *dst);
 extern int renormalize_buffer(const char *path, const char *src, size_t len,
                              struct strbuf *dst);
-static inline int would_convert_to_git(const char *path, const char *src,
-                                      size_t len, enum safe_crlf checksafe)
+static inline int would_convert_to_git(const char *path)
 {
-       return convert_to_git(path, src, len, NULL, checksafe);
+       return convert_to_git(path, NULL, 0, NULL, 0);
 }
+/* Precondition: would_convert_to_git_filter_fd(path) == true */
+extern void convert_to_git_filter_fd(const char *path, int fd,
+                                    struct strbuf *dst,
+                                    enum safe_crlf checksafe);
+extern int would_convert_to_git_filter_fd(const char *path);
 
 /*****************************************************************
  *
diff --git a/copy.c b/copy.c
index a7f58fd905b31b2634f74580090ec664a640e279..f2970ec46282a07593366c598f688e5d70c05c83 100644 (file)
--- a/copy.c
+++ b/copy.c
@@ -4,34 +4,17 @@ int copy_fd(int ifd, int ofd)
 {
        while (1) {
                char buffer[8192];
-               char *buf = buffer;
                ssize_t len = xread(ifd, buffer, sizeof(buffer));
                if (!len)
                        break;
                if (len < 0) {
-                       int read_error = errno;
-                       close(ifd);
                        return error("copy-fd: read returned %s",
-                                    strerror(read_error));
-               }
-               while (len) {
-                       int written = xwrite(ofd, buf, len);
-                       if (written > 0) {
-                               buf += written;
-                               len -= written;
-                       }
-                       else if (!written) {
-                               close(ifd);
-                               return error("copy-fd: write returned 0");
-                       } else {
-                               int write_error = errno;
-                               close(ifd);
-                               return error("copy-fd: write returned %s",
-                                            strerror(write_error));
-                       }
+                                    strerror(errno));
                }
+               if (write_in_full(ofd, buffer, len) < 0)
+                       return error("copy-fd: write returned %s",
+                                    strerror(errno));
        }
-       close(ifd);
        return 0;
 }
 
@@ -60,6 +43,7 @@ int copy_file(const char *dst, const char *src, int mode)
                return fdo;
        }
        status = copy_fd(fdi, fdo);
+       close(fdi);
        if (close(fdo) != 0)
                return error("%s: close error: %s", dst, strerror(errno));
 
index 2a800cef33ed7fb1ba7d3499459eccd824ab33e4..d34a96df4f859feeaa7597abba374128ff9dc598 100644 (file)
@@ -224,8 +224,11 @@ int hold_lock_file_for_append(struct lock_file *lk, const char *path, int flags)
        } else if (copy_fd(orig_fd, fd)) {
                if (flags & LOCK_DIE_ON_ERROR)
                        exit(128);
+               close(orig_fd);
                close(fd);
                return -1;
+       } else {
+               close(orig_fd);
        }
        return fd;
 }
index c08c0cbea805b38104504b9b51266949affb6991..6f18c22ab186ba2214b6bedcec0cddb92e750c51 100644 (file)
@@ -663,10 +663,26 @@ void release_pack_memory(size_t need)
                ; /* nothing */
 }
 
+static void mmap_limit_check(size_t length)
+{
+       static size_t limit = 0;
+       if (!limit) {
+               limit = git_env_ulong("GIT_MMAP_LIMIT", 0);
+               if (!limit)
+                       limit = SIZE_MAX;
+       }
+       if (length > limit)
+               die("attempting to mmap %"PRIuMAX" over limit %"PRIuMAX,
+                   (uintmax_t)length, (uintmax_t)limit);
+}
+
 void *xmmap(void *start, size_t length,
        int prot, int flags, int fd, off_t offset)
 {
-       void *ret = mmap(start, length, prot, flags, fd, offset);
+       void *ret;
+
+       mmap_limit_check(length);
+       ret = mmap(start, length, prot, flags, fd, offset);
        if (ret == MAP_FAILED) {
                if (!length)
                        return NULL;
@@ -3076,6 +3092,29 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
        return ret;
 }
 
+static int index_stream_convert_blob(unsigned char *sha1, int fd,
+                                    const char *path, unsigned flags)
+{
+       int ret;
+       const int write_object = flags & HASH_WRITE_OBJECT;
+       struct strbuf sbuf = STRBUF_INIT;
+
+       assert(path);
+       assert(would_convert_to_git_filter_fd(path));
+
+       convert_to_git_filter_fd(path, fd, &sbuf,
+                                write_object ? safe_crlf : SAFE_CRLF_FALSE);
+
+       if (write_object)
+               ret = write_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
+                                     sha1);
+       else
+               ret = hash_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
+                                    sha1);
+       strbuf_release(&sbuf);
+       return ret;
+}
+
 static int index_pipe(unsigned char *sha1, int fd, enum object_type type,
                      const char *path, unsigned flags)
 {
@@ -3141,15 +3180,22 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
             enum object_type type, const char *path, unsigned flags)
 {
        int ret;
-       size_t size = xsize_t(st->st_size);
 
-       if (!S_ISREG(st->st_mode))
+       /*
+        * Call xsize_t() only when needed to avoid potentially unnecessary
+        * die() for large files.
+        */
+       if (type == OBJ_BLOB && path && would_convert_to_git_filter_fd(path))
+               ret = index_stream_convert_blob(sha1, fd, path, flags);
+       else if (!S_ISREG(st->st_mode))
                ret = index_pipe(sha1, fd, type, path, flags);
-       else if (size <= big_file_threshold || type != OBJ_BLOB ||
-                (path && would_convert_to_git(path, NULL, 0, 0)))
-               ret = index_core(sha1, fd, size, type, path, flags);
+       else if (st->st_size <= big_file_threshold || type != OBJ_BLOB ||
+                (path && would_convert_to_git(path)))
+               ret = index_core(sha1, fd, xsize_t(st->st_size), type, path,
+                                flags);
        else
-               ret = index_stream(sha1, fd, size, type, path, flags);
+               ret = index_stream(sha1, fd, xsize_t(st->st_size), type, path,
+                                  flags);
        close(fd);
        return ret;
 }
index f890c54d137aab021e6d6a61c5741af872555c1f..ca7d2a630a8442d8812b444708d6f28f2e45fb32 100755 (executable)
@@ -153,17 +153,23 @@ test_expect_success 'filter shell-escaped filenames' '
        :
 '
 
-test_expect_success 'required filter success' '
-       git config filter.required.smudge cat &&
-       git config filter.required.clean cat &&
+test_expect_success 'required filter should filter data' '
+       git config filter.required.smudge ./rot13.sh &&
+       git config filter.required.clean ./rot13.sh &&
        git config filter.required.required true &&
 
        echo "*.r filter=required" >.gitattributes &&
 
-       echo test >test.r &&
+       cat test.o >test.r &&
        git add test.r &&
+
        rm -f test.r &&
-       git checkout -- test.r
+       git checkout -- test.r &&
+       cmp test.o test.r &&
+
+       ./rot13.sh <test.o >expected &&
+       git cat-file blob :test.r >actual &&
+       cmp expected actual
 '
 
 test_expect_success 'required filter smudge failure' '
@@ -190,6 +196,14 @@ test_expect_success 'required filter clean failure' '
        test_must_fail git add test.fc
 '
 
+test_expect_success 'filtering large input to small output should use little memory' '
+       git config filter.devnull.clean "cat >/dev/null" &&
+       git config filter.devnull.required true &&
+       for i in $(test_seq 1 30); do printf "%1048576d" 1; done >30MB &&
+       echo "30MB filter=devnull" >.gitattributes &&
+       GIT_MMAP_LIMIT=1m GIT_ALLOC_LIMIT=1m git add 30MB
+'
+
 test_expect_success EXPENSIVE 'filter large file' '
        git config filter.largefile.smudge cat &&
        git config filter.largefile.clean cat &&
index 05a1e1d270d2f6254658607541f1b114299533e5..f5a91192903b455df3d7d13d8009ea90999405e3 100755 (executable)
@@ -13,7 +13,7 @@ test_expect_success setup '
        echo X | dd of=large2 bs=1k seek=2000 &&
        echo X | dd of=large3 bs=1k seek=2000 &&
        echo Y | dd of=huge bs=1k seek=2500 &&
-       GIT_ALLOC_LIMIT=1500 &&
+       GIT_ALLOC_LIMIT=1500k &&
        export GIT_ALLOC_LIMIT
 '
 
index 25074d71b6ce72066efc02abda74ddfb10f71d6c..5b77d2a1aecd494ea4bb5eb1e6f31afc9c96f82e 100644 (file)
--- a/wrapper.c
+++ b/wrapper.c
@@ -11,19 +11,20 @@ static void (*try_to_free_routine)(size_t size) = do_nothing;
 
 static int memory_limit_check(size_t size, int gentle)
 {
-       static int limit = -1;
-       if (limit == -1) {
-               const char *env = getenv("GIT_ALLOC_LIMIT");
-               limit = env ? atoi(env) * 1024 : 0;
+       static size_t limit = 0;
+       if (!limit) {
+               limit = git_env_ulong("GIT_ALLOC_LIMIT", 0);
+               if (!limit)
+                       limit = SIZE_MAX;
        }
-       if (limit && size > limit) {
+       if (size > limit) {
                if (gentle) {
-                       error("attempting to allocate %"PRIuMAX" over limit %d",
-                             (intmax_t)size, limit);
+                       error("attempting to allocate %"PRIuMAX" over limit %"PRIuMAX,
+                             (uintmax_t)size, (uintmax_t)limit);
                        return -1;
                } else
-                       die("attempting to allocate %"PRIuMAX" over limit %d",
-                           (intmax_t)size, limit);
+                       die("attempting to allocate %"PRIuMAX" over limit %"PRIuMAX,
+                           (uintmax_t)size, (uintmax_t)limit);
        }
        return 0;
 }