index-pack: smarter memory usage when resolving deltas
[gitweb.git] / builtin-mailinfo.c
index dacc8ac2d0e63e46dd1aa8fee6ba949bdd319e43..a50ac2256cdbacd76ed44a50804212be07f949db 100644 (file)
@@ -10,6 +10,7 @@
 static FILE *cmitmsg, *patchfile, *fin, *fout;
 
 static int keep_subject;
+/* Set by -b: keep "[...]" subject tags unless they contain "PATCH". */
+static int keep_non_patch_brackets_in_subject;
 static const char *metainfo_charset;
 static struct strbuf line = STRBUF_INIT;
 static struct strbuf name = STRBUF_INIT;
@@ -25,10 +26,15 @@ static enum  {
 static struct strbuf charset = STRBUF_INIT;
 static int patch_lines;
 static struct strbuf **p_hdr_data, **s_hdr_data;
+/* Taken from the mailinfo.scissors config; --scissors / --no-scissors override it. */
+static int use_scissors;
+/* On by default; cleared by --no-inbody-headers. */
+static int use_inbody_headers = 1;
 
 #define MAX_HDR_PARSED 10
 #define MAX_BOUNDARIES 5
 
+/* Forward declaration: handle_from() calls this before its definition. */
+static void cleanup_space(struct strbuf *sb);
+
+
 static void get_sane_name(struct strbuf *out, struct strbuf *name, struct strbuf *email)
 {
        struct strbuf *src = name;
@@ -109,11 +115,19 @@ static void handle_from(const struct strbuf *from)
        strbuf_add(&email, at, el);
        strbuf_remove(&f, at - f.buf, el + (at[el] ? 1 : 0));
 
-       /* The remainder is name.  It could be "John Doe <john.doe@xz>"
-        * or "john.doe@xz (John Doe)", but we have removed the
-        * email part, so trim from both ends, possibly removing
-        * the () pair at the end.
+       /* The remainder is name.  It could be
+        *
+        * - "John Doe <john.doe@xz>"                   (a), or
+        * - "john.doe@xz (John Doe)"                   (b), or
+        * - "John (zzz) Doe <john.doe@xz> (Comment)"   (c)
+        *
+        * but we have removed the email part, so
+        *
+        * - remove extra spaces which could stay after email (case 'c'), and
+        * - trim from both ends, possibly removing the () pair at the end
+        *   (cases 'a' and 'b').
         */
+       cleanup_space(&f);
        strbuf_trim(&f);
        if (f.buf[0] == '(' && f.len && f.buf[f.len - 1] == ')') {
                strbuf_remove(&f, 0, 1);
@@ -182,8 +196,7 @@ static void handle_content_type(struct strbuf *line)
                *content_top = boundary;
                boundary = NULL;
        }
-       if (slurp_attr(line->buf, "charset=", &charset))
-               strbuf_tolower(&charset);
+       slurp_attr(line->buf, "charset=", &charset);
 
        if (boundary) {
                strbuf_release(boundary);
@@ -209,35 +222,41 @@ static int is_multipart_boundary(const struct strbuf *line)
 
+/*
+ * Strip leading noise from a subject line, in place.
+ *
+ * Starting at offset 'at' (initially 0), repeatedly removes:
+ *  - an "Re:" / "re:" prefix (only when more than three bytes remain),
+ *  - blanks, tabs and colons,
+ *  - "[...]" bracketed tags.
+ *
+ * When keep_non_patch_brackets_in_subject is set, a bracketed tag is
+ * removed only if it is at least 7 bytes long and contains "PATCH";
+ * any other tag is kept and 'at' is advanced past it, so the cleanup
+ * continues with the text that follows the tag.
+ *
+ * Scanning stops at the first byte that is none of the above, and
+ * strbuf_trim() is applied to the result.
+ */
 static void cleanup_subject(struct strbuf *subject)
 {
-       char *pos;
-       size_t remove;
-       while (subject->len) {
-               switch (*subject->buf) {
+       size_t at = 0;
+
+       while (at < subject->len) {
+               char *pos;
+               size_t remove;
+
+               switch (subject->buf[at]) {
                case 'r': case 'R':
-                       if (subject->len <= 3)
+                       if (subject->len <= at + 3)
                                break;
-                       if (!memcmp(subject->buf + 1, "e:", 2)) {
-                               strbuf_remove(subject, 0, 3);
+                       if (!memcmp(subject->buf + at + 1, "e:", 2)) {
+                               strbuf_remove(subject, at, 3);
                                continue;
                        }
+                       at++;
                        break;
                case ' ': case '\t': case ':':
-                       strbuf_remove(subject, 0, 1);
+                       strbuf_remove(subject, at, 1);
                        continue;
                case '[':
-                       if ((pos = strchr(subject->buf, ']'))) {
-                               remove = pos - subject->buf;
-                               if (remove <= (subject->len - remove) * 2) {
-                                       strbuf_remove(subject, 0, remove + 1);
-                                       continue;
-                               }
-                       } else
-                               strbuf_remove(subject, 0, 1);
-                       break;
+                       pos = strchr(subject->buf + at, ']');
+                       if (!pos)
+                               break;
+                       /*
+                        * Length of the "[...]" tag that starts at 'at',
+                        * both brackets included.  'pos' points into the
+                        * buffer, so the start of the tag (buf + at) has
+                        * to be subtracted, not added; otherwise the
+                        * length overshoots as soon as 'at' is non-zero.
+                        */
+                       remove = pos - (subject->buf + at) + 1;
+                       if (!keep_non_patch_brackets_in_subject ||
+                           (7 <= remove &&
+                            memmem(subject->buf + at, remove, "PATCH", 5)))
+                               strbuf_remove(subject, at, remove);
+                       else
+                               at += remove;
+                       continue;
                }
-               strbuf_trim(subject);
-               return;
+               break;
        }
+       strbuf_trim(subject);
 }
 
 static void cleanup_space(struct strbuf *sb)
@@ -470,7 +489,7 @@ static const char *guess_charset(const struct strbuf *line, const char *target_c
                if (is_utf8(line->buf))
                        return NULL;
        }
-       return "latin1";
+       return "ISO8859-1";
 }
 
 static void convert_to_utf8(struct strbuf *line, const char *charset)
@@ -483,11 +502,11 @@ static void convert_to_utf8(struct strbuf *line, const char *charset)
                        return;
        }
 
-       if (!strcmp(metainfo_charset, charset))
+       if (!strcasecmp(metainfo_charset, charset))
                return;
        out = reencode_string(line->buf, metainfo_charset, charset);
        if (!out)
-               die("cannot convert from %s to %s\n",
+               die("cannot convert from %s to %s",
                    charset, metainfo_charset);
        strbuf_attach(line, out, strlen(out), strlen(out));
 }
@@ -526,7 +545,6 @@ static int decode_header_bq(struct strbuf *it)
                                 */
                                strbuf_add(&outbuf, in, ep - in);
                        }
-                       in = ep;
                }
                /* E.g.
                 * ep : "=?iso-2022-jp?B?GyR...?= foo"
@@ -540,7 +558,6 @@ static int decode_header_bq(struct strbuf *it)
                if (cp + 3 - it->buf > it->len)
                        goto decode_header_bq_out;
                strbuf_add(&charset_q, ep, cp - ep);
-               strbuf_tolower(&charset_q);
 
                encoding = cp[1];
                if (!encoding || cp[2] != '?')
@@ -704,6 +721,56 @@ static inline int patchbreak(const struct strbuf *line)
        return 0;
 }
 
+/*
+ * Decide whether "line" looks like a scissors line, e.g. "-- >8 --".
+ *
+ * One pass over the line keeps three tallies:
+ *  - the perforation width: dashes, ">8" / "8<" marks, and whitespace
+ *    seen while a perforation run is in progress;
+ *  - the blanks: the whitespace bytes that were counted that way;
+ *  - the marks: bytes that belong to ">8" / "8<".
+ *
+ * The visible width runs from the first to the last non-blank position
+ * recorded by the scan (for a two-byte mark only its first byte is
+ * recorded).
+ *
+ * Returns 1 when a mark was seen, the visible width is at least 8,
+ * the perforation is more than a third of the visible width, and
+ * whitespace makes up less than half of the perforation; 0 otherwise.
+ */
+static int is_scissors_line(const struct strbuf *line)
+{
+       const char *text = line->buf;
+       size_t pos = 0, end = line->len;
+       int first = -1, last = 0;
+       int perf_width = 0, blanks = 0, marks = 0;
+       int inside = 0;
+       int span;
+
+       while (pos < end) {
+               if (isspace(text[pos])) {
+                       /* blanks only count while inside a perforation run */
+                       if (inside) {
+                               perf_width++;
+                               blanks++;
+                       }
+                       pos++;
+                       continue;
+               }
+
+               last = pos;
+               if (first < 0)
+                       first = pos;
+
+               if (text[pos] == '-') {
+                       inside = 1;
+                       perf_width++;
+                       pos++;
+               } else if (pos + 1 < end &&
+                          (!memcmp(text + pos, ">8", 2) ||
+                           !memcmp(text + pos, "8<", 2))) {
+                       /* a two-byte scissors mark; consume both bytes */
+                       inside = 1;
+                       perf_width += 2;
+                       marks += 2;
+                       pos += 2;
+               } else {
+                       /* any other visible byte ends the current run */
+                       inside = 0;
+                       pos++;
+               }
+       }
+
+       /*
+        * Arbitrary cruft (e.g. "cut here") may share the line, so to
+        * avoid false positives the line must be wide enough and be
+        * dominated by the perforation, which in turn must be mostly
+        * dashes and scissors rather than blanks.
+        */
+       span = last - first + 1;
+       if (!marks || span < 8)
+               return 0;
+       if (perf_width * 3 <= span)
+               return 0;
+       return blanks * 2 < perf_width;
+}
+
+
 static int handle_commit_msg(struct strbuf *line)
 {
        static int still_looking = 1;
@@ -715,14 +782,42 @@ static int handle_commit_msg(struct strbuf *line)
                strbuf_ltrim(line);
                if (!line->len)
                        return 0;
-               if ((still_looking = check_header(line, s_hdr_data, 0)) != 0)
-                       return 0;
        }
 
+       if (use_inbody_headers && still_looking) {
+               still_looking = check_header(line, s_hdr_data, 0);
+               if (still_looking)
+                       return 0;
+       } else
+               /* Only trim the first (blank) line of the commit message
+                * when ignoring in-body headers.
+                */
+               still_looking = 0;
+
        /* normalize the log message to UTF-8. */
        if (metainfo_charset)
                convert_to_utf8(line, charset.buf);
 
+       if (use_scissors && is_scissors_line(line)) {
+               int i;
+               if (fseek(cmitmsg, 0L, SEEK_SET))
+                       die_errno("Could not rewind output message file");
+               if (ftruncate(fileno(cmitmsg), 0))
+                       die_errno("Could not truncate output message file at scissors");
+               still_looking = 1;
+
+               /*
+                * We may have already read "secondary headers"; purge
+                * them to give ourselves a clean restart.
+                */
+               for (i = 0; header[i]; i++) {
+                       if (s_hdr_data[i])
+                               strbuf_release(s_hdr_data[i]);
+                       s_hdr_data[i] = NULL;
+               }
+               return 0;
+       }
+
        if (patchbreak(line)) {
                fclose(cmitmsg);
                cmitmsg = NULL;
@@ -757,7 +852,6 @@ static void handle_filter(struct strbuf *line)
 
 static void handle_body(void)
 {
-       int len = 0;
        struct strbuf prev = STRBUF_INIT;
 
        /* Skip up to the first boundary */
@@ -767,8 +861,6 @@ static void handle_body(void)
        }
 
        do {
-               strbuf_setlen(&line, line.len + len);
-
                /* process any boundary lines */
                if (*content_top && is_multipart_boundary(&line)) {
                        /* flush any leftover */
@@ -824,10 +916,7 @@ static void handle_body(void)
                        handle_filter(&line);
                }
 
-               strbuf_reset(&line);
-               if (strbuf_avail(&line) < 100)
-                       strbuf_grow(&line, 100);
-       } while ((len = read_line_with_nul(line.buf, strbuf_avail(&line), fin)));
+       } while (!strbuf_getwholeline(&line, fin, '\n'));
 
 handle_body_out:
        strbuf_release(&prev);
@@ -883,12 +972,9 @@ static void handle_info(void)
        fprintf(fout, "\n");
 }
 
-static int mailinfo(FILE *in, FILE *out, int ks, const char *encoding,
-                   const char *msg, const char *patch)
+static int mailinfo(FILE *in, FILE *out, const char *msg, const char *patch)
 {
        int peek;
-       keep_subject = ks;
-       metainfo_charset = encoding;
        fin = in;
        fout = out;
 
@@ -922,8 +1008,20 @@ static int mailinfo(FILE *in, FILE *out, int ks, const char *encoding,
        return 0;
 }
 
+/*
+ * Configuration callback handed to git_config().
+ *
+ * Variables outside the "mailinfo." namespace are passed on to
+ * git_default_config().  Inside it, "mailinfo.scissors" is read as a
+ * boolean into use_scissors; every other "mailinfo." variable is
+ * accepted and ignored.  Returns 0 for all "mailinfo." variables.
+ */
+static int git_mailinfo_config(const char *var, const char *value, void *unused)
+{
+       if (!prefixcmp(var, "mailinfo.")) {
+               if (!strcmp(var, "mailinfo.scissors"))
+                       use_scissors = git_config_bool(var, value);
+               /* perhaps others here */
+               return 0;
+       }
+       return git_default_config(var, value, unused);
+}
+
+/*
+ * Synopsis passed to usage() when the command line is not understood.
+ * NOTE(review): cmd_mailinfo() also accepts --no-inbody-headers, which
+ * is not listed here -- confirm whether the omission is intentional.
+ */
 static const char mailinfo_usage[] =
-       "git mailinfo [-k] [-u | --encoding=<encoding> | -n] msg patch <mail >info";
+       "git mailinfo [-k|-b] [-u | --encoding=<encoding> | -n] [--scissors | --no-scissors] msg patch < mail >info";
 
 int cmd_mailinfo(int argc, const char **argv, const char *prefix)
 {
@@ -932,20 +1030,28 @@ int cmd_mailinfo(int argc, const char **argv, const char *prefix)
        /* NEEDSWORK: might want to do the optional .git/ directory
         * discovery
         */
-       git_config(git_default_config, NULL);
+       git_config(git_mailinfo_config, NULL);
 
-       def_charset = (git_commit_encoding ? git_commit_encoding : "utf-8");
+       def_charset = (git_commit_encoding ? git_commit_encoding : "UTF-8");
        metainfo_charset = def_charset;
 
        while (1 < argc && argv[1][0] == '-') {
                if (!strcmp(argv[1], "-k"))
                        keep_subject = 1;
+               else if (!strcmp(argv[1], "-b"))
+                       keep_non_patch_brackets_in_subject = 1;
                else if (!strcmp(argv[1], "-u"))
                        metainfo_charset = def_charset;
                else if (!strcmp(argv[1], "-n"))
                        metainfo_charset = NULL;
                else if (!prefixcmp(argv[1], "--encoding="))
                        metainfo_charset = argv[1] + 11;
+               else if (!strcmp(argv[1], "--scissors"))
+                       use_scissors = 1;
+               else if (!strcmp(argv[1], "--no-scissors"))
+                       use_scissors = 0;
+               else if (!strcmp(argv[1], "--no-inbody-headers"))
+                       use_inbody_headers = 0;
                else
                        usage(mailinfo_usage);
                argc--; argv++;
@@ -954,5 +1060,5 @@ int cmd_mailinfo(int argc, const char **argv, const char *prefix)
        if (argc != 3)
                usage(mailinfo_usage);
 
-       return !!mailinfo(stdin, stdout, keep_subject, metainfo_charset, argv[1], argv[2]);
+       return !!mailinfo(stdin, stdout, argv[1], argv[2]);
 }