hashmap: add simplified hashmap_get_from_hash() API
[gitweb.git] / refs.c
diff --git a/refs.c b/refs.c
index ceeeec645c12a0e35e33d2263dca38829668e444..20e2bf18c9da51904d53ec594022eb75e7cefcf0 100644 (file)
--- a/refs.c
+++ b/refs.c
@@ -6,51 +6,71 @@
 #include "string-list.h"
 
 /*
- * Make sure "ref" is something reasonable to have under ".git/refs/";
- * We do not like it if:
+ * How to handle various characters in refnames:
+ * This table is used by both the SIMD and non-SIMD code.  It has
+ * some cases that are only useful for the SIMD; these are handled
+ * equivalently to the listed disposition in the non-SIMD code.
+ * 0: An acceptable character for refs
+ * 1: @, look for a following { to reject @{ in refs (SIMD or = 0)
+ * 2: \0: End-of-component and string
+ * 3: /: End-of-component (SIMD or = 2)
+ * 4: ., look for a preceding . to reject .. in refs
+ * 5: {, look for a preceding @ to reject @{ in refs
+ * 6: *, usually a bad character except, once as a wildcard (SIMD or = 7)
+ * 7: A bad character except * (see check_refname_component below)
+ */
+static unsigned char refname_disposition[256] = {
+       2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+       7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 4, 3,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7,
+       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 0, 7, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 7, 7
+};
+
+/*
+ * Try to read one refname component from the front of refname.
+ * Return the length of the component found, or -1 if the component is
+ * not legal.  It is legal if it is something reasonable to have under
+ * ".git/refs/"; We do not like it if:
  *
  * - any path component of it begins with ".", or
  * - it has double dots "..", or
  * - it has ASCII control character, "~", "^", ":" or SP, anywhere, or
- * - it ends with a "/".
- * - it ends with ".lock"
+ * - it has pattern-matching notation "*", "?", "[", anywhere, or
+ * - it ends with a "/", or
+ * - it ends with ".lock", or
  * - it contains a "\" (backslash)
  */
-
-/* Return true iff ch is not allowed in reference names. */
-static inline int bad_ref_char(int ch)
-{
-       if (((unsigned) ch) <= ' ' || ch == 0x7f ||
-           ch == '~' || ch == '^' || ch == ':' || ch == '\\')
-               return 1;
-       /* 2.13 Pattern Matching Notation */
-       if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
-               return 1;
-       return 0;
-}
-
-/*
- * Try to read one refname component from the front of refname.  Return
- * the length of the component found, or -1 if the component is not
- * legal.
- */
 static int check_refname_component(const char *refname, int flags)
 {
        const char *cp;
        char last = '\0';
 
        for (cp = refname; ; cp++) {
-               char ch = *cp;
-               if (ch == '\0' || ch == '/')
+               int ch = *cp & 255;
+               unsigned char disp = refname_disposition[ch];
+               switch (disp) {
+               case 2: /* fall-through */
+               case 3:
+                       goto out;
+               case 4:
+                       if (last == '.')
+                               return -1; /* Refname contains "..". */
                        break;
-               if (bad_ref_char(ch))
-                       return -1; /* Illegal character in refname. */
-               if (last == '.' && ch == '.')
-                       return -1; /* Refname contains "..". */
-               if (last == '@' && ch == '{')
-                       return -1; /* Refname contains "@{". */
+               case 5:
+                       if (last == '@')
+                               return -1; /* Refname contains "@{". */
+                       break;
+               case 6: /* fall-through */
+               case 7:
+                       return -1;
+               }
                last = ch;
        }
+out:
        if (cp == refname)
                return 0; /* Component has zero length. */
        if (refname[0] == '.') {
@@ -68,7 +88,7 @@ static int check_refname_component(const char *refname, int flags)
        return cp - refname;
 }
 
-int check_refname_format(const char *refname, int flags)
+static int check_refname_format_bytewise(const char *refname, int flags)
 {
        int component_len, component_count = 0;
 
@@ -104,6 +124,195 @@ int check_refname_format(const char *refname, int flags)
        return 0;
 }
 
+#if defined(__GNUC__) && defined(__x86_64__)
+#define SSE_VECTOR_BYTES 16
+
+/* Vectorized version of check_refname_format. */
+int check_refname_format(const char *refname, int flags)
+{
+       const char *cp = refname;
+
+       const __m128i dot = _mm_set1_epi8('.');
+       const __m128i at = _mm_set1_epi8('@');
+       const __m128i curly = _mm_set1_epi8('{');
+       const __m128i slash = _mm_set1_epi8('/');
+       const __m128i zero = _mm_set1_epi8('\000');
+       const __m128i el = _mm_set1_epi8('l');
+
+       /* below '*', all characters are forbidden or rare */
+       const __m128i star_ub = _mm_set1_epi8('*' + 1);
+
+       const __m128i colon = _mm_set1_epi8(':');
+       const __m128i question = _mm_set1_epi8('?');
+
+       /* '['..'^' contains 4 characters: 3 forbidden and 1 rare */
+       const __m128i bracket_lb = _mm_set1_epi8('[' - 1);
+       const __m128i caret_ub = _mm_set1_epi8('^' + 1);
+
+       /* '~' and above are forbidden */
+       const __m128i tilde_lb = _mm_set1_epi8('~' - 1);
+
+       int component_count = 0;
+
+       if (refname[0] == 0 || refname[0] == '/') {
+               /* entirely empty ref or initial ref component */
+               return -1;
+       }
+
+       /*
+        * Initial ref component of '.'; below we look for /. so we'll
+        * miss this.
+        */
+       if (refname[0] == '.') {
+               if (refname[1] == '/' || refname[1] == '\0')
+                       return -1;
+               if (!(flags & REFNAME_DOT_COMPONENT))
+                       return -1;
+       }
+       while(1) {
+               __m128i tmp, tmp1, result;
+               uint64_t mask;
+
+               if ((uintptr_t) cp % PAGE_SIZE > PAGE_SIZE - SSE_VECTOR_BYTES  - 1)
+                       /*
+                        * End-of-page; fall back to slow method for
+                        * this entire ref.
+                        */
+                       return check_refname_format_bytewise(refname, flags);
+
+               tmp = _mm_loadu_si128((__m128i *)cp);
+               tmp1 = _mm_loadu_si128((__m128i *)(cp + 1));
+
+               /*
+                * This range (note the lt) contains some
+                * permissible-but-rare characters (including all
+                * characters >= 128), which we handle later.  It also
+                * includes \000.
+                */
+               result = _mm_cmplt_epi8(tmp, star_ub);
+
+               result = _mm_or_si128(result, _mm_cmpeq_epi8(tmp, question));
+               result = _mm_or_si128(result, _mm_cmpeq_epi8(tmp, colon));
+
+               /* This range contains the permissible ] as bycatch */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpgt_epi8(tmp, bracket_lb),
+                                             _mm_cmplt_epi8(tmp, caret_ub)));
+
+               result = _mm_or_si128(result, _mm_cmpgt_epi8(tmp, tilde_lb));
+
+               /* .. */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpeq_epi8(tmp, dot),
+                                             _mm_cmpeq_epi8(tmp1, dot)));
+               /* @{ */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpeq_epi8(tmp, at),
+                                             _mm_cmpeq_epi8(tmp1, curly)));
+               /* // */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpeq_epi8(tmp, slash),
+                                             _mm_cmpeq_epi8(tmp1, slash)));
+               /* trailing / */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpeq_epi8(tmp, slash),
+                                             _mm_cmpeq_epi8(tmp1, zero)));
+               /* .l, beginning of .lock */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpeq_epi8(tmp, dot),
+                                             _mm_cmpeq_epi8(tmp1, el)));
+               /*
+                * Even though /. is not necessarily an error, we flag
+                * it anyway. If we find it, we'll check if it's valid
+                * and if so we'll advance just past it.
+                */
+               result = _mm_or_si128(result, _mm_and_si128(
+                                             _mm_cmpeq_epi8(tmp, slash),
+                                             _mm_cmpeq_epi8(tmp1, dot)));
+
+               mask = _mm_movemask_epi8(result);
+               if (mask) {
+                       /*
+                        * We've found either end-of-string, or some
+                        * probably-bad character or substring.
+                        */
+                       int i = __builtin_ctz(mask);
+                       switch (refname_disposition[cp[i] & 255]) {
+                       case 0: /* fall-through */
+                       case 5:
+                               /*
+                                * bycatch: a good character that's in
+                                * one of the ranges of mostly-forbidden
+                                * characters
+                                */
+                               cp += i + 1;
+                               break;
+                       case 1:
+                               if (cp[i + 1] == '{')
+                                       return -1;
+                               cp += i + 1;
+                               break;
+                       case 2:
+                               if (!(flags & REFNAME_ALLOW_ONELEVEL)
+                                   && !component_count && !strchr(refname, '/'))
+                                       /* Refname has only one component. */
+                                       return -1;
+                               return 0;
+                       case 3:
+                               component_count ++;
+                               /*
+                                * Even if leading dots are allowed, don't
+                                * allow "." as a component (".." is
+                                * prevented by case 4 below).
+                                */
+                               if (cp[i + 1] == '.') {
+                                       if (cp[i + 2] == '\0')
+                                               return -1;
+                                       if (flags & REFNAME_DOT_COMPONENT) {
+                                               /* skip to just after the /. */
+                                               cp += i + 2;
+                                               break;
+                                       }
+                                       return -1;
+                               } else if (cp[i + 1] == '/' || cp[i + 1] == '\0')
+                                       return -1;
+                               break;
+                       case 4:
+                               if (cp[i + 1] == '.' || cp[i + 1] == '\0')
+                                       return -1;
+                               /* .lock as end-of-component or end-of-string */
+                               if ((!strncmp(cp + i, ".lock", 5))
+                                   && (cp[i + 5] == '/' || cp[i + 5] == 0))
+                                       return -1;
+                               cp += 1;
+                               break;
+                       case 6:
+                               if (((cp == refname + i) || cp[i - 1] == '/')
+                                   && (cp[i + 1] == '/' || cp[i + 1] == 0))
+                                       if (flags & REFNAME_REFSPEC_PATTERN) {
+                                               flags &= ~REFNAME_REFSPEC_PATTERN;
+                                               /* restart after the * */
+                                               cp += i + 1;
+                                               continue;
+                                       }
+                               /* fall-through */
+                       case 7:
+                               return -1;
+                       }
+               } else
+                       cp += SSE_VECTOR_BYTES;
+       }
+}
+
+#else
+
+int check_refname_format (const char *refname, int flags)
+{
+       return check_refname_format_bytewise(refname, flags);
+}
+
+#endif
+
 struct ref_entry;
 
 /*
@@ -2943,119 +3152,117 @@ int create_symref(const char *ref_target, const char *refs_heads_master,
        return 0;
 }
 
-static char *ref_msg(const char *line, const char *endp)
-{
-       const char *ep;
-       line += 82;
-       ep = memchr(line, '\n', endp - line);
-       if (!ep)
-               ep = endp;
-       return xmemdupz(line, ep - line);
+struct read_ref_at_cb {
+       const char *refname;
+       unsigned long at_time;
+       int cnt;
+       int reccnt;
+       unsigned char *sha1;
+       int found_it;
+
+       unsigned char osha1[20];
+       unsigned char nsha1[20];
+       int tz;
+       unsigned long date;
+       char **msg;
+       unsigned long *cutoff_time;
+       int *cutoff_tz;
+       int *cutoff_cnt;
+};
+
+static int read_ref_at_ent(unsigned char *osha1, unsigned char *nsha1,
+               const char *email, unsigned long timestamp, int tz,
+               const char *message, void *cb_data)
+{
+       struct read_ref_at_cb *cb = cb_data;
+
+       cb->reccnt++;
+       cb->tz = tz;
+       cb->date = timestamp;
+
+       if (timestamp <= cb->at_time || cb->cnt == 0) {
+               if (cb->msg)
+                       *cb->msg = xstrdup(message);
+               if (cb->cutoff_time)
+                       *cb->cutoff_time = timestamp;
+               if (cb->cutoff_tz)
+                       *cb->cutoff_tz = tz;
+               if (cb->cutoff_cnt)
+                       *cb->cutoff_cnt = cb->reccnt - 1;
+               /*
+                * we have not yet updated cb->[n|o]sha1 so they still
+                * hold the values for the previous record.
+                */
+               if (!is_null_sha1(cb->osha1)) {
+                       hashcpy(cb->sha1, nsha1);
+                       if (hashcmp(cb->osha1, nsha1))
+                               warning("Log for ref %s has gap after %s.",
+                                       cb->refname, show_date(cb->date, cb->tz, DATE_RFC2822));
+               }
+               else if (cb->date == cb->at_time)
+                       hashcpy(cb->sha1, nsha1);
+               else if (hashcmp(nsha1, cb->sha1))
+                       warning("Log for ref %s unexpectedly ended on %s.",
+                               cb->refname, show_date(cb->date, cb->tz,
+                                                  DATE_RFC2822));
+               hashcpy(cb->osha1, osha1);
+               hashcpy(cb->nsha1, nsha1);
+               cb->found_it = 1;
+               return 1;
+       }
+       hashcpy(cb->osha1, osha1);
+       hashcpy(cb->nsha1, nsha1);
+       if (cb->cnt > 0)
+               cb->cnt--;
+       return 0;
+}
+
+static int read_ref_at_ent_oldest(unsigned char *osha1, unsigned char *nsha1,
+                                 const char *email, unsigned long timestamp,
+                                 int tz, const char *message, void *cb_data)
+{
+       struct read_ref_at_cb *cb = cb_data;
+
+       if (cb->msg)
+               *cb->msg = xstrdup(message);
+       if (cb->cutoff_time)
+               *cb->cutoff_time = timestamp;
+       if (cb->cutoff_tz)
+               *cb->cutoff_tz = tz;
+       if (cb->cutoff_cnt)
+               *cb->cutoff_cnt = cb->reccnt;
+       hashcpy(cb->sha1, osha1);
+       if (is_null_sha1(cb->sha1))
+               hashcpy(cb->sha1, nsha1);
+       /* We just want the first entry */
+       return 1;
 }
 
 int read_ref_at(const char *refname, unsigned long at_time, int cnt,
                unsigned char *sha1, char **msg,
                unsigned long *cutoff_time, int *cutoff_tz, int *cutoff_cnt)
 {
-       const char *logfile, *logdata, *logend, *rec, *lastgt, *lastrec;
-       char *tz_c;
-       int logfd, tz, reccnt = 0;
-       struct stat st;
-       unsigned long date;
-       unsigned char logged_sha1[20];
-       void *log_mapped;
-       size_t mapsz;
+       struct read_ref_at_cb cb;
 
-       logfile = git_path("logs/%s", refname);
-       logfd = open(logfile, O_RDONLY, 0);
-       if (logfd < 0)
-               die_errno("Unable to read log '%s'", logfile);
-       fstat(logfd, &st);
-       if (!st.st_size)
-               die("Log %s is empty.", logfile);
-       mapsz = xsize_t(st.st_size);
-       log_mapped = xmmap(NULL, mapsz, PROT_READ, MAP_PRIVATE, logfd, 0);
-       logdata = log_mapped;
-       close(logfd);
+       memset(&cb, 0, sizeof(cb));
+       cb.refname = refname;
+       cb.at_time = at_time;
+       cb.cnt = cnt;
+       cb.msg = msg;
+       cb.cutoff_time = cutoff_time;
+       cb.cutoff_tz = cutoff_tz;
+       cb.cutoff_cnt = cutoff_cnt;
+       cb.sha1 = sha1;
+
+       for_each_reflog_ent_reverse(refname, read_ref_at_ent, &cb);
+
+       if (!cb.reccnt)
+               die("Log for %s is empty.", refname);
+       if (cb.found_it)
+               return 0;
+
+       for_each_reflog_ent(refname, read_ref_at_ent_oldest, &cb);
 
-       lastrec = NULL;
-       rec = logend = logdata + st.st_size;
-       while (logdata < rec) {
-               reccnt++;
-               if (logdata < rec && *(rec-1) == '\n')
-                       rec--;
-               lastgt = NULL;
-               while (logdata < rec && *(rec-1) != '\n') {
-                       rec--;
-                       if (*rec == '>')
-                               lastgt = rec;
-               }
-               if (!lastgt)
-                       die("Log %s is corrupt.", logfile);
-               date = strtoul(lastgt + 1, &tz_c, 10);
-               if (date <= at_time || cnt == 0) {
-                       tz = strtoul(tz_c, NULL, 10);
-                       if (msg)
-                               *msg = ref_msg(rec, logend);
-                       if (cutoff_time)
-                               *cutoff_time = date;
-                       if (cutoff_tz)
-                               *cutoff_tz = tz;
-                       if (cutoff_cnt)
-                               *cutoff_cnt = reccnt - 1;
-                       if (lastrec) {
-                               if (get_sha1_hex(lastrec, logged_sha1))
-                                       die("Log %s is corrupt.", logfile);
-                               if (get_sha1_hex(rec + 41, sha1))
-                                       die("Log %s is corrupt.", logfile);
-                               if (hashcmp(logged_sha1, sha1)) {
-                                       warning("Log %s has gap after %s.",
-                                               logfile, show_date(date, tz, DATE_RFC2822));
-                               }
-                       }
-                       else if (date == at_time) {
-                               if (get_sha1_hex(rec + 41, sha1))
-                                       die("Log %s is corrupt.", logfile);
-                       }
-                       else {
-                               if (get_sha1_hex(rec + 41, logged_sha1))
-                                       die("Log %s is corrupt.", logfile);
-                               if (hashcmp(logged_sha1, sha1)) {
-                                       warning("Log %s unexpectedly ended on %s.",
-                                               logfile, show_date(date, tz, DATE_RFC2822));
-                               }
-                       }
-                       munmap(log_mapped, mapsz);
-                       return 0;
-               }
-               lastrec = rec;
-               if (cnt > 0)
-                       cnt--;
-       }
-
-       rec = logdata;
-       while (rec < logend && *rec != '>' && *rec != '\n')
-               rec++;
-       if (rec == logend || *rec == '\n')
-               die("Log %s is corrupt.", logfile);
-       date = strtoul(rec + 1, &tz_c, 10);
-       tz = strtoul(tz_c, NULL, 10);
-       if (get_sha1_hex(logdata, sha1))
-               die("Log %s is corrupt.", logfile);
-       if (is_null_sha1(sha1)) {
-               if (get_sha1_hex(logdata + 41, sha1))
-                       die("Log %s is corrupt.", logfile);
-       }
-       if (msg)
-               *msg = ref_msg(logdata, logend);
-       munmap(log_mapped, mapsz);
-
-       if (cutoff_time)
-               *cutoff_time = date;
-       if (cutoff_tz)
-               *cutoff_tz = tz;
-       if (cutoff_cnt)
-               *cutoff_cnt = reccnt;
        return 1;
 }