From: Junio C Hamano Date: Fri, 6 Jun 2014 18:29:38 +0000 (-0700) Subject: Merge branch 'tb/unicode-6.3-zero-width' X-Git-Tag: v2.1.0-rc0~158 X-Git-Url: https://git.lorimer.id.au/gitweb.git/diff_plain/334d40e951fa3b3961135b3183633706d976c4bd?hp=-c Merge branch 'tb/unicode-6.3-zero-width' Update the logic to compute the display width needed for utf8 strings and allow us to more easily maintain the tables used in that logic. We may want to let the users choose if codepoints with ambiguous widths are treated as a double or single width in a follow-up patch. * tb/unicode-6.3-zero-width: utf8: make it easier to auto-update git_wcwidth() utf8.c: use a table for double_width --- 334d40e951fa3b3961135b3183633706d976c4bd diff --combined .gitignore index dc600f9b36,252792a11a..42294e59a1 --- a/.gitignore +++ b/.gitignore @@@ -182,7 -182,6 +182,7 @@@ /test-dump-cache-tree /test-scrap-cache-tree /test-genrandom +/test-hashmap /test-index-version /test-line-buffer /test-match-trees @@@ -226,6 -225,7 +226,7 @@@ /config.mak.autogen /config.mak.append /configure + /unicode /tags /TAGS /cscope* diff --combined Makefile index 08fc9ca430,a7c1f4a46d..07ea105837 --- a/Makefile +++ b/Makefile @@@ -30,8 -30,6 +30,8 @@@ all: # Define LIBPCREDIR=/foo/bar if your libpcre header and library files are in # /foo/bar/include and /foo/bar/lib directories. # +# Define HAVE_ALLOCA_H if you have working alloca(3) defined in that header. +# # Define NO_CURL if you do not have libcurl installed. git-http-fetch and # git-http-push are not built, and you cannot use http:// and https:// # transports (neither smart nor dumb). @@@ -61,9 -59,9 +61,9 @@@ # FreeBSD can use either, but MinGW and some others need to use # libcharset.h's locale_charset() instead. # -# Define CHARSET_LIB to you need to link with library other than -liconv to +# Define CHARSET_LIB to the library you need to link with in order to # use locale_charset() function. On some platforms this needs to set to -# -lcharset +# -lcharset, on others to -liconv . # # Define LIBC_CONTAINS_LIBINTL if your gettext implementation doesn't # need -lintl when linking. @@@ -103,6 -101,14 +103,6 @@@ # # Define NO_MKSTEMPS if you don't have mkstemps in the C library. # -# Define NO_FNMATCH if you don't have fnmatch in the C library. -# -# Define NO_FNMATCH_CASEFOLD if your fnmatch function doesn't have the -# FNM_CASEFOLD GNU extension. -# -# Define NO_WILDMATCH if you do not want to use Git's wildmatch -# implementation as fnmatch -# # Define NO_GECOS_IN_PWENT if you don't have pw_gecos in struct passwd # in the C library. # @@@ -153,7 -159,7 +153,7 @@@ # # Define NEEDS_LIBINTL_BEFORE_LIBICONV if you need libintl before libiconv. # -# Define NO_INTPTR_T if you don't have intptr_t nor uintptr_t. +# Define NO_INTPTR_T if you don't have intptr_t or uintptr_t. # # Define NO_UINTMAX_T if you don't have uintmax_t. # @@@ -185,6 -191,9 +185,6 @@@ # Define NO_STRUCT_ITIMERVAL if you don't have struct itimerval # This also implies NO_SETITIMER # -# Define NO_THREAD_SAFE_PREAD if your pread() implementation is not -# thread-safe. (e.g. compat/pread.c or cygwin) -# # Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is # generally faster on your platform than accessing the working directory. # @@@ -333,13 -342,6 +333,13 @@@ # Define DEFAULT_HELP_FORMAT to "man", "info" or "html" # (defaults to "man") if you want to have a different default when # "git help" is called without a parameter specifying the format. +# +# Define TEST_GIT_INDEX_VERSION to 2, 3 or 4 to run the test suite +# with a different indexfile format version. If it isn't set the index +# file format used is index-v[23]. +# +# Define GMTIME_UNRELIABLE_ERRORS if your gmtime() function does not +# return NULL when it receives a bogus time_t. GIT-VERSION-FILE: FORCE @$(SHELL_PATH) ./GIT-VERSION-GEN @@@ -553,7 -555,6 +553,7 @@@ TEST_PROGRAMS_NEED_X += test-dat TEST_PROGRAMS_NEED_X += test-delta TEST_PROGRAMS_NEED_X += test-dump-cache-tree TEST_PROGRAMS_NEED_X += test-genrandom +TEST_PROGRAMS_NEED_X += test-hashmap TEST_PROGRAMS_NEED_X += test-index-version TEST_PROGRAMS_NEED_X += test-line-buffer TEST_PROGRAMS_NEED_X += test-match-trees @@@ -662,8 -663,6 +662,8 @@@ LIB_H += diff. LIB_H += diffcore.h LIB_H += dir.h LIB_H += exec_cmd.h +LIB_H += ewah/ewok.h +LIB_H += ewah/ewok_rlw.h LIB_H += fetch-pack.h LIB_H += fmt-merge-msg.h LIB_H += fsck.h @@@ -672,7 -671,7 +672,7 @@@ LIB_H += git-compat-util. LIB_H += gpg-interface.h LIB_H += graph.h LIB_H += grep.h -LIB_H += hash.h +LIB_H += hashmap.h LIB_H += help.h LIB_H += http.h LIB_H += kwset.h @@@ -691,10 -690,8 +691,10 @@@ LIB_H += notes-merge. LIB_H += notes-utils.h LIB_H += notes.h LIB_H += object.h +LIB_H += pack-objects.h LIB_H += pack-revindex.h LIB_H += pack.h +LIB_H += pack-bitmap.h LIB_H += parse-options.h LIB_H += patch-ids.h LIB_H += pathspec.h @@@ -729,6 -726,7 +729,7 @@@ LIB_H += transport. LIB_H += tree-walk.h LIB_H += tree.h LIB_H += unpack-trees.h + LIB_H += unicode_width.h LIB_H += url.h LIB_H += urlmatch.h LIB_H += userdiff.h @@@ -798,10 -796,6 +799,10 @@@ LIB_OBJS += dir. LIB_OBJS += editor.o LIB_OBJS += entry.o LIB_OBJS += environment.o +LIB_OBJS += ewah/bitmap.o +LIB_OBJS += ewah/ewah_bitmap.o +LIB_OBJS += ewah/ewah_io.o +LIB_OBJS += ewah/ewah_rlw.o LIB_OBJS += exec_cmd.o LIB_OBJS += fetch-pack.o LIB_OBJS += fsck.o @@@ -809,7 -803,7 +810,7 @@@ LIB_OBJS += gettext. LIB_OBJS += gpg-interface.o LIB_OBJS += graph.o LIB_OBJS += grep.o -LIB_OBJS += hash.o +LIB_OBJS += hashmap.o LIB_OBJS += help.o LIB_OBJS += hex.o LIB_OBJS += ident.o @@@ -833,10 -827,7 +834,10 @@@ LIB_OBJS += notes-cache. LIB_OBJS += notes-merge.o LIB_OBJS += notes-utils.o LIB_OBJS += object.o +LIB_OBJS += pack-bitmap.o +LIB_OBJS += pack-bitmap-write.o LIB_OBJS += pack-check.o +LIB_OBJS += pack-objects.o LIB_OBJS += pack-revindex.o LIB_OBJS += pack-write.o LIB_OBJS += pager.o @@@ -894,7 -885,6 +895,7 @@@ LIB_OBJS += userdiff. LIB_OBJS += utf8.o LIB_OBJS += varint.o LIB_OBJS += version.o +LIB_OBJS += versioncmp.o LIB_OBJS += walker.o LIB_OBJS += wildmatch.o LIB_OBJS += wrapper.o @@@ -1110,10 -1100,6 +1111,10 @@@ ifdef USE_LIBPCR EXTLIBS += -lpcre endif +ifdef HAVE_ALLOCA_H + BASIC_CFLAGS += -DHAVE_ALLOCA_H +endif + ifdef NO_CURL BASIC_CFLAGS += -DNO_CURL REMOTE_CURL_PRIMARY = @@@ -1286,6 -1272,20 +1287,6 @@@ endi ifdef NO_STRTOULL COMPAT_CFLAGS += -DNO_STRTOULL endif -ifdef NO_FNMATCH - COMPAT_CFLAGS += -Icompat/fnmatch - COMPAT_CFLAGS += -DNO_FNMATCH - COMPAT_OBJS += compat/fnmatch/fnmatch.o -else -ifdef NO_FNMATCH_CASEFOLD - COMPAT_CFLAGS += -Icompat/fnmatch - COMPAT_CFLAGS += -DNO_FNMATCH_CASEFOLD - COMPAT_OBJS += compat/fnmatch/fnmatch.o -endif -endif -ifndef NO_WILDMATCH - COMPAT_CFLAGS += -DUSE_WILDMATCH -endif ifdef NO_SETENV COMPAT_CFLAGS += -DNO_SETENV COMPAT_OBJS += compat/setenv.o @@@ -1342,6 -1342,10 +1343,6 @@@ endi ifdef NO_PREAD COMPAT_CFLAGS += -DNO_PREAD COMPAT_OBJS += compat/pread.o - NO_THREAD_SAFE_PREAD = YesPlease -endif -ifdef NO_THREAD_SAFE_PREAD - BASIC_CFLAGS += -DNO_THREAD_SAFE_PREAD endif ifdef NO_FAST_WORKING_DIRECTORY BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY @@@ -1491,11 -1495,6 +1492,11 @@@ ifneq (,$(XDL_FAST_HASH) BASIC_CFLAGS += -DXDL_FAST_HASH endif +ifdef GMTIME_UNRELIABLE_ERRORS + COMPAT_OBJS += compat/gmtime.o + BASIC_CFLAGS += -DGMTIME_UNRELIABLE_ERRORS +endif + ifeq ($(TCLTK_PATH),) NO_TCLTK = NoThanks endif @@@ -2101,7 -2100,7 +2102,7 @@@ pdf XGETTEXT_FLAGS = \ --force-po \ - --add-comments \ + --add-comments=TRANSLATORS: \ --msgid-bugs-address="Git Mailing List " \ --from-code=UTF-8 XGETTEXT_FLAGS_C = $(XGETTEXT_FLAGS) --language=C \ @@@ -2224,9 -2223,6 +2225,9 @@@ endi ifdef GIT_PERF_MAKE_OPTS @echo GIT_PERF_MAKE_OPTS=\''$(subst ','\'',$(subst ','\'',$(GIT_PERF_MAKE_OPTS)))'\' >>$@ endif +ifdef TEST_GIT_INDEX_VERSION + @echo TEST_GIT_INDEX_VERSION=\''$(subst ','\'',$(subst ','\'',$(TEST_GIT_INDEX_VERSION)))'\' >>$@ +endif ### Detect Python interpreter path changes ifndef NO_PYTHON @@@ -2484,9 -2480,8 +2485,9 @@@ profile-clean $(RM) $(addsuffix *.gcno,$(addprefix $(PROFILE_DIR)/, $(object_dirs))) clean: profile-clean coverage-clean - $(RM) *.o *.res block-sha1/*.o ppc/*.o compat/*.o compat/*/*.o xdiff/*.o vcs-svn/*.o \ - builtin/*.o $(LIB_FILE) $(XDIFF_LIB) $(VCSSVN_LIB) + $(RM) *.o *.res block-sha1/*.o ppc/*.o compat/*.o compat/*/*.o + $(RM) xdiff/*.o vcs-svn/*.o ewah/*.o builtin/*.o + $(RM) $(LIB_FILE) $(XDIFF_LIB) $(VCSSVN_LIB) $(RM) $(ALL_PROGRAMS) $(SCRIPT_LIB) $(BUILT_INS) git$X $(RM) $(TEST_PROGRAMS) $(NO_INSTALL) $(RM) -r bin-wrappers $(dep_dirs) diff --combined utf8.c index 77c28d492c,db7ef3c10a..b30790d043 --- a/utf8.c +++ b/utf8.c @@@ -5,8 -5,8 +5,8 @@@ /* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */ struct interval { - int first; - int last; + ucs_char_t first; + ucs_char_t last; }; size_t display_mode_esc_sequence_len(const char *s) @@@ -80,52 -80,8 +80,8 @@@ static int git_wcwidth(ucs_char_t ch { /* * Sorted list of non-overlapping intervals of non-spacing characters, - * generated by - * "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c". */ - static const struct interval combining[] = { - { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD }, - { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 }, - { 0x05C7, 0x05C7 }, { 0x0600, 0x0604 }, { 0x0610, 0x061A }, - { 0x064B, 0x065F }, { 0x0670, 0x0670 }, { 0x06D6, 0x06E4 }, - { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x070F, 0x070F }, - { 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, - { 0x0901, 0x0902 }, { 0x093C, 0x093C }, { 0x0941, 0x0948 }, - { 0x094D, 0x094D }, { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, - { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, - { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, - { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, - { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, - { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, - { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, - { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, - { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, - { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, - { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, - { 0x0CBC, 0x0CBC }, { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, - { 0x0CCC, 0x0CCD }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D }, - { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, - { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, - { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC }, - { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 }, - { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E }, - { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 }, - { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 }, - { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 }, - { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x1712, 0x1714 }, - { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, { 0x1772, 0x1773 }, - { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 }, - { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, { 0x180B, 0x180D }, - { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, { 0x1927, 0x1928 }, - { 0x1932, 0x1932 }, { 0x1939, 0x193B }, { 0x200B, 0x200F }, - { 0x202A, 0x202E }, { 0x2060, 0x2063 }, { 0x206A, 0x206F }, - { 0x20D0, 0x20EA }, { 0x302A, 0x302F }, { 0x3099, 0x309A }, - { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE23 }, - { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB }, { 0x1D167, 0x1D169 }, - { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, - { 0x1D1AA, 0x1D1AD }, { 0xE0001, 0xE0001 }, - { 0xE0020, 0xE007F }, { 0xE0100, 0xE01EF } - }; + #include "unicode_width.h" /* test for 8-bit control characters */ if (ch == 0) @@@ -134,34 -90,16 +90,16 @@@ return -1; /* binary search in table of non-spacing characters */ - if (bisearch(ch, combining, sizeof(combining) + if (bisearch(ch, zero_width, sizeof(zero_width) / sizeof(struct interval) - 1)) return 0; - /* - * If we arrive here, ch is neither a combining nor a C0/C1 - * control character. - */ + /* binary search in table of double width characters */ + if (bisearch(ch, double_width, sizeof(double_width) + / sizeof(struct interval) - 1)) + return 2; - return 1 + - (ch >= 0x1100 && - /* Hangul Jamo init. consonants */ - (ch <= 0x115f || - ch == 0x2329 || ch == 0x232a || - /* CJK ... Yi */ - (ch >= 0x2e80 && ch <= 0xa4cf && - ch != 0x303f) || - /* Hangul Syllables */ - (ch >= 0xac00 && ch <= 0xd7a3) || - /* CJK Compatibility Ideographs */ - (ch >= 0xf900 && ch <= 0xfaff) || - /* CJK Compatibility Forms */ - (ch >= 0xfe30 && ch <= 0xfe6f) || - /* Fullwidth Forms */ - (ch >= 0xff00 && ch <= 0xff60) || - (ch >= 0xffe0 && ch <= 0xffe6) || - (ch >= 0x20000 && ch <= 0x2fffd) || - (ch >= 0x30000 && ch <= 0x3fffd))); + return 1; } /* @@@ -528,7 -466,7 +466,7 @@@ char *reencode_string_iconv(const char while (1) { size_t cnt = iconv(conv, &cp, &insz, &outpos, &outsz); - if (cnt == -1) { + if (cnt == (size_t) -1) { size_t sofar; if (errno != E2BIG) { free(out);