gettext.c on commit t9003: become resilient to GETTEXT_POISON (f9b3242)
   1/*
   2 * Copyright (c) 2010 Ævar Arnfjörð Bjarmason
   3 */
   4
   5#include "git-compat-util.h"
   6#include "gettext.h"
   7#include "strbuf.h"
   8#include "utf8.h"
   9
  10#ifndef NO_GETTEXT
  11#       include <locale.h>
  12#       include <libintl.h>
  13#       ifdef HAVE_LIBCHARSET_H
  14#               include <libcharset.h>
  15#       else
  16#               include <langinfo.h>
  17#               define locale_charset() nl_langinfo(CODESET)
  18#       endif
  19#endif
  20
  21/*
  22 * Guess the user's preferred languages from the value in LANGUAGE environment
  23 * variable and LC_MESSAGES locale category if NO_GETTEXT is not defined.
  24 *
  25 * The result can be a colon-separated list like "ko:ja:en".
  26 */
  27const char *get_preferred_languages(void)
  28{
  29        const char *retval;
  30
  31        retval = getenv("LANGUAGE");
  32        if (retval && *retval)
  33                return retval;
  34
  35#ifndef NO_GETTEXT
  36        retval = setlocale(LC_MESSAGES, NULL);
  37        if (retval && *retval &&
  38                strcmp(retval, "C") &&
  39                strcmp(retval, "POSIX"))
  40                return retval;
  41#endif
  42
  43        return NULL;
  44}
  45
  46#ifdef GETTEXT_POISON
  47int use_gettext_poison(void)
  48{
  49        static int poison_requested = -1;
  50        if (poison_requested == -1)
  51                poison_requested = getenv("GIT_GETTEXT_POISON") ? 1 : 0;
  52        return poison_requested;
  53}
  54#endif
  55
  56#ifndef NO_GETTEXT
  57static int test_vsnprintf(const char *fmt, ...)
  58{
  59        char buf[26];
  60        int ret;
  61        va_list ap;
  62        va_start(ap, fmt);
  63        ret = vsnprintf(buf, sizeof(buf), fmt, ap);
  64        va_end(ap);
  65        return ret;
  66}
  67
  68static const char *charset;
  69static void init_gettext_charset(const char *domain)
  70{
  71        /*
  72           This trick arranges for messages to be emitted in the user's
  73           requested encoding, but avoids setting LC_CTYPE from the
  74           environment for the whole program.
  75
  76           This primarily done to avoid a bug in vsnprintf in the GNU C
  77           Library [1]. which triggered a "your vsnprintf is broken" error
  78           on Git's own repository when inspecting v0.99.6~1 under a UTF-8
  79           locale.
  80
  81           That commit contains a ISO-8859-1 encoded author name, which
  82           the locale aware vsnprintf(3) won't interpolate in the format
  83           argument, due to mismatch between the data encoding and the
  84           locale.
  85
  86           Even if it wasn't for that bug we wouldn't want to use LC_CTYPE at
  87           this point, because it'd require auditing all the code that uses C
  88           functions whose semantics are modified by LC_CTYPE.
  89
  90           But only setting LC_MESSAGES as we do creates a problem, since
  91           we declare the encoding of our PO files[2] the gettext
  92           implementation will try to recode it to the user's locale, but
  93           without LC_CTYPE it'll emit something like this on 'git init'
  94           under the Icelandic locale:
  95
  96               Bj? til t?ma Git lind ? /hlagh/.git/
  97
  98           Gettext knows about the encoding of our PO file, but we haven't
  99           told it about the user's encoding, so all the non-US-ASCII
 100           characters get encoded to question marks.
 101
 102           But we're in luck! We can set LC_CTYPE from the environment
 103           only while we call nl_langinfo and
 104           bind_textdomain_codeset. That suffices to tell gettext what
 105           encoding it should emit in, so it'll now say:
 106
 107               Bjó til tóma Git lind í /hlagh/.git/
 108
 109           And the equivalent ISO-8859-1 string will be emitted under a
 110           ISO-8859-1 locale.
 111
 112           With this change way we get the advantages of setting LC_CTYPE
 113           (talk to the user in his language/encoding), without the major
 114           drawbacks (changed semantics for C functions we rely on).
 115
 116           However foreign functions using other message catalogs that
 117           aren't using our neat trick will still have a problem, e.g. if
 118           we have to call perror(3):
 119
 120           #include <stdio.h>
 121           #include <locale.h>
 122           #include <errno.h>
 123
 124           int main(void)
 125           {
 126                   setlocale(LC_MESSAGES, "");
 127                   setlocale(LC_CTYPE, "C");
 128                   errno = ENODEV;
 129                   perror("test");
 130                   return 0;
 131           }
 132
 133           Running that will give you a message with question marks:
 134
 135           $ LANGUAGE= LANG=de_DE.utf8 ./test
 136           test: Kein passendes Ger?t gefunden
 137
 138           The vsnprintf bug has been fixed since glibc 2.17.
 139
 140           Then we could simply set LC_CTYPE from the environment, which would
 141           make things like the external perror(3) messages work.
 142
 143           See t/t0203-gettext-setlocale-sanity.sh's "gettext.c" tests for
 144           regression tests.
 145
 146           1. http://sourceware.org/bugzilla/show_bug.cgi?id=6530
 147           2. E.g. "Content-Type: text/plain; charset=UTF-8\n" in po/is.po
 148        */
 149        setlocale(LC_CTYPE, "");
 150        charset = locale_charset();
 151        bind_textdomain_codeset(domain, charset);
 152        /* the string is taken from v0.99.6~1 */
 153        if (test_vsnprintf("%.*s", 13, "David_K\345gedal") < 0)
 154                setlocale(LC_CTYPE, "C");
 155}
 156
 157void git_setup_gettext(void)
 158{
 159        const char *podir = getenv("GIT_TEXTDOMAINDIR");
 160
 161        if (!podir)
 162                podir = GIT_LOCALE_PATH;
 163        bindtextdomain("git", podir);
 164        setlocale(LC_MESSAGES, "");
 165        setlocale(LC_TIME, "");
 166        init_gettext_charset("git");
 167        textdomain("git");
 168}
 169
 170/* return the number of columns of string 's' in current locale */
 171int gettext_width(const char *s)
 172{
 173        static int is_utf8 = -1;
 174        if (is_utf8 == -1)
 175                is_utf8 = !strcmp(charset, "UTF-8");
 176
 177        return is_utf8 ? utf8_strwidth(s) : strlen(s);
 178}
 179#endif