[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: What's the character encoding of manpages?



On Thu, Jul 24, 2003 at 04:24:51PM +0200, Michael Piefel wrote:
> Am 24.07.03 um 15:55:43 schrieb Aaron Isotton:
> > what are man pages, or more generally, groff documents, supposed to be
> > encoded in?  I didn't find any reference to that in groff(7).  Is it
> > ASCII?
> 
> Preferably ASCII, yes. I seem to remember having once read that the
> input actually is in Latin-1. There, 0xA0 is a non-breaking space.
> I wouldn't rely on this, though. Many manpages are in Latin-2, since man
> used to not do any conversion at all, assuming input would be in the
> same encoding as output.

Heh. Well, that's kind of still true under some circumstances. The
following is from src/encodings.c in man-db CVS, and is my attempt to
hit as many of the encoding problems with current groff as I can with a
very large hammer. Feel free to vomit after reading it.

/* Due to historical limitations in groff (which may be removed in the
 * future), there is no mechanism for a man page to specify its own
 * encoding. This means that each national language directory needs to carry
 * with it information about its encoding, and each groff device needs to
 * have a default encoding associated with it. Out of the box, groff
 * formally allows only ISO-8859-1 on input; however, patches originating
 * with Debian and imported by many other GNU/Linux distributions change
 * this somewhat.
 *
 * Eventually, groff will support proper Unicode input, and much of this
 * horror can go away.
 *
 * Do *not* confuse source encoding with groff encoding. The encoding
 * specified in this table is the encoding in which the source man pages in
 * each language directory are expected to be written. The groff encoding is
 * determined by the selected groff device and sometimes also by the user's
 * locale.
 *
 * This table is expected to change over time, particularly as man pages
 * begin to move towards UTF-8. Feel free to patch this for your
 * distribution; send me updates for languages I've missed.
 *
 * Explicit encodings in the directory name (e.g. de_DE.UTF-8) should
 * override this table, but that is not implemented yet. TODO: Implement
 * this.
 */
/* One row per national language directory: the directory name and the
 * encoding in which the source man pages kept there are expected to be
 * written.
 */
struct directory_entry {
        const char *lang_dir;           /* language directory name */
        const char *source_encoding;    /* expected source encoding */
};

static struct directory_entry directory_table[] = {
        /* Languages whose pages are expected in ISO-8859-1. */
        { "C", "ISO-8859-1" },          /* English */
        { "POSIX", "ISO-8859-1" },      /* English */
        { "da", "ISO-8859-1" },         /* Danish */
        { "de", "ISO-8859-1" },         /* German */
        { "en", "ISO-8859-1" },         /* English */
        { "es", "ISO-8859-1" },         /* Spanish */
        { "fi", "ISO-8859-1" },         /* Finnish */
        { "fr", "ISO-8859-1" },         /* French */
        { "ga", "ISO-8859-1" },         /* Irish */
        { "is", "ISO-8859-1" },         /* Icelandic */
        { "it", "ISO-8859-1" },         /* Italian */
        { "nl", "ISO-8859-1" },         /* Dutch */
        { "no", "ISO-8859-1" },         /* Norwegian */
        { "pt", "ISO-8859-1" },         /* Portuguese */
        { "sv", "ISO-8859-1" },         /* Swedish */

#ifdef MULTIBYTE_GROFF
        /* The rows below are only compiled in for a patched groff that
         * provides the ascii8 and nippon devices.
         */
        { "cs", "ISO-8859-2" },         /* Czech */
        { "hu", "ISO-8859-2" },         /* Hungarian */
        { "ja", "EUC-JP" },             /* Japanese */
        { "ko", "EUC-KR" },             /* Korean */
        { "pl", "ISO-8859-2" },         /* Polish */
        { "ru", "KOI8-R" },             /* Russian */
#endif /* MULTIBYTE_GROFF */

        /* Terminating all-NULL entry. */
        { NULL, NULL }
};

/* The default groff terminal output device to be used is determined based
 * on nl_langinfo(CODESET), which returns the character set used by the
 * current locale.
 */
/* Pairs a locale character set name with the groff terminal output device
 * to use by default for that character set.
 */
struct charset_entry {
        const char *locale_charset;     /* character set of the locale */
        const char *default_device;     /* groff device chosen for it */
};

static struct charset_entry charset_table[] = {
        { "ANSI_X3.4-1968", "ascii" },
        { "ISO-8859-1", "latin1" },
        { "UTF-8", "utf8" },

#ifdef MULTIBYTE_GROFF
        /* Only present when built for the multibyte-patched groff. */
        { "EUC-JP", "nippon" },
#endif /* MULTIBYTE_GROFF */

        /* Terminating all-NULL entry. */
        { NULL, NULL }
};

/* Fallback locale character set name; this is the same name that
 * charset_table maps to the "ascii" device.
 * NOTE(review): presumably used when the locale's character set cannot be
 * determined -- confirm against the lookup code.
 */
static const char *fallback_locale_charset = "ANSI_X3.4-1968";

/* Fallback groff device: "ascii8" when built with MULTIBYTE_GROFF, plain
 * "ascii" otherwise.
 */
#ifdef MULTIBYTE_GROFF
static const char *fallback_default_device = "ascii8";
#else /* !MULTIBYTE_GROFF */
static const char *fallback_default_device = "ascii";
#endif /* MULTIBYTE_GROFF */

/* The encoding used for the text passed to groff is a function of the
 * selected groff device. Traditional devices expect ISO-8859-1 on input
 * (yes, even the utf8 device); devices added in the Debian multibyte patch
 * expect other encodings. The ascii8 device passes top-bit-set characters
 * straight through so is (probably ...) encoding-agnostic. If this encoding
 * does not match the source encoding, an iconv pipe is used (if available)
 * to perform recoding.
 *
 * Setting less_charset to latin1 tells the less pager that characters
 * between 0xA0 and 0xFF are displayable, not that its input is encoded in
 * ISO-8859-1. TODO: Perhaps using LESSCHARDEF would be better.
 */
/* Per-device settings: the encoding of the text handed to groff for that
 * device, and the character set name given to the less pager.
 */
struct device_entry {
        const char *roff_device;        /* groff output device name */
        const char *roff_encoding;      /* input encoding for the device */
        const char *less_charset;       /* charset setting for less */
};

static struct device_entry device_table[] = {
        { "ascii", "ISO-8859-1", "ascii" },
        { "latin1", "ISO-8859-1", "latin1" },
        { "utf8", "ISO-8859-1", "utf-8" },

#ifdef MULTIBYTE_GROFF
        /* ascii8 has no input encoding recorded (NULL). */
        { "ascii8", NULL, "latin1" },
        { "nippon", "EUC-JP", "ja" },
#endif /* MULTIBYTE_GROFF */

        /* Terminating all-NULL entry. */
        { NULL, NULL, NULL }
};

/* Fallback groff input encoding and less character set. These are the same
 * values as the "latin1" row of device_table.
 * NOTE(review): presumably used when the selected device has no entry in
 * device_table -- confirm against the lookup code.
 */
static const char *fallback_roff_encoding = "ISO-8859-1";
static const char *fallback_less_charset = "latin1";

-- 
Colin Watson                                  [cjwatson@flatline.org.uk]



Reply to: