[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#929923: missing dictionaries.xcu confuses non-US English locales (e.g. en_AU)



I still advocate solving only MY problem, with a simple change:

    https://bugs.debian.org/cgi-bin/bugreport.cgi?att=2;bug=929923;filename=929923.patch;msg=22

However, I also considered a complex change:
generate debian/*.links automatically from */dictionaries.xcu.

A proof-of-concept is attached.
If this is interesting, I can look at merging it into debian/helper.py.
If this is not interesting, I'm happy to just forget about it :-)
#!/usr/bin/python3

"""Create symlinks to simulate missing dictionaries.xcu.

LibreOffice provides spelling/hyphenation/thesaurus dictionaries for different language varieties (xx_YY).
When another variety is similar, and no dedicated dictionary is available, they are aliased together.
LibreOffice defines these aliases in a "dictionaries.xcu" file.

For example, de_AT has its own hyphenation dictionary, but re-uses de_DE's thesaurus dictionary.
https://sources.debian.org/src/libreoffice-dictionaries/1:6.3.0-1/dictionaries/de/dictionaries.xcu/#L46
https://sources.debian.org/src/libreoffice-dictionaries/1:6.3.0-1/dictionaries/de/dictionaries.xcu/#L80

Debian does not ship dictionaries.xcu files because

 1. only LibreOffice understands them, but
    other packages use the dictionaries themselves.

 2. Debian packages the spelling/hyphenation/thesaurus dictionaries separately, but
    dictionaries.xcu assumes they are packaged together.

If your locale is set to the original language (e.g. LANG=de_DE for
th_de_DE_v2.dat), this Just Works, because of fallback behaviour in
the individual apps (including LibreOffice).

If your locale is set to the aliased language (e.g. LANG=de_AT for
th_de_DE_v2.dat), a symlink is needed to help the app "see" the dictionary.

Prior to this script, those symlinks were created on an ad-hoc basis.
This script tries to fully automate that process, so that

 1. there is less work for the Debian maintainer; and
 2. behaviour is more consistent between Debian and upstream.

---Trent W. Buck, Aug 2019, https://bugs.debian.org/929923
"""

import sys
import glob
import pprint
import types
import re

import lxml.etree


def main() -> None:
    """Print a ``dh_link`` command for every alias symlink the XCUs imply.

    For each dictionary entry returned by xcu2dicts(), every file of the
    entry is linked under every locale of the entry.  The link name is built
    from the file's directory/prefix (e.g. ``/usr/share/hyphen/hyph_``), the
    glibc spelling of the locale, and the file's suffix (e.g. ``_v2.dat``).
    Links that would point at themselves are skipped.

    Output goes to stdout, one tab-separated line per link, with a leading
    empty field so each line starts with a tab (indent for make).

    Raises:
        KeyError: if an entry's format is not DICT_SPELL/DICT_HYPH/DICT_THES.
        ValueError: if a file path does not have a recognised prefix/suffix.
    """
    # Where each dictionary format is installed; substituted for "%origin%".
    install_dirs = {'DICT_SPELL': '/usr/share/hunspell',
                    'DICT_HYPH': '/usr/share/hyphen',
                    'DICT_THES': '/usr/share/mythes'}
    # Everything up to (not including) the locale part of the file name.
    prefix_re = re.compile(
        r'('
        r'/usr/share/hunspell/|'
        r'/usr/share/hyphen/hyph_|'
        r'/usr/share/mythes/(?:th|thes|thesaurus)_'
        r').*')
    # Everything after the locale part of the file name.
    suffix_re = re.compile(
        r'.*'
        r'((?:_v2)?\.(?:dic|aff|dat|idx))')

    for d in xcu2dicts():
        origin = install_dirs[d.format]
        # d.files and d.locales are unordered sets, so they must not be
        # paired element-by-element: EVERY file (e.g. both .aff and .dic)
        # needs a link for EVERY locale.  Sorting keeps the output stable
        # from one run to the next.
        for f in sorted(d.files):
            # Expand the "%origin%" variable to whatever it should be.
            target_path = f.replace('%origin%', origin)
            prefix_match = prefix_re.fullmatch(target_path)
            suffix_match = suffix_re.fullmatch(target_path)
            if prefix_match is None or suffix_match is None:
                raise ValueError(
                    'unrecognised dictionary file name: ' + target_path)
            prefix = prefix_match.group(1)
            suffix = suffix_match.group(1)

            for locale in sorted(d.locales):
                link_path = (
                    prefix +
                    IETF_locale_to_glibc_locale(locale) +
                    suffix)

                # FIXME: needs to use f'-p{package}', like helper.py:generate_installs().
                if target_path != link_path:
                    print('',       # indent for make
                          'dh_link',
                          target_path,   # existing file
                          link_path,     # symlink to create
                          '# ' + locale,   # comment
                          sep='\t')


# The upstream XCU files use RFC 5646 notation (kmr-Latn-TR).
# The upstream dictionaries aren't completely consistent, but mostly use glibc notation (ks_IN@devanagari).
# libreoffice-dictionaries/debian/helper.py has a hand-written dict instead of this bodgy regex-replacement.
def IETF_locale_to_glibc_locale(lo_locale: str) -> str:
    """Rewrite an XCU locale tag into the spelling used in dictionary file names.

    Three regex substitutions are applied in order, each to the result of
    the previous one.  Examples:

        kmr-Latn-TR     ->  kmr_TR@latin
        ca-ES-valencia  ->  ca_ES@valencia
        en-AU           ->  en_AU
        gl              ->  gl      (nothing to rewrite)
    """
    rewrites = (
        # Change -Latn- to @latin  (YUK!)
        (r'(.+)-Latn(-.+)?', r'\1\2@latin'),
        # Change -valencia to @valencia  (YUK!)
        (r'(.+)-valencia', r'\1@valencia'),
        # Change xx-YY to xx_YY
        (r'([^-]+)-(.+)', r'\1_\2'),
    )
    converted = lo_locale
    for pattern, replacement in rewrites:
        converted = re.sub(pattern, replacement, converted)
    return converted


def xcu2dicts() -> list:
    """Scrape key/value pairs from the XCUs.

    Parses every file matching ``dictionaries/*/dictionaries.xcu`` (relative
    to the current directory) and returns one namespace per child node of a
    node named "Dictionaries".  Each namespace has:

        format   the single text value of the "Format" prop
        files    set of whitespace-separated words from the "Locations" prop
        locales  set of whitespace-separated words from the "Locales" prop

    Example output:

        [namespace(files={'%origin%/af_ZA.aff', '%origin%/af_ZA.dic'},
                   format='DICT_SPELL',
                   locales={'af-NA', 'af-ZA'}),
         namespace(files={'%origin%/hyph_af_ZA.dic'},
                   format='DICT_HYPH',
                   locales={'af-NA', 'af-ZA'})]

    Raises ValueError if an entry does not have exactly one "Format" value.
    """
    def words(node, expr, ns):
        # Split every matched text value on whitespace and pool the pieces.
        return {
            word
            for text in node.xpath(expr, namespaces=ns)
            for word in text.split()}

    found = []
    for path in glob.glob('dictionaries/*/dictionaries.xcu'):
        tree = lxml.etree.parse(path)
        ns = tree.getroot().nsmap
        entries = tree.xpath('//node[@oor:name="Dictionaries"]/node', namespaces=ns)
        for entry in entries:
            # Unpacking into a one-element list insists on exactly one value.
            [fmt] = entry.xpath('./prop[@oor:name="Format"]/value/text()', namespaces=ns)
            file_set = words(entry, './prop[@oor:name="Locations"]/value/text()', ns)
            locale_set = words(entry, './prop[@oor:name="Locales"]/value/text()', ns)
            found.append(types.SimpleNamespace(
                format=fmt,
                files=file_set,
                locales=locale_set))
    return found


if __name__ == '__main__':
    main()
	dh_link	/usr/share/hunspell/af_ZA.dic	/usr/share/hunspell/af_NA.dic	# af-NA
	dh_link	/usr/share/hunspell/en_GB.dic	/usr/share/hunspell/en_GH.dic	# en-GH
	dh_link	/usr/share/hunspell/en_GB.aff	/usr/share/hunspell/en_BS.aff	# en-BS
	dh_link	/usr/share/hunspell/en_ZA.aff	/usr/share/hunspell/en_ZW.aff	# en-ZW
	dh_link	/usr/share/hunspell/en_ZA.dic	/usr/share/hunspell/en_NA.dic	# en-NA
	dh_link	/usr/share/hunspell/en_US.dic	/usr/share/hunspell/en_PH.dic	# en-PH
	dh_link	/usr/share/hyphen/hyph_en_GB.dic	/usr/share/hyphen/hyph_en_AU.dic	# en-AU
	dh_link	/usr/share/hyphen/hyph_en_US.dic	/usr/share/hyphen/hyph_en_PH.dic	# en-PH
	dh_link	/usr/share/mythes/th_en_US_v2.dat	/usr/share/mythes/th_en_AU.dat	# en-AU
	dh_link	/usr/share/hunspell/gl_ES.dic	/usr/share/hunspell/gl.dic	# gl
	dh_link	/usr/share/hyphen/hyph_gl.dic	/usr/share/hyphen/hyph_gl_ES.dic	# gl-ES
	dh_link	/usr/share/mythes/thesaurus_gl.idx	/usr/share/mythes/thesaurus_gl_ES.idx	# gl-ES
	dh_link	/usr/share/hunspell/fr.dic	/usr/share/hunspell/fr_FR.dic	# fr-FR
	dh_link	/usr/share/hunspell/fr.aff	/usr/share/hunspell/fr_CH.aff	# fr-CH
	dh_link	/usr/share/hyphen/hyph_fr.dic	/usr/share/hyphen/hyph_fr_FR.dic	# fr-FR
	dh_link	/usr/share/mythes/thes_fr.dat	/usr/share/mythes/thes_fr_FR.dat	# fr-FR
	dh_link	/usr/share/mythes/thes_fr.idx	/usr/share/mythes/thes_fr_CH.idx	# fr-CH
	dh_link	/usr/share/hunspell/lt.aff	/usr/share/hunspell/lt_LT.aff	# lt-LT
	dh_link	/usr/share/hyphen/hyph_lt.dic	/usr/share/hyphen/hyph_lt_LT.dic	# lt-LT
	dh_link	/usr/share/hunspell/ar.dic	/usr/share/hunspell/ar_EG.dic	# ar-EG
	dh_link	/usr/share/hunspell/ar.aff	/usr/share/hunspell/ar_DZ.aff	# ar-DZ
	dh_link	/usr/share/mythes/th_ar.idx	/usr/share/mythes/th_ar_EG.idx	# ar-EG
	dh_link	/usr/share/mythes/th_ar.dat	/usr/share/mythes/th_ar_DZ.dat	# ar-DZ
	dh_link	/usr/share/hunspell/pt_PT.aff	/usr/share/hunspell/pt_AO.aff	# pt-AO
	dh_link	/usr/share/mythes/th_pt_PT_v2.dat	/usr/share/mythes/th_pt_PT.dat	# pt-PT
	dh_link	/usr/share/hunspell/bn_BD.dic	/usr/share/hunspell/bn_IN.dic	# bn-IN
	dh_link	/usr/share/hunspell/bs_BA.aff	/usr/share/hunspell/bs.aff	# bs
	dh_link	/usr/share/mythes/th_ro_RO_v2.dat	/usr/share/mythes/th_ro_RO.dat	# ro-RO
	dh_link	/usr/share/hunspell/tr_TR.aff	/usr/share/hunspell/tr.aff	# tr
	dh_link	/usr/share/hunspell/an_ES.aff	/usr/share/hunspell/an.aff	# an
	dh_link	/usr/share/hyphen/hyph_es_ANY.dic	/usr/share/hyphen/hyph_es_NI.dic	# es-NI
	dh_link	/usr/share/mythes/th_es_ANY_v2.dat	/usr/share/mythes/th_es_NI.dat	# es-NI
	dh_link	/usr/share/mythes/th_es_ANY_v2.idx	/usr/share/mythes/th_es_CO.idx	# es-CO
	dh_link	/usr/share/hunspell/nl_NL.dic	/usr/share/hunspell/nl_BE.dic	# nl-BE
	dh_link	/usr/share/hyphen/hyph_nl_NL.dic	/usr/share/hyphen/hyph_nl_BE.dic	# nl-BE
	dh_link	/usr/share/hunspell/is.dic	/usr/share/hunspell/is_IS.dic	# is-IS
	dh_link	/usr/share/mythes/th_is.idx	/usr/share/mythes/th_is_IS.idx	# is-IS
	dh_link	/usr/share/hunspell/bo.aff	/usr/share/hunspell/bo_CN.aff	# bo-CN
	dh_link	/usr/share/mythes/th_bg_BG_v2.dat	/usr/share/mythes/th_bg_BG.dat	# bg-BG
	dh_link	/usr/share/mythes/th_lv_LV_v2.dat	/usr/share/mythes/th_lv_LV.dat	# lv-LV
	dh_link	/usr/share/hunspell/ca.aff	/usr/share/hunspell/ca_ES.aff	# ca-ES
	dh_link	/usr/share/hunspell/ca.dic	/usr/share/hunspell/ca_IT.dic	# ca-IT
	dh_link	/usr/share/hunspell/ca-valencia.dic	/usr/share/hunspell/ca_ES@valencia.dic	# ca-ES-valencia
	dh_link	/usr/share/hyphen/hyph_ca.dic	/usr/share/hyphen/hyph_ca_FR.dic	# ca-FR
	dh_link	/usr/share/mythes/th_ca_ES_v3.idx	/usr/share/mythes/th_ca_FR.idx	# ca-FR
	dh_link	/usr/share/mythes/th_ca_ES_v3.dat	/usr/share/mythes/th_ca_ES@valencia.dat	# ca-ES-valencia
	dh_link	/usr/share/hunspell/te_IN.dic	/usr/share/hunspell/te.dic	# te
	dh_link	/usr/share/mythes/th_pl_PL_v2.dat	/usr/share/mythes/th_pl_PL.dat	# pl-PL
	dh_link	/usr/share/mythes/th_id_ID_v2.idx	/usr/share/mythes/th_id_ID.idx	# id-ID
	dh_link	/usr/share/hunspell/sr.aff	/usr/share/hunspell/sr_RS.aff	# sr-RS
	dh_link	/usr/share/hunspell/sr.dic	/usr/share/hunspell/sr_CS.dic	# sr-CS
	dh_link	/usr/share/hunspell/sr-Latn.dic	/usr/share/hunspell/sr_CS@latin.dic	# sr-Latn-CS
	dh_link	/usr/share/hunspell/sr-Latn.aff	/usr/share/hunspell/sr_RS@latin.aff	# sr-Latn-RS
	dh_link	/usr/share/hyphen/hyph_sr.dic	/usr/share/hyphen/hyph_sr_RS.dic	# sr-RS
	dh_link	/usr/share/hyphen/hyph_sr-Latn.dic	/usr/share/hyphen/hyph_sr_CS@latin.dic	# sr-Latn-CS
	dh_link	/usr/share/hyphen/hyph_sv.dic	/usr/share/hyphen/hyph_sv_SE.dic	# sv-SE
	dh_link	/usr/share/mythes/th_hu_HU_v2.idx	/usr/share/mythes/th_hu_HU.idx	# hu-HU
	dh_link	/usr/share/mythes/th_ru_RU_v2.idx	/usr/share/mythes/th_ru_RU.idx	# ru-RU
	dh_link	/usr/share/mythes/th_it_IT_v2.dat	/usr/share/mythes/th_it_IT.dat	# it-IT
	dh_link	/usr/share/mythes/th_nb_NO_v2.dat	/usr/share/mythes/th_nb_NO.dat	# nb-NO
	dh_link	/usr/share/mythes/th_nn_NO_v2.dat	/usr/share/mythes/th_nn_NO.dat	# nn-NO
	dh_link	/usr/share/mythes/th_sk_SK_v2.idx	/usr/share/mythes/th_sk_SK.idx	# sk-SK
	dh_link	/usr/share/hunspell/gug.aff	/usr/share/hunspell/gug_PY.aff	# gug-PY
	dh_link	/usr/share/hunspell/kmr_Latn.aff	/usr/share/hunspell/kmr_SY@latin.aff	# kmr-Latn-SY
	dh_link	/usr/share/hunspell/kmr_Latn.dic	/usr/share/hunspell/kmr_TR@latin.dic	# kmr-Latn-TR
	dh_link	/usr/share/hunspell/de_AT_frami.aff	/usr/share/hunspell/de_AT.aff	# de-AT
	dh_link	/usr/share/hunspell/de_CH_frami.dic	/usr/share/hunspell/de_CH.dic	# de-CH
	dh_link	/usr/share/hunspell/de_DE_frami.aff	/usr/share/hunspell/de_DE.aff	# de-DE
	dh_link	/usr/share/mythes/th_de_DE_v2.dat	/usr/share/mythes/th_de_AT.dat	# de-AT
	dh_link	/usr/share/mythes/th_de_CH_v2.dat	/usr/share/mythes/th_de_CH.dat	# de-CH
	dh_link	/usr/share/mythes/th_sl_SI_v2.dat	/usr/share/mythes/th_sl_SI.dat	# sl-SI
	dh_link	/usr/share/mythes/th_ne_NP_v2.dat	/usr/share/mythes/th_ne_NP.dat	# ne-NP

Reply to: