Bug#929923: missing dictionaries.xcu confuses non-US English locales (e.g. en_AU)
I still advocate solving only MY problem, with a simple change:
https://bugs.debian.org/cgi-bin/bugreport.cgi?att=2;bug=929923;filename=929923.patch;msg=22
However, I also considered a complex change:
generate debian/*.links automatically from */dictionaries.xcu.
A proof-of-concept is attached.
If this is interesting, I can look at merging it into debian/helper.py.
If this is not interesting, I'm happy to just forget about it :-)
#!/usr/bin/python3
"""Create symlinks to simulate missing dictionaries.xcu.
LibreOffice provides spelling/hyphenation/thesaurus dictionaries for different language varieties (xx_YY).
When another variety is similar, and no dedicated dictionary is available, they are aliased together.
LibreOffice defines these aliases in a "dictionaries.xcu" file.
For example, de_AT has its own hyphenation dictionary, but re-uses de_DE's thesaurus dictionary.
https://sources.debian.org/src/libreoffice-dictionaries/1:6.3.0-1/dictionaries/de/dictionaries.xcu/#L46
https://sources.debian.org/src/libreoffice-dictionaries/1:6.3.0-1/dictionaries/de/dictionaries.xcu/#L80
Debian does not ship dictionaries.xcu files because
1. only LibreOffice understands them, but
other packages use the dictionaries themselves.
2. Debian packages the spelling/hyphenation/thesaurus dictionaries separately, but
dictionaries.xcu assumes they are packaged together.
If your locale is set to the original language (e.g. LANG=de_DE for
th_de_DE_v2.dat), this Just Works, because of fallback behaviour in
the individual apps (including LibreOffice).
If your locale is set to the aliased language (e.g. LANG=de_AT for
th_de_DE_v2.dat), a symlink is needed to help the app "see" the dictionary.
Prior to this script, those symlinks were created on an ad-hoc basis.
This script tries to fully automate that process, so that
1. there is less work for the Debian maintainer; and
2. behaviour is more consistent between Debian and upstream.
---Trent W. Buck, Aug 2019, https://bugs.debian.org/929923
"""
import sys
import glob
import pprint
import types
import re
import lxml.etree
def main() -> None:
    """Print a ``dh_link`` command for every (dictionary file, locale) alias.

    For each dictionary entry returned by xcu2dicts(), every file is
    combined with every locale of that entry.  The existing file is the
    link destination; the link name is the same path with the locale part
    replaced by the glibc spelling of the aliased locale.  Combinations
    where both paths are identical (the dictionary's own locale) are
    skipped.  Output goes to stdout, one tab-separated line per link.

    Raises:
        KeyError: if an entry has a format other than DICT_SPELL,
            DICT_HYPH or DICT_THES.
        ValueError: if an expanded file path does not have one of the
            recognised directory prefixes or file extensions.
    """
    # Directory substituted for the "%origin%" placeholder, keyed by format.
    origin_dirs = {'DICT_SPELL': '/usr/share/hunspell',
                   'DICT_HYPH': '/usr/share/hyphen',
                   'DICT_THES': '/usr/share/mythes'}
    # Everything up to (not including) the locale part of the file name.
    prefix_re = re.compile(
        r'('
        r'/usr/share/hunspell/|'
        r'/usr/share/hyphen/hyph_|'
        r'/usr/share/mythes/(?:th|thes|thesaurus)_'
        r').*')
    # The file extension.
    # NOTE(review): the leading '.*' is greedy, so the optional '_v2' is
    # never captured and "th_en_US_v2.dat" yields the suffix ".dat".
    # Kept as-is; confirm whether link names are meant to keep "_v2".
    suffix_re = re.compile(
        r'.*'
        r'((?:_v2)?\.(?:dic|aff|dat|idx))')

    for d in xcu2dicts():
        origin = origin_dirs[d.format]
        # d.files and d.locales are unordered sets of different sizes, so
        # they must be combined as a cross product (pairing them with zip()
        # links only some files, to arbitrary locales).  Sorting makes the
        # output reproducible from run to run.
        for location in sorted(d.files):
            # Expand the "%origin%" variable to whatever it should be.
            symlink_dst_path = location.replace('%origin%', origin)
            prefix_match = prefix_re.fullmatch(symlink_dst_path)
            suffix_match = suffix_re.fullmatch(symlink_dst_path)
            if prefix_match is None or suffix_match is None:
                raise ValueError(
                    f'unrecognised dictionary path: {symlink_dst_path!r}')
            prefix = prefix_match.group(1)
            suffix = suffix_match.group(1)
            for locale in sorted(d.locales):
                symlink_src_path = (
                    prefix +
                    IETF_locale_to_glibc_locale(locale) +
                    suffix)
                # FIXME: needs to use f'-p{package}', like helper.py:generate_installs().
                if symlink_dst_path != symlink_src_path:
                    print('',           # indent for make
                          'dh_link',
                          symlink_dst_path,
                          symlink_src_path,
                          '# ' + locale,  # comment
                          sep='\t')
# The upstream XCU use RFC 5646 notation (kmr-Latn-TR).
# The upstream dictionaries aren't completely consistent, but mostly use glibc notation (ks_IN@devanagari).
# libreoffice-dictionaries/debian/helper.py has a hand-written dict instead of this bodgy regex-replacement.
def IETF_locale_to_glibc_locale(lo_locale: str) -> str:
    """Rewrite a hyphenated locale tag into glibc-style notation.

    The rewrites are applied in order, each one to the result of the
    previous one:

    1. a "-Latn" script subtag is removed and "@latin" is appended;
    2. a trailing "-valencia" becomes "@valencia";
    3. the first remaining hyphen becomes an underscore (xx-YY -> xx_YY).

    Tags that match none of the patterns are returned unchanged.
    """
    rewrites = (
        (r'(.+)-Latn(-.+)?', r'\1\2@latin'),   # -Latn- to @latin (YUK!)
        (r'(.+)-valencia', r'\1@valencia'),    # -valencia to @valencia (YUK!)
        (r'([^-]+)-(.+)', r'\1_\2'),           # xx-YY to xx_YY
    )
    converted = lo_locale
    for pattern, replacement in rewrites:
        converted = re.sub(pattern, replacement, converted)
    return converted
# Scrape key/value pairs from the XCUs.
# Example output:
# [namespace(files={'%origin%/af_ZA.aff', '%origin%/af_ZA.dic'},
# format='DICT_SPELL',
# locales={'af-NA', 'af-ZA'}),
# namespace(files={'%origin%/hyph_af_ZA.dic'},
# format='DICT_HYPH',
# locales={'af-NA', 'af-ZA'})]
def xcu2dicts() -> list:
    """Collect dictionary definitions from dictionaries/*/dictionaries.xcu.

    Every <node> directly below a node whose oor:name is "Dictionaries"
    yields one types.SimpleNamespace with these attributes:

    format  -- the text of its single "Format" value
    files   -- set of the whitespace-separated words in its "Locations" values
    locales -- set of the whitespace-separated words in its "Locales" values

    The glob pattern is relative, so the result depends on the current
    working directory.

    Returns:
        A list of namespaces, in sorted order of the XCU file paths and,
        within a file, in the order the XPath query returns the nodes.

    Raises:
        ValueError: if a dictionary node does not have exactly one
            "Format" text value (from the single-element unpacking).
    """
    acc = []                    # accumulator
    # glob.glob() returns paths in arbitrary order; sort them so the result
    # (and anything generated from it) is in the same order on every run.
    for xcu_path in sorted(glob.glob('dictionaries/*/dictionaries.xcu')):
        xcu_obj = lxml.etree.parse(xcu_path)
        # Reuse the prefixes declared on the root element (the queries below
        # rely on "oor" being among them).
        nsmap = xcu_obj.getroot().nsmap
        for d in xcu_obj.xpath('//node[@oor:name="Dictionaries"]/node', namespaces=nsmap):
            # Single-element unpacking: exactly one Format value is required.
            dict_format, = d.xpath('./prop[@oor:name="Format"]/value/text()', namespaces=nsmap)
            files = {
                word
                for value in d.xpath('./prop[@oor:name="Locations"]/value/text()', namespaces=nsmap)
                for word in value.split()}
            locales = {
                word
                for value in d.xpath('./prop[@oor:name="Locales"]/value/text()', namespaces=nsmap)
                for word in value.split()}
            acc.append(types.SimpleNamespace(
                format=dict_format,
                files=files,
                locales=locales))
    return acc
# Generate the dh_link commands only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
dh_link /usr/share/hunspell/af_ZA.dic /usr/share/hunspell/af_NA.dic # af-NA
dh_link /usr/share/hunspell/en_GB.dic /usr/share/hunspell/en_GH.dic # en-GH
dh_link /usr/share/hunspell/en_GB.aff /usr/share/hunspell/en_BS.aff # en-BS
dh_link /usr/share/hunspell/en_ZA.aff /usr/share/hunspell/en_ZW.aff # en-ZW
dh_link /usr/share/hunspell/en_ZA.dic /usr/share/hunspell/en_NA.dic # en-NA
dh_link /usr/share/hunspell/en_US.dic /usr/share/hunspell/en_PH.dic # en-PH
dh_link /usr/share/hyphen/hyph_en_GB.dic /usr/share/hyphen/hyph_en_AU.dic # en-AU
dh_link /usr/share/hyphen/hyph_en_US.dic /usr/share/hyphen/hyph_en_PH.dic # en-PH
dh_link /usr/share/mythes/th_en_US_v2.dat /usr/share/mythes/th_en_AU.dat # en-AU
dh_link /usr/share/hunspell/gl_ES.dic /usr/share/hunspell/gl.dic # gl
dh_link /usr/share/hyphen/hyph_gl.dic /usr/share/hyphen/hyph_gl_ES.dic # gl-ES
dh_link /usr/share/mythes/thesaurus_gl.idx /usr/share/mythes/thesaurus_gl_ES.idx # gl-ES
dh_link /usr/share/hunspell/fr.dic /usr/share/hunspell/fr_FR.dic # fr-FR
dh_link /usr/share/hunspell/fr.aff /usr/share/hunspell/fr_CH.aff # fr-CH
dh_link /usr/share/hyphen/hyph_fr.dic /usr/share/hyphen/hyph_fr_FR.dic # fr-FR
dh_link /usr/share/mythes/thes_fr.dat /usr/share/mythes/thes_fr_FR.dat # fr-FR
dh_link /usr/share/mythes/thes_fr.idx /usr/share/mythes/thes_fr_CH.idx # fr-CH
dh_link /usr/share/hunspell/lt.aff /usr/share/hunspell/lt_LT.aff # lt-LT
dh_link /usr/share/hyphen/hyph_lt.dic /usr/share/hyphen/hyph_lt_LT.dic # lt-LT
dh_link /usr/share/hunspell/ar.dic /usr/share/hunspell/ar_EG.dic # ar-EG
dh_link /usr/share/hunspell/ar.aff /usr/share/hunspell/ar_DZ.aff # ar-DZ
dh_link /usr/share/mythes/th_ar.idx /usr/share/mythes/th_ar_EG.idx # ar-EG
dh_link /usr/share/mythes/th_ar.dat /usr/share/mythes/th_ar_DZ.dat # ar-DZ
dh_link /usr/share/hunspell/pt_PT.aff /usr/share/hunspell/pt_AO.aff # pt-AO
dh_link /usr/share/mythes/th_pt_PT_v2.dat /usr/share/mythes/th_pt_PT.dat # pt-PT
dh_link /usr/share/hunspell/bn_BD.dic /usr/share/hunspell/bn_IN.dic # bn-IN
dh_link /usr/share/hunspell/bs_BA.aff /usr/share/hunspell/bs.aff # bs
dh_link /usr/share/mythes/th_ro_RO_v2.dat /usr/share/mythes/th_ro_RO.dat # ro-RO
dh_link /usr/share/hunspell/tr_TR.aff /usr/share/hunspell/tr.aff # tr
dh_link /usr/share/hunspell/an_ES.aff /usr/share/hunspell/an.aff # an
dh_link /usr/share/hyphen/hyph_es_ANY.dic /usr/share/hyphen/hyph_es_NI.dic # es-NI
dh_link /usr/share/mythes/th_es_ANY_v2.dat /usr/share/mythes/th_es_NI.dat # es-NI
dh_link /usr/share/mythes/th_es_ANY_v2.idx /usr/share/mythes/th_es_CO.idx # es-CO
dh_link /usr/share/hunspell/nl_NL.dic /usr/share/hunspell/nl_BE.dic # nl-BE
dh_link /usr/share/hyphen/hyph_nl_NL.dic /usr/share/hyphen/hyph_nl_BE.dic # nl-BE
dh_link /usr/share/hunspell/is.dic /usr/share/hunspell/is_IS.dic # is-IS
dh_link /usr/share/mythes/th_is.idx /usr/share/mythes/th_is_IS.idx # is-IS
dh_link /usr/share/hunspell/bo.aff /usr/share/hunspell/bo_CN.aff # bo-CN
dh_link /usr/share/mythes/th_bg_BG_v2.dat /usr/share/mythes/th_bg_BG.dat # bg-BG
dh_link /usr/share/mythes/th_lv_LV_v2.dat /usr/share/mythes/th_lv_LV.dat # lv-LV
dh_link /usr/share/hunspell/ca.aff /usr/share/hunspell/ca_ES.aff # ca-ES
dh_link /usr/share/hunspell/ca.dic /usr/share/hunspell/ca_IT.dic # ca-IT
dh_link /usr/share/hunspell/ca-valencia.dic /usr/share/hunspell/ca_ES@valencia.dic # ca-ES-valencia
dh_link /usr/share/hyphen/hyph_ca.dic /usr/share/hyphen/hyph_ca_FR.dic # ca-FR
dh_link /usr/share/mythes/th_ca_ES_v3.idx /usr/share/mythes/th_ca_FR.idx # ca-FR
dh_link /usr/share/mythes/th_ca_ES_v3.dat /usr/share/mythes/th_ca_ES@valencia.dat # ca-ES-valencia
dh_link /usr/share/hunspell/te_IN.dic /usr/share/hunspell/te.dic # te
dh_link /usr/share/mythes/th_pl_PL_v2.dat /usr/share/mythes/th_pl_PL.dat # pl-PL
dh_link /usr/share/mythes/th_id_ID_v2.idx /usr/share/mythes/th_id_ID.idx # id-ID
dh_link /usr/share/hunspell/sr.aff /usr/share/hunspell/sr_RS.aff # sr-RS
dh_link /usr/share/hunspell/sr.dic /usr/share/hunspell/sr_CS.dic # sr-CS
dh_link /usr/share/hunspell/sr-Latn.dic /usr/share/hunspell/sr_CS@latin.dic # sr-Latn-CS
dh_link /usr/share/hunspell/sr-Latn.aff /usr/share/hunspell/sr_RS@latin.aff # sr-Latn-RS
dh_link /usr/share/hyphen/hyph_sr.dic /usr/share/hyphen/hyph_sr_RS.dic # sr-RS
dh_link /usr/share/hyphen/hyph_sr-Latn.dic /usr/share/hyphen/hyph_sr_CS@latin.dic # sr-Latn-CS
dh_link /usr/share/hyphen/hyph_sv.dic /usr/share/hyphen/hyph_sv_SE.dic # sv-SE
dh_link /usr/share/mythes/th_hu_HU_v2.idx /usr/share/mythes/th_hu_HU.idx # hu-HU
dh_link /usr/share/mythes/th_ru_RU_v2.idx /usr/share/mythes/th_ru_RU.idx # ru-RU
dh_link /usr/share/mythes/th_it_IT_v2.dat /usr/share/mythes/th_it_IT.dat # it-IT
dh_link /usr/share/mythes/th_nb_NO_v2.dat /usr/share/mythes/th_nb_NO.dat # nb-NO
dh_link /usr/share/mythes/th_nn_NO_v2.dat /usr/share/mythes/th_nn_NO.dat # nn-NO
dh_link /usr/share/mythes/th_sk_SK_v2.idx /usr/share/mythes/th_sk_SK.idx # sk-SK
dh_link /usr/share/hunspell/gug.aff /usr/share/hunspell/gug_PY.aff # gug-PY
dh_link /usr/share/hunspell/kmr_Latn.aff /usr/share/hunspell/kmr_SY@latin.aff # kmr-Latn-SY
dh_link /usr/share/hunspell/kmr_Latn.dic /usr/share/hunspell/kmr_TR@latin.dic # kmr-Latn-TR
dh_link /usr/share/hunspell/de_AT_frami.aff /usr/share/hunspell/de_AT.aff # de-AT
dh_link /usr/share/hunspell/de_CH_frami.dic /usr/share/hunspell/de_CH.dic # de-CH
dh_link /usr/share/hunspell/de_DE_frami.aff /usr/share/hunspell/de_DE.aff # de-DE
dh_link /usr/share/mythes/th_de_DE_v2.dat /usr/share/mythes/th_de_AT.dat # de-AT
dh_link /usr/share/mythes/th_de_CH_v2.dat /usr/share/mythes/th_de_CH.dat # de-CH
dh_link /usr/share/mythes/th_sl_SI_v2.dat /usr/share/mythes/th_sl_SI.dat # sl-SI
dh_link /usr/share/mythes/th_ne_NP_v2.dat /usr/share/mythes/th_ne_NP.dat # ne-NP
Reply to: