[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#1017852: libc6: C locale is 7-bit (128 characters), must be 8-bit (256 characters) since POSIX Issue 7 TC2/Issue 8



Package: libc6
Version: 2.33-8
Severity: important

Dear Maintainer,

Consider the following reproducer:
-- >8 --
#include <stdio.h>
#include <wchar.h>
#include <locale.h>

/*
 * Reproducer: feed every byte value 0x00..0xFF to mbrtowc() as a one-byte
 * input and print, per byte, the return value and the resulting wide
 * character.
 *
 * When invoked with at least one argument, the locale is first taken from
 * the environment via setlocale(LC_ALL, "") and its name is written to
 * stderr; with no arguments setlocale() is never called.
 */
int main(int argc, char **argv) {
	(void)argv;	/* only the argument count is used */

	if(argc > 1) {
		const char *loc = setlocale(LC_ALL, "");
		/* setlocale() returns NULL on failure; NULL must not reach %s */
		fprintf(stderr, "loc=%s\n", loc ? loc : "(null)");
	}

	for(int i = 0; i <= 0xFF; ++i) {
		char bs[] = {(char)i, 0};
		mbstate_t ctx = {0};	/* fresh conversion state for every byte */
		wchar_t wc = -1;	/* stays -1 when mbrtowc() stores nothing */
		/* mbrtowc() returns size_t; convert so (size_t)-1 prints as -1 with %d */
		printf("%02x: %d, ", (unsigned)i, (int)mbrtowc(&wc, bs, 1, &ctx));
		/* wchar_t's underlying type is implementation-defined; match %ld */
		printf("%ld\n", (long)wc);
	}
}
-- >8 --

Yielding the following output:
-- >8 --
$ ./b | paste - - - - - - - -
00: 0, 0	01: 1, 1	02: 1, 2	03: 1, 3	04: 1, 4	05: 1, 5	06: 1, 6	07: 1, 7
08: 1, 8	09: 1, 9	0a: 1, 10	0b: 1, 11	0c: 1, 12	0d: 1, 13	0e: 1, 14	0f: 1, 15
10: 1, 16	11: 1, 17	12: 1, 18	13: 1, 19	14: 1, 20	15: 1, 21	16: 1, 22	17: 1, 23
18: 1, 24	19: 1, 25	1a: 1, 26	1b: 1, 27	1c: 1, 28	1d: 1, 29	1e: 1, 30	1f: 1, 31
20: 1, 32	21: 1, 33	22: 1, 34	23: 1, 35	24: 1, 36	25: 1, 37	26: 1, 38	27: 1, 39
28: 1, 40	29: 1, 41	2a: 1, 42	2b: 1, 43	2c: 1, 44	2d: 1, 45	2e: 1, 46	2f: 1, 47
30: 1, 48	31: 1, 49	32: 1, 50	33: 1, 51	34: 1, 52	35: 1, 53	36: 1, 54	37: 1, 55
38: 1, 56	39: 1, 57	3a: 1, 58	3b: 1, 59	3c: 1, 60	3d: 1, 61	3e: 1, 62	3f: 1, 63
40: 1, 64	41: 1, 65	42: 1, 66	43: 1, 67	44: 1, 68	45: 1, 69	46: 1, 70	47: 1, 71
48: 1, 72	49: 1, 73	4a: 1, 74	4b: 1, 75	4c: 1, 76	4d: 1, 77	4e: 1, 78	4f: 1, 79
50: 1, 80	51: 1, 81	52: 1, 82	53: 1, 83	54: 1, 84	55: 1, 85	56: 1, 86	57: 1, 87
58: 1, 88	59: 1, 89	5a: 1, 90	5b: 1, 91	5c: 1, 92	5d: 1, 93	5e: 1, 94	5f: 1, 95
60: 1, 96	61: 1, 97	62: 1, 98	63: 1, 99	64: 1, 100	65: 1, 101	66: 1, 102	67: 1, 103
68: 1, 104	69: 1, 105	6a: 1, 106	6b: 1, 107	6c: 1, 108	6d: 1, 109	6e: 1, 110	6f: 1, 111
70: 1, 112	71: 1, 113	72: 1, 114	73: 1, 115	74: 1, 116	75: 1, 117	76: 1, 118	77: 1, 119
78: 1, 120	79: 1, 121	7a: 1, 122	7b: 1, 123	7c: 1, 124	7d: 1, 125	7e: 1, 126	7f: 1, 127
80: -1, -1	81: -1, -1	82: -1, -1	83: -1, -1	84: -1, -1	85: -1, -1	86: -1, -1	87: -1, -1
88: -1, -1	89: -1, -1	8a: -1, -1	8b: -1, -1	8c: -1, -1	8d: -1, -1	8e: -1, -1	8f: -1, -1
90: -1, -1	91: -1, -1	92: -1, -1	93: -1, -1	94: -1, -1	95: -1, -1	96: -1, -1	97: -1, -1
98: -1, -1	99: -1, -1	9a: -1, -1	9b: -1, -1	9c: -1, -1	9d: -1, -1	9e: -1, -1	9f: -1, -1
a0: -1, -1	a1: -1, -1	a2: -1, -1	a3: -1, -1	a4: -1, -1	a5: -1, -1	a6: -1, -1	a7: -1, -1
a8: -1, -1	a9: -1, -1	aa: -1, -1	ab: -1, -1	ac: -1, -1	ad: -1, -1	ae: -1, -1	af: -1, -1
b0: -1, -1	b1: -1, -1	b2: -1, -1	b3: -1, -1	b4: -1, -1	b5: -1, -1	b6: -1, -1	b7: -1, -1
b8: -1, -1	b9: -1, -1	ba: -1, -1	bb: -1, -1	bc: -1, -1	bd: -1, -1	be: -1, -1	bf: -1, -1
c0: -1, -1	c1: -1, -1	c2: -1, -1	c3: -1, -1	c4: -1, -1	c5: -1, -1	c6: -1, -1	c7: -1, -1
c8: -1, -1	c9: -1, -1	ca: -1, -1	cb: -1, -1	cc: -1, -1	cd: -1, -1	ce: -1, -1	cf: -1, -1
d0: -1, -1	d1: -1, -1	d2: -1, -1	d3: -1, -1	d4: -1, -1	d5: -1, -1	d6: -1, -1	d7: -1, -1
d8: -1, -1	d9: -1, -1	da: -1, -1	db: -1, -1	dc: -1, -1	dd: -1, -1	de: -1, -1	df: -1, -1
e0: -1, -1	e1: -1, -1	e2: -1, -1	e3: -1, -1	e4: -1, -1	e5: -1, -1	e6: -1, -1	e7: -1, -1
e8: -1, -1	e9: -1, -1	ea: -1, -1	eb: -1, -1	ec: -1, -1	ed: -1, -1	ee: -1, -1	ef: -1, -1
f0: -1, -1	f1: -1, -1	f2: -1, -1	f3: -1, -1	f4: -1, -1	f5: -1, -1	f6: -1, -1	f7: -1, -1
f8: -1, -1	f9: -1, -1	fa: -1, -1	fb: -1, -1	fc: -1, -1	fd: -1, -1	fe: -1, -1	ff: -1, -1

$ LC_ALL=POSIX ./b _ | paste - - - - - - - -
loc=C
00: 0, 0	01: 1, 1	02: 1, 2	03: 1, 3	04: 1, 4	05: 1, 5	06: 1, 6	07: 1, 7
08: 1, 8	09: 1, 9	0a: 1, 10	0b: 1, 11	0c: 1, 12	0d: 1, 13	0e: 1, 14	0f: 1, 15
10: 1, 16	11: 1, 17	12: 1, 18	13: 1, 19	14: 1, 20	15: 1, 21	16: 1, 22	17: 1, 23
18: 1, 24	19: 1, 25	1a: 1, 26	1b: 1, 27	1c: 1, 28	1d: 1, 29	1e: 1, 30	1f: 1, 31
20: 1, 32	21: 1, 33	22: 1, 34	23: 1, 35	24: 1, 36	25: 1, 37	26: 1, 38	27: 1, 39
28: 1, 40	29: 1, 41	2a: 1, 42	2b: 1, 43	2c: 1, 44	2d: 1, 45	2e: 1, 46	2f: 1, 47
30: 1, 48	31: 1, 49	32: 1, 50	33: 1, 51	34: 1, 52	35: 1, 53	36: 1, 54	37: 1, 55
38: 1, 56	39: 1, 57	3a: 1, 58	3b: 1, 59	3c: 1, 60	3d: 1, 61	3e: 1, 62	3f: 1, 63
40: 1, 64	41: 1, 65	42: 1, 66	43: 1, 67	44: 1, 68	45: 1, 69	46: 1, 70	47: 1, 71
48: 1, 72	49: 1, 73	4a: 1, 74	4b: 1, 75	4c: 1, 76	4d: 1, 77	4e: 1, 78	4f: 1, 79
50: 1, 80	51: 1, 81	52: 1, 82	53: 1, 83	54: 1, 84	55: 1, 85	56: 1, 86	57: 1, 87
58: 1, 88	59: 1, 89	5a: 1, 90	5b: 1, 91	5c: 1, 92	5d: 1, 93	5e: 1, 94	5f: 1, 95
60: 1, 96	61: 1, 97	62: 1, 98	63: 1, 99	64: 1, 100	65: 1, 101	66: 1, 102	67: 1, 103
68: 1, 104	69: 1, 105	6a: 1, 106	6b: 1, 107	6c: 1, 108	6d: 1, 109	6e: 1, 110	6f: 1, 111
70: 1, 112	71: 1, 113	72: 1, 114	73: 1, 115	74: 1, 116	75: 1, 117	76: 1, 118	77: 1, 119
78: 1, 120	79: 1, 121	7a: 1, 122	7b: 1, 123	7c: 1, 124	7d: 1, 125	7e: 1, 126	7f: 1, 127
80: -1, -1	81: -1, -1	82: -1, -1	83: -1, -1	84: -1, -1	85: -1, -1	86: -1, -1	87: -1, -1
88: -1, -1	89: -1, -1	8a: -1, -1	8b: -1, -1	8c: -1, -1	8d: -1, -1	8e: -1, -1	8f: -1, -1
90: -1, -1	91: -1, -1	92: -1, -1	93: -1, -1	94: -1, -1	95: -1, -1	96: -1, -1	97: -1, -1
98: -1, -1	99: -1, -1	9a: -1, -1	9b: -1, -1	9c: -1, -1	9d: -1, -1	9e: -1, -1	9f: -1, -1
a0: -1, -1	a1: -1, -1	a2: -1, -1	a3: -1, -1	a4: -1, -1	a5: -1, -1	a6: -1, -1	a7: -1, -1
a8: -1, -1	a9: -1, -1	aa: -1, -1	ab: -1, -1	ac: -1, -1	ad: -1, -1	ae: -1, -1	af: -1, -1
b0: -1, -1	b1: -1, -1	b2: -1, -1	b3: -1, -1	b4: -1, -1	b5: -1, -1	b6: -1, -1	b7: -1, -1
b8: -1, -1	b9: -1, -1	ba: -1, -1	bb: -1, -1	bc: -1, -1	bd: -1, -1	be: -1, -1	bf: -1, -1
c0: -1, -1	c1: -1, -1	c2: -1, -1	c3: -1, -1	c4: -1, -1	c5: -1, -1	c6: -1, -1	c7: -1, -1
c8: -1, -1	c9: -1, -1	ca: -1, -1	cb: -1, -1	cc: -1, -1	cd: -1, -1	ce: -1, -1	cf: -1, -1
d0: -1, -1	d1: -1, -1	d2: -1, -1	d3: -1, -1	d4: -1, -1	d5: -1, -1	d6: -1, -1	d7: -1, -1
d8: -1, -1	d9: -1, -1	da: -1, -1	db: -1, -1	dc: -1, -1	dd: -1, -1	de: -1, -1	df: -1, -1
e0: -1, -1	e1: -1, -1	e2: -1, -1	e3: -1, -1	e4: -1, -1	e5: -1, -1	e6: -1, -1	e7: -1, -1
e8: -1, -1	e9: -1, -1	ea: -1, -1	eb: -1, -1	ec: -1, -1	ed: -1, -1	ee: -1, -1	ef: -1, -1
f0: -1, -1	f1: -1, -1	f2: -1, -1	f3: -1, -1	f4: -1, -1	f5: -1, -1	f6: -1, -1	f7: -1, -1
f8: -1, -1	f9: -1, -1	fa: -1, -1	fb: -1, -1	fc: -1, -1	fd: -1, -1	fe: -1, -1	ff: -1, -1
-- >8 --

This breaks all programs that expect to process text/data portably,
since under LC_ALL=C half of all byte values (0x80-0xFF) are rejected as
invalid and so collapse to one and the same character
(for sort this means that they all collate equally, &c., &c.)!

Consider a diff of XBD 6.2 ("Character Encoding"), Issue 7 vs Issue 7 TC2:
-- >8 --
@@ -1768,9 +1664,13 @@

 <h3><a name="tag_06_02">   6.2 </a>Character Encoding</h3>

-<p>The POSIX locale contains the characters in <a href="#tagtcjh_3">Portable Character Set</a> , which have the properties listed
-in <a href="../basedefs/V1_chap07.html#tag_07_03_01"><i>LC_CTYPE</i></a> . In other locales, the presence, meaning, and
-representation of any additional characters are locale-specific.</p>
+<p>The POSIX locale shall contain 256 single-byte characters including the characters in <a href="#tagtcjh_3">Portable Character
+Set</a> and <a href="#tagtcjh_4">Non-Portable Control Characters</a>, which have the properties listed in <a href=
+"../basedefs/V1_chap07.html#tag_07_03_01"><i>LC_CTYPE</i></a>. It is unspecified whether characters not listed in those two tables
+are classified as <b>punct</b> or <b>cntrl</b>, or neither. Other locales shall contain the characters in <a href=
+"#tagtcjh_3">Portable Character Set</a> and may contain any or all of the control characters identified in <a href=
+"#tagtcjh_4">Non-Portable Control Characters</a>; the presence, meaning, and representation of any additional characters are
+locale-specific.</p>

 <p>In locales other than the POSIX locale, a character may have a state-dependent encoding. There are two types of these
 encodings:</p>
-- >8 --

This text is further backed up by changes made throughout the rest of the standard, originating from bug 674:
  > An invalid character sequence is detected. In the POSIX locale an EILSEQ error cannot occur since all byte values are valid characters.[/CX]
  > In the POSIX locale each byte is a valid single-byte character, and therefore this problem is avoided.
&c.
This text is unchanged in Issue 8 Draft 2.1.

Agonised,
наб

-- System Information:
Debian Release: bookworm/sid
  APT prefers unstable
  APT policy: (500, 'unstable')
Architecture: x32 (x86_64)
Foreign Architectures: amd64, i386

Kernel: Linux 5.18.0-3-amd64 (SMP w/2 CPU threads; PREEMPT)
Kernel taint flags: TAINT_PROPRIETARY_MODULE, TAINT_OOT_MODULE, TAINT_UNSIGNED_MODULE
Locale: LANG=en_GB.UTF-8, LC_CTYPE=en_GB.UTF-8 (charmap=UTF-8), LANGUAGE not set
Shell: /bin/sh linked to /usr/bin/dash
Init: systemd (via /run/systemd/system)
LSM: AppArmor: enabled

Versions of packages libc6 depends on:
ii  libgcc-s1  12.1.0-2

Versions of packages libc6 recommends:
ii  libidn2-0  2.3.3-1

Versions of packages libc6 suggests:
ii  debconf [debconf-2.0]  1.5.79
pn  glibc-doc              <none>
ii  libc-l10n              2.33-8
ii  libnss-nis             3.1-4
ii  libnss-nisplus         1.3-4
ii  locales                2.33-8

-- debconf information excluded

Attachment: signature.asc
Description: PGP signature


Reply to: