[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: lists.debian.org de-localization



Hi,

From: Josip Rodin <joy@gkvk.hr>
Subject: Re: lists.debian.org de-localization
Date: Sun, 12 Jan 2003 04:14:45 +0100

> This, on the other hand, is a hassle to handle (backporting or installation
> into subdirs). master.d.o is scheduled to be upgraded to woody after samosa.
> That's all I know. <shrug>

This is good news.  I will then work on support for various encodings later.

Anyway, I don't expect the new master.d.o to have the development version
of MHonArc (with the encoding-assuming feature for raw 8bit headers), even
if MHonArc is installed from a non-Debian-package version.  Thus I think we
will need some method of handling raw 8bit headers.

Here is a "filter" that converts 8bit characters (assumed to be KOI8-R) into
"&#xxxx;" expressions, which I wrote by imitating iso8859.pl, CharEnt.pm,
and UTF8.pm.  This filter is used for raw 7bit/8bit strings.  Since the
7bit part of KOI8-R is identical to ASCII, it does no harm to valid ASCII
headers.  The filter is to be installed as
org/lists.debian.org/mhonarc/share/mhonarc/MHonArc/DEBIAN.pm and doesn't
depend on the version of MHonArc or Debian.
##  DEBIAN.pm by Tomohiro KUBOTA <kubota@debian.org>
##
##  CHARSETCONVERTER module that assumes the input string is KOI8-R
##  and converts it into &#xxx; expressions, where xxx is the decimal
##  Unicode codepoint.

##  The package name has to agree with the routine name registered in the
##  <CharsetConverters> resource (MHonArc::DEBIAN::koi8r2sgml) and with the
##  install path MHonArc/DEBIAN.pm; with a bare "package DEBIAN;" the
##  registered routine name would not exist.
package MHonArc::DEBIAN;

##  Characters in the 7bit range that must be escaped in the output.
##  Keys are character codes, values are the entity references that
##  replace them.  Every other 7bit character is passed through unchanged
##  by koi8r2sgml.
%US_ASCII_To_Ent = (
  #--------------------------------------------------------------------------
  # Hex Code	Entity Ref	# ISO external entity and description
  #--------------------------------------------------------------------------
    0x22,	"&quot;",	# ISOnum : Quotation mark
    0x26,	"&amp;",	# ISOnum : Ampersand
    0x3C,	"&lt;", 	# ISOnum : Less-than sign
    0x3E,	"&gt;", 	# ISOnum : Greater-than sign
);

##  Lookup table for the 8bit half of KOI8-R (0x80 - 0xff).
##  Key:   KOI8-R byte value.
##  Value: numeric character reference holding the decimal Unicode
##         codepoint of that character.
##  The trailing comment on each line is the Unicode character name.
%KOI8_R_To_Ent = (
    # ---- 0x80 - 0x9f : box drawing, blocks, shades, math and symbols ----
    0x80 => "&#9472;",    # BOX DRAWINGS LIGHT HORIZONTAL
    0x81 => "&#9474;",    # BOX DRAWINGS LIGHT VERTICAL
    0x82 => "&#9484;",    # BOX DRAWINGS LIGHT DOWN AND RIGHT
    0x83 => "&#9488;",    # BOX DRAWINGS LIGHT DOWN AND LEFT
    0x84 => "&#9492;",    # BOX DRAWINGS LIGHT UP AND RIGHT
    0x85 => "&#9496;",    # BOX DRAWINGS LIGHT UP AND LEFT
    0x86 => "&#9500;",    # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
    0x87 => "&#9508;",    # BOX DRAWINGS LIGHT VERTICAL AND LEFT
    0x88 => "&#9516;",    # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
    0x89 => "&#9524;",    # BOX DRAWINGS LIGHT UP AND HORIZONTAL
    0x8a => "&#9532;",    # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
    0x8b => "&#9600;",    # UPPER HALF BLOCK
    0x8c => "&#9604;",    # LOWER HALF BLOCK
    0x8d => "&#9608;",    # FULL BLOCK
    0x8e => "&#9612;",    # LEFT HALF BLOCK
    0x8f => "&#9616;",    # RIGHT HALF BLOCK
    0x90 => "&#9617;",    # LIGHT SHADE
    0x91 => "&#9618;",    # MEDIUM SHADE
    0x92 => "&#9619;",    # DARK SHADE
    0x93 => "&#8992;",    # TOP HALF INTEGRAL
    0x94 => "&#9632;",    # BLACK SQUARE
    0x95 => "&#8729;",    # BULLET OPERATOR
    0x96 => "&#8730;",    # SQUARE ROOT
    0x97 => "&#8776;",    # ALMOST EQUAL TO
    0x98 => "&#8804;",    # LESS-THAN OR EQUAL TO
    0x99 => "&#8805;",    # GREATER-THAN OR EQUAL TO
    0x9a => "&#160;",     # NO-BREAK SPACE
    0x9b => "&#8993;",    # BOTTOM HALF INTEGRAL
    0x9c => "&#176;",     # DEGREE SIGN
    0x9d => "&#178;",     # SUPERSCRIPT TWO
    0x9e => "&#183;",     # MIDDLE DOT
    0x9f => "&#247;",     # DIVISION SIGN

    # ---- 0xa0 - 0xbf : double box drawing, IO letters, copyright ----
    0xa0 => "&#9552;",    # BOX DRAWINGS DOUBLE HORIZONTAL
    0xa1 => "&#9553;",    # BOX DRAWINGS DOUBLE VERTICAL
    0xa2 => "&#9554;",    # BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE
    0xa3 => "&#1105;",    # CYRILLIC SMALL LETTER IO
    0xa4 => "&#9555;",    # BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE
    0xa5 => "&#9556;",    # BOX DRAWINGS DOUBLE DOWN AND RIGHT
    0xa6 => "&#9557;",    # BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE
    0xa7 => "&#9558;",    # BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE
    0xa8 => "&#9559;",    # BOX DRAWINGS DOUBLE DOWN AND LEFT
    0xa9 => "&#9560;",    # BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE
    0xaa => "&#9561;",    # BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE
    0xab => "&#9562;",    # BOX DRAWINGS DOUBLE UP AND RIGHT
    0xac => "&#9563;",    # BOX DRAWINGS UP SINGLE AND LEFT DOUBLE
    0xad => "&#9564;",    # BOX DRAWINGS UP DOUBLE AND LEFT SINGLE
    0xae => "&#9565;",    # BOX DRAWINGS DOUBLE UP AND LEFT
    0xaf => "&#9566;",    # BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
    0xb0 => "&#9567;",    # BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE
    0xb1 => "&#9568;",    # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
    0xb2 => "&#9569;",    # BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
    0xb3 => "&#1025;",    # CYRILLIC CAPITAL LETTER IO
    0xb4 => "&#9570;",    # BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE
    0xb5 => "&#9571;",    # BOX DRAWINGS DOUBLE VERTICAL AND LEFT
    0xb6 => "&#9572;",    # BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE
    0xb7 => "&#9573;",    # BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE
    0xb8 => "&#9574;",    # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
    0xb9 => "&#9575;",    # BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE
    0xba => "&#9576;",    # BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE
    0xbb => "&#9577;",    # BOX DRAWINGS DOUBLE UP AND HORIZONTAL
    0xbc => "&#9578;",    # BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE
    0xbd => "&#9579;",    # BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE
    0xbe => "&#9580;",    # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
    0xbf => "&#169;",     # COPYRIGHT SIGN

    # ---- 0xc0 - 0xdf : Cyrillic small letters ----
    0xc0 => "&#1102;",    # CYRILLIC SMALL LETTER YU
    0xc1 => "&#1072;",    # CYRILLIC SMALL LETTER A
    0xc2 => "&#1073;",    # CYRILLIC SMALL LETTER BE
    0xc3 => "&#1094;",    # CYRILLIC SMALL LETTER TSE
    0xc4 => "&#1076;",    # CYRILLIC SMALL LETTER DE
    0xc5 => "&#1077;",    # CYRILLIC SMALL LETTER IE
    0xc6 => "&#1092;",    # CYRILLIC SMALL LETTER EF
    0xc7 => "&#1075;",    # CYRILLIC SMALL LETTER GHE
    0xc8 => "&#1093;",    # CYRILLIC SMALL LETTER HA
    0xc9 => "&#1080;",    # CYRILLIC SMALL LETTER I
    0xca => "&#1081;",    # CYRILLIC SMALL LETTER SHORT I
    0xcb => "&#1082;",    # CYRILLIC SMALL LETTER KA
    0xcc => "&#1083;",    # CYRILLIC SMALL LETTER EL
    0xcd => "&#1084;",    # CYRILLIC SMALL LETTER EM
    0xce => "&#1085;",    # CYRILLIC SMALL LETTER EN
    0xcf => "&#1086;",    # CYRILLIC SMALL LETTER O
    0xd0 => "&#1087;",    # CYRILLIC SMALL LETTER PE
    0xd1 => "&#1103;",    # CYRILLIC SMALL LETTER YA
    0xd2 => "&#1088;",    # CYRILLIC SMALL LETTER ER
    0xd3 => "&#1089;",    # CYRILLIC SMALL LETTER ES
    0xd4 => "&#1090;",    # CYRILLIC SMALL LETTER TE
    0xd5 => "&#1091;",    # CYRILLIC SMALL LETTER U
    0xd6 => "&#1078;",    # CYRILLIC SMALL LETTER ZHE
    0xd7 => "&#1074;",    # CYRILLIC SMALL LETTER VE
    0xd8 => "&#1100;",    # CYRILLIC SMALL LETTER SOFT SIGN
    0xd9 => "&#1099;",    # CYRILLIC SMALL LETTER YERU
    0xda => "&#1079;",    # CYRILLIC SMALL LETTER ZE
    0xdb => "&#1096;",    # CYRILLIC SMALL LETTER SHA
    0xdc => "&#1101;",    # CYRILLIC SMALL LETTER E
    0xdd => "&#1097;",    # CYRILLIC SMALL LETTER SHCHA
    0xde => "&#1095;",    # CYRILLIC SMALL LETTER CHE
    0xdf => "&#1098;",    # CYRILLIC SMALL LETTER HARD SIGN

    # ---- 0xe0 - 0xff : Cyrillic capital letters ----
    0xe0 => "&#1070;",    # CYRILLIC CAPITAL LETTER YU
    0xe1 => "&#1040;",    # CYRILLIC CAPITAL LETTER A
    0xe2 => "&#1041;",    # CYRILLIC CAPITAL LETTER BE
    0xe3 => "&#1062;",    # CYRILLIC CAPITAL LETTER TSE
    0xe4 => "&#1044;",    # CYRILLIC CAPITAL LETTER DE
    0xe5 => "&#1045;",    # CYRILLIC CAPITAL LETTER IE
    0xe6 => "&#1060;",    # CYRILLIC CAPITAL LETTER EF
    0xe7 => "&#1043;",    # CYRILLIC CAPITAL LETTER GHE
    0xe8 => "&#1061;",    # CYRILLIC CAPITAL LETTER HA
    0xe9 => "&#1048;",    # CYRILLIC CAPITAL LETTER I
    0xea => "&#1049;",    # CYRILLIC CAPITAL LETTER SHORT I
    0xeb => "&#1050;",    # CYRILLIC CAPITAL LETTER KA
    0xec => "&#1051;",    # CYRILLIC CAPITAL LETTER EL
    0xed => "&#1052;",    # CYRILLIC CAPITAL LETTER EM
    0xee => "&#1053;",    # CYRILLIC CAPITAL LETTER EN
    0xef => "&#1054;",    # CYRILLIC CAPITAL LETTER O
    0xf0 => "&#1055;",    # CYRILLIC CAPITAL LETTER PE
    0xf1 => "&#1071;",    # CYRILLIC CAPITAL LETTER YA
    0xf2 => "&#1056;",    # CYRILLIC CAPITAL LETTER ER
    0xf3 => "&#1057;",    # CYRILLIC CAPITAL LETTER ES
    0xf4 => "&#1058;",    # CYRILLIC CAPITAL LETTER TE
    0xf5 => "&#1059;",    # CYRILLIC CAPITAL LETTER U
    0xf6 => "&#1046;",    # CYRILLIC CAPITAL LETTER ZHE
    0xf7 => "&#1042;",    # CYRILLIC CAPITAL LETTER VE
    0xf8 => "&#1068;",    # CYRILLIC CAPITAL LETTER SOFT SIGN
    0xf9 => "&#1067;",    # CYRILLIC CAPITAL LETTER YERU
    0xfa => "&#1047;",    # CYRILLIC CAPITAL LETTER ZE
    0xfb => "&#1064;",    # CYRILLIC CAPITAL LETTER SHA
    0xfc => "&#1069;",    # CYRILLIC CAPITAL LETTER E
    0xfd => "&#1065;",    # CYRILLIC CAPITAL LETTER SHCHA
    0xfe => "&#1063;",    # CYRILLIC CAPITAL LETTER CHE
    0xff => "&#1066;",    # CYRILLIC CAPITAL LETTER HARD SIGN
);

##  koi8r2sgml(DATA)
##
##  Returns a copy of DATA in which every character has been looked up by
##  its character code: codes below 128 in %US_ASCII_To_Ent, all other
##  codes in %KOI8_R_To_Ent.  A character that has a table entry is
##  replaced by that entry; a character without one is copied unchanged.
##  Only the first argument is used.
sub koi8r2sgml {
    my ($input) = @_;
    my $output = "";

    # Walk the string one character at a time.
    foreach my $piece (split(//, $input)) {
        my $code = unpack("C", $piece);

        # Choose the lookup table from the character code range.
        my $table = ($code < 128) ? \%US_ASCII_To_Ent : \%KOI8_R_To_Ent;

        # Fall back to the original character when the table has no entry.
        $output .= ($table->{$code} || pack("C", $code));
    }

    return $output;
}

1;
--- debian.rc	2003-01-12 12:33:02.000000000 +0900
+++ debian.rc.new	2003-01-12 12:35:43.000000000 +0900
@@ -3,7 +3,7 @@
 
 <!-- Common Resources -------------------------------------------------------->
 <CharsetConverters>
-plain;          mhonarc::htmlize;
+plain;          MHonArc::DEBIAN::koi8r2sgml;  MHonArc/DEBIAN.pm
 us-ascii;       mhonarc::htmlize;
 iso-8859-1;     iso_8859::str2sgml;     iso8859.pl
 iso-8859-2;     iso_8859::str2sgml;     iso8859.pl

Reply to: