Bug#61342: your mail
[I'll just continue on talking with myself here if you all don't mind...]
On Tue, 04 Jan 2005, Don Armstrong wrote:
> The webpage output by these scripts should really be in UTF8, not
> ISO8859-1, as the maintainer field is in UTF8.
>
> Currently it looks like Unicode::MapUTF8 is the right way to deal
> with the charset->UTF8 issue, but I'm not sure if it's optimal or
> not. It would require a trivial bit of munging to the output
> produced by the webpage and the patch that I made to call to_utf8
> when appropriate, as well as an additional dependency on
> libunicode-maputf8-perl which is already in Debian.
After talking with Colin Watson on IRC, I implemented the
Unicode::MapUTF8 approach, so now all of the output is in UTF-8,
including the message headers (which weren't in UTF-8 in the previous
patch.)
It's a pretty straightforward patch, and the only real downside to it
is an additional dependency on libunicode-maputf8-perl.
It's working on my mini-bts here: http://bugs.donarmstrong.com/cgi-bin/bugreport.cgi?bug=20
Don Armstrong
--
"Because," Fee-5 explained patiently, "I was born in the fifth row. Any fool
would understand that, but against stupidity the very Gods themselves
contend in vain."
-- Alfred Bester _The Computer Connection_ p19
http://www.donarmstrong.com http://rzlab.ucr.edu
Index: bugreport.cgi
===================================================================
RCS file: /cvs/debbugs/source/cgi/bugreport.cgi,v
retrieving revision 1.65
diff -u -r1.65 bugreport.cgi
--- bugreport.cgi 1 Jun 2004 00:41:26 -0000 1.65
+++ bugreport.cgi 5 Jan 2005 20:45:22 -0000
@@ -70,7 +70,7 @@
$filename = '' unless defined $filename;
if ($top) {
- $$this .= htmlsanit($entity->stringify_header) unless ($terse);
+ $$this .= htmlsanit(de_rfc1522($entity->stringify_header)) unless ($terse);
$$this .= "\n";
}
@@ -150,7 +150,7 @@
my %status = %{getbugstatus($ref)};
unless (%status) {
print <<EOF;
-Content-Type: text/html
+Content-Type: text/html;charset=utf-8
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
@@ -181,7 +181,7 @@
$indexentry .= htmlpackagelinks($status{package}, 0);
$indexentry .= "Reported by: <a href=\"" . submitterurl($status{originator})
- . "\">" . htmlsanit($status{originator}) . "</a>;\n";
+ . "\">" . htmlsanit(de_rfc1522($status{originator})) . "</a>;\n";
$indexentry .= "Owned by: " . htmlsanit($status{owner}) . ";\n"
if length $status{owner};
@@ -369,7 +369,9 @@
$normstate= 'go';
push @mail, $_;
} elsif ($normstate eq 'html') {
- $this .= $_;
+ # XXX This is broken. The BTS shouldn't be writing
+ # HTML into the log... but it does.
+ $this .= de_rfc1522($_);
} elsif ($normstate eq 'go') {
s/^\030//;
if (!$suppressnext && !$found_msgid &&
@@ -409,7 +411,7 @@
$thisheader = "<h2>Message sent:</h2>\n";
} else {
s/\04/, /g; s/\n$//;
- $thisheader = "<h2>Message sent to ".htmlsanit($_).":</h2>\n";
+ $thisheader = "<h2>Message sent to ".htmlsanit(de_rfc1522($_)).":</h2>\n";
}
$this = "";
$normstate= 'kill-body';
@@ -448,7 +450,7 @@
print join("", @mails );
exit 0;
}
-print "Content-Type: text/html\n\n";
+print "Content-Type: text/html;charset=utf-8\n\n";
my $title = htmlsanit($status{subject});
Index: common.pl
===================================================================
RCS file: /cvs/debbugs/source/cgi/common.pl,v
retrieving revision 1.108
diff -u -r1.108 common.pl
--- common.pl 28 Mar 2004 06:02:45 -0000 1.108
+++ common.pl 5 Jan 2005 20:45:22 -0000
@@ -12,6 +12,10 @@
use Debbugs::Versions;
+# for de_rfc1522
+use MIME::WordDecoder qw();
+use Unicode::MapUTF8 qw(to_utf8 utf8_supported_charset);
+
$MLDBM::RemoveTaint = 1;
my $common_archive = 0;
@@ -286,8 +290,8 @@
$result .= htmlpackagelinks($status{"package"}, 1);
$result .= $showseverity;
$result .= "Reported by: <a href=\"" . submitterurl($status{originator})
- . "\">" . htmlsanit($status{originator}) . "</a>";
- $result .= ";\nOwned by: " . htmlsanit($status{owner})
+ . "\">" . htmlsanit(de_rfc1522($status{originator})) . "</a>";
+ $result .= ";\nOwned by: " . htmlsanit(de_rfc1522($status{owner}))
if length $status{owner};
$result .= ";\nTags: <strong>"
. htmlsanit(join(", ", sort(split(/\s+/, $status{tags}))))
@@ -420,6 +424,33 @@
$in =~ s/([<>&"])/\&$saniarray{$1};/g;
return $in;
}
+
+=head2 de_rfc1522
+
+ de_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
+
+Turn RFC-1522 names into the UTF-8 equivalent. [#61342 et al]
+
+=cut
+
+# Set up the default rfc1522 decoder, which turns all charsets that
+# are supported into the appropriate UTF-8 charset.
+MIME::WordDecoder->default(new MIME::WordDecoder(['*' => sub {my ($data,$charset) = @_;
+ return $data unless utf8_supported_charset($charset);
+ return to_utf8({-string => $data,
+ -charset => $charset,
+ });
+ },
+ ],));
+
+sub de_rfc1522($){
+ my ($string) = @_;
+
+ # unmime calls the default MIME::WordDecoder; handler set up at
+ # initialization time.
+ return MIME::WordDecoder::unmime($string);
+}
+
sub maybelink {
my $in = shift;
Index: pkgreport.cgi
===================================================================
RCS file: /cvs/debbugs/source/cgi/pkgreport.cgi,v
retrieving revision 1.76
diff -u -r1.76 pkgreport.cgi
--- pkgreport.cgi 19 Apr 2004 10:30:42 -0000 1.76
+++ pkgreport.cgi 5 Jan 2005 20:45:22 -0000
@@ -234,7 +234,7 @@
my $result = htmlizebugs(\@bugs);
-print "Content-Type: text/html\n\n";
+print "Content-Type: text/html;charset=utf-8\n\n";
print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
print "<HTML><HEAD>\n" .
Reply to: