New version of DPNhtml2mail.pl [was: Possible bug when generating mail version]
- To: debian-publicity@lists.debian.org
- Subject: New version of DPNhtml2mail.pl [was: Possible bug when generating mail version]
- From: Thomas Blein <dageou@yahoo.fr>
- Date: Thu, 3 May 2012 00:14:51 +0200
- Message-id: <[🔎] 20120502221451.GB4855@yopbook.tblein.eu>
- In-reply-to: <20120403204849.GD21443@sid.nuvreauspam>
- References: <20120402141150.GF5952@zouish.org> <20120402204845.GY21443@sid.nuvreauspam> <jld42b$ht3$1@dough.gmane.org> <20120403204849.GD21443@sid.nuvreauspam>
Dear all,
I had a look at the DPNhtml2mail.pl script to improve it and, more
particularly, to solve the problem Andrei reported.
I worked on it and changed quite a lot of things inside it, to also correct
some small bugs in the display (mainly extra spacing), with some hacks
dedicated to the French version.
I tested it on the DWN 2012/07 and it looks like the layout is nice for
French and English. I did not test it much with the other languages.
The script is not perfect since I am new to perl and some factorisation
of the code might be useful.
Do not hesitate to report any problem.
Best regards,
Thomas
#!/usr/bin/env perl
# Author: Jean-Edouard Babin, radius in gmail.com
# Author: Thomas Blein, tblein in tblein.eu (2012)
# Todo
# - Debug need to be implemented/improved
# - Better \n removing
# History
# Revision 1.2 02/05/2012
# Remove indentation of first li
# Correct date misalignment in French
# Correction of indentation of the script
# Take into account also quoting string in title
# Correction of all the spacing problems
# Revision 1.1 01/02/2009 03:39
# Split parse & print / Wrapping
# Revision 1.0 31/01/2009 23:35
# Initial Version
use strict;
use warnings;
use HTML::TokeParser::Simple;
use LWP::UserAgent;
use Getopt::Long;
use Text::Wrap;
# Text::Wrap line width used when the paragraphs are printed.
$Text::Wrap::columns = 80;

# State shared between the parsing loop and the printing code.
my $data;                       # parsed page (header + stories), filled while parsing
my @links;
my $link_index         = 1;     # number given to the next external link
my $footer_found       = 0;     # set once the first <hr> has been processed
my $stories_index      = 0;
my $paragraph_index    = 0;
my $subparagraph_index = 0;

# Base config: these defaults are also quoted in the usage message.
my $default_url  = 'http://www.debian.org/News/weekly/';
my $default_lang = 'en';
my %opts         = (
    u => $default_url,          # base URL
    l => $default_lang,         # language
    d => 0,                     # debug flag
);
# Option parsing
# GetOptions returns false when it meets an unknown or malformed option;
# treat that like a missing issue number and show the usage text instead of
# silently carrying on with the defaults.
my $options_ok = GetOptions(\%opts, 'u=s', 'i=s', 'l=s', 'd');
if (!$options_ok || !defined($opts{i})) {
    print STDERR "Usage: $0 -i issue [-l lang] [-u base_url] [-d]\n";
    print STDERR " -i issue number (i.e.: 2008/17)\n";
    print STDERR " -l langage (i.e.: fr). Default value is \"-l $default_lang\"\n";
    print STDERR " -u base_url: DPN common URL. Default value is \"-u $default_url\"\n";
    print STDERR " -d for verbose output\n";
    exit 1;
}
if ($opts{d} == 1) {
    # "use" is executed at compile time, so placing it inside an "if" does
    # not make it conditional.  Load the module at run time instead so that
    # it is only pulled in when -d is given.
    require Data::Dumper;    # useful for debug only
    Data::Dumper->import;
}
# HTML file fetching
# The page address is <base_url><issue>/index.<lang>.html; build it once so
# the request and the error message cannot drift apart.
my $issue_url = "$opts{u}$opts{i}/index.$opts{l}.html";

my $ua = LWP::UserAgent->new;
$ua->agent("DPNhtml2mail");

my $req = HTTP::Request->new(GET => $issue_url);
#my $req = HTTP::Request->new(GET => "http://www.debian.org/News/2011/20110625.fr.html");
my $res = $ua->request($req);

# Abort with the HTTP status line when the download fails.
die "Can't fetch $issue_url " . $res->status_line
    unless $res->is_success;
# Start of parsing / storage
my $p = HTML::TokeParser::Simple->new(\$res->content);

# The text following the first <h1> is the page title.  It is split on
# " - " into two parts stored as "dpn" and "date" (presumably the newsletter
# name and the issue date -- confirm against a real page).
my $token = $p->get_tag("h1");
$data->{header}{title} = $p->get_trimmed_text;
$data->{header}{url}   = "$opts{u}$opts{i}/";
($data->{header}{dpn}, $data->{header}{date}) =
    split(/ - /, $data->{header}{title});

# A title without the " - " separator leaves one or both parts undefined,
# which would trigger "uninitialized value" warnings when the header is
# printed.  Fall back to empty strings.
for my $field ('dpn', 'date') {
    $data->{header}{$field} = ''
        unless defined $data->{header}{$field};
}

# Localised project name shown in the header; any other language gets the
# English name.
my %project_name_for = (
    fr => 'Projet Debian',
    es => 'El proyecto Debian',
    it => 'Il progetto Debian',
);
$data->{header}{project} =
    exists $project_name_for{ $opts{l} }
    ? $project_name_for{ $opts{l} }
    : 'The Debian Project';
# True while the parser is between <h2> and </h2>; text met there belongs to
# the story title instead of the paragraph body.
my $in_title = 0;

# Quotation marks emitted for <q> (open) and </q> (close), per language.
# The same marks are used in titles and in body text.
my %quote_marks_for = (
    fr => { open => " « ", close => " » " },
    de => { open => " »",  close => "« " },
    es => { open => " «",  close => "» " },
    it => { open => " «",  close => "» " },
);
my $default_quote_marks = { open => " \"", close => "\" " };

# Append $text to the text of the current story / paragraph / subparagraph.
sub append_to_body {
    my ($text) = @_;
    $data->{stories}[$stories_index]
        {'paragraph'}[$paragraph_index]
        {'subparagraph'}[$subparagraph_index]
        {'text'} .= $text;
    return;
}

# Append $text to the title of the current story.
sub append_to_title {
    my ($text) = @_;
    $data->{stories}[$stories_index]{'title'} .= $text;
    return;
}

while (my $token = $p->get_tag) {
    if ($token->is_start_tag('h2')) {
        # New story: skip to the <a> inside the heading, reset the paragraph
        # counters and start collecting the title.
        $p->get_tag("a");
        $paragraph_index    = 0;
        $subparagraph_index = 0;
        $stories_index++;
        # Get story name
        $in_title = 1;
        $data->{stories}[$stories_index]{'title'} = $p->get_trimmed_text;
    }
    elsif ($token->is_end_tag('h2')) {
        $in_title = 0;
    }
    elsif ($token->is_start_tag('a')) {
        my $href = $token->[1]{'href'};

        # An <a> without href is handled like a common link (its target is
        # printed as '-'); testing definedness first avoids an
        # "uninitialized value" warning in the pattern match.
        if (!defined($href) || $href !~ /^#\w+.*$/) {    # Common link
            append_to_body($p->get_trimmed_text . " [" . $link_index . "]");
            push(
                @{  $data->{stories}[$stories_index]
                        {'paragraph'}[$paragraph_index]
                        {'links'}
                },
                {   'index' => $link_index++,
                    'link'  => $href || '-',
                }
            );
        }
        else {                                           # First internal links
            append_to_body($p->get_trimmed_text);
        }
    }
    elsif ($token->is_tag('q')) {
        my $marks = $quote_marks_for{ $opts{l} } || $default_quote_marks;

        # Fix: the closing mark for body text used to be "« " for es/it and
        # for the default language, i.e. an opening guillemet instead of the
        # "» " / "\" " already used for titles.
        my $mark =
              $token->is_start_tag('q') ? $marks->{open}
            : $token->is_end_tag('q')   ? $marks->{close}
            :                             undef;

        if (defined $mark) {
            my $quoted = $mark . $p->get_trimmed_text;
            if ($in_title) {
                append_to_title($quoted);
            }
            else {
                append_to_body($quoted);
            }
        }
    }
    elsif ($token->is_end_tag('p')) {
        # End of paragraph: whatever follows starts the next one.
        @links = ();
        $paragraph_index++;
        append_to_body($p->get_text);
    }
    elsif ($token->is_tag('li')) {
        if ($token->is_start_tag('li')) {
            append_to_body(" * " . $p->get_trimmed_text);
        }
        elsif ($token->is_end_tag('li')) {
            # Each list item is stored as its own subparagraph.
            append_to_body("\n" . $p->get_trimmed_text);
            $subparagraph_index++;
        }
    }
    elsif ($token->is_start_tag('hr')) {
        # The second <hr> ends the parsing.  After the first one, two <p>
        # tags and the four following tokens are skipped.
        last if ($footer_found);
        $p->get_tag('p');
        $p->get_tag('p');
        # NOTE(review): get_token presumably ignores its argument -- confirm.
        $p->get_token('p') for 1 .. 4;
        $footer_found = 1;
    }
    elsif ($token->is_start_tag('ul')) {
        # Nothing to store for the list container itself.
    }
    else {
        # Any other tag: keep the text that follows it.  Title text is
        # trimmed, body text is kept verbatim.
        if ($in_title) {
            append_to_title($p->get_trimmed_text);
        }
        else {
            append_to_body($p->get_text);
        }
    }
}
# Start of formatting / printing

# Return "$left<spaces>$right\n", padded so that the whole line is $width
# characters wide.  An undefined $left is treated as empty and a line that
# is already too long simply gets no padding.
sub header_line {
    my ($left, $right, $width) = @_;
    $left = '' unless defined $left;
    my $padding = $width - length($left) - length($right);
    $padding = 0 if $padding < 0;
    return $left . ' ' x $padding . $right . "\n";
}

print "------------------------------------------------------------------------\n";
print header_line($data->{header}{project}, "http://www.debian.org/", 72);
print header_line($data->{header}{dpn}, "debian-publicity\@lists.debian.org", 72);

# NOTE(review): the French date line is laid out on 74 columns instead of
# 72; presumably this compensates for length() counting bytes rather than
# characters in the French date -- confirm.
my $date_line_width = $opts{l} eq "fr" ? 74 : 72;
print header_line($data->{header}{date}, $data->{header}{url}, $date_line_width);
print
    "------------------------------------------------------------------------\n\n";

foreach my $stories (@{ $data->{stories} }) {
    # Story title underlined with dashes (the first entry may have no title).
    if (defined($stories->{'title'})) {
        print $stories->{'title'} . "\n"
            . '-' x (length($stories->{'title'})) . "\n";
    }

    foreach my $paragraph (@{ $stories->{paragraph} }) {
        foreach my $subparagraph (@{ $paragraph->{subparagraph} }) {
            # The subparagraph index is not reset between paragraphs, so the
            # array may contain holes; they print nothing, skip them without
            # raising "uninitialized value" warnings.
            next
                unless defined $subparagraph
                and defined $subparagraph->{'text'};

            # Normalise whitespace and tidy the spacing around punctuation.
            $subparagraph->{'text'} =~ s/^\n*//g;
            $subparagraph->{'text'} =~ s/\n/ /g;
            $subparagraph->{'text'} =~ s/\s+/ /g;
            $subparagraph->{'text'} =~ s/ \./\./g;
            $subparagraph->{'text'} =~ s/ ,/,/g;
            $subparagraph->{'text'} =~ s/\( /\(/g;
            $subparagraph->{'text'} =~ s/ \)/\)/g;

            if (length($subparagraph->{'text'}) > 1) {
                # Fix: the bullet test used to compare a 3-character
                # substr() with the 2-character string " *", which can never
                # be equal, so list items were never given a hanging indent.
                if ($subparagraph->{'text'} =~ /^ \*/) {
                    # Continuation lines are aligned with the item text
                    # that follows the " * " prefix.
                    print wrap("", "   ", $subparagraph->{'text'});
                }
                else {
                    print wrap("", "", $subparagraph->{'text'});
                }
                print "\n";
            }
        }
        print "\n";

        # Numbered link list of the paragraph.  Relative links are made
        # absolute, deepest prefix first.  The dots are escaped so that only
        # a literal "../" sequence is rewritten.
        foreach my $link (@{ $paragraph->{'links'} }) {
            $link->{'link'} =~ s{^\.\./\.\./\.\./\.\.}{http://www.debian.org};
            $link->{'link'} =~ s{^\.\./\.\./\.\.}{http://www.debian.org/News};
            $link->{'link'} =~ s{^\.\./\.\.}{http://www.debian.org/News/weekly};
            print " $link->{'index'} : $link->{'link'}\n";
        }
        print "\n";
    }
}
Reply to: