Re: New version of DPNhtml2mail.pl
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Hi,
Le 03/05/12 01:43, David Prévot a écrit :
> Le 02/05/2012 18:14, Thomas Blein a écrit :
>> # Correction of indentation of the script
>
> Do you have a version without that change? Or at worst, a way to
> apply the same changes to the current version: it's pretty hard to
> figure out what is the actual code edited.
I managed to revert all of the indentation corrections. There are still a lot
of changes, but the diff should hopefully be clearer.
I did this correction of the indentation mainly because it was
difficult for me to follow the code.
Regards,
Thomas
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.9 (Darwin)
iQEcBAEBAgAGBQJPod7eAAoJEPrijPU0jEt7ifIH/A/eDz3o35jKQe6wB+XJTOfv
AgxvIxGzK8hjKc+jxk8t3Xj/FdSb9sMS5xJPDiOuGZu/yCphiqRrOKA2rPCRhSKH
rwEgRBN3rNSOc7AkLhOmst3UNzwNmlTWAr379Q+Ulg2pFUBLsMOgZLDaTPyCgEsu
PR6IqwGoUt5jlh8Cme9hPVrsfHLMCL2P+lhwDRJ6pGDIWDAc/sZIDeKcHw85xqtz
Mn46AKs5NiO48OXdpZtc95/NbuOfzGqFKQLoZJ0h9+1RWyS+ZjLrDNeBRuyYbU4k
UsVCtUah/EINlRWEAwldDmeUTEQG/ccQdslmDa01C77I2gxAhOQNvQ8fm4yKgDk=
=ruZ/
-----END PGP SIGNATURE-----
#!/usr/bin/env perl
# Author: Jean-Edouard Babin, radius in gmail.com
# Author: Thomas Blein, tblein in tblein.eu (2012)
# Todo
# - Debug output needs to be implemented/improved
# - Better \n removing
# History
# Revision 1.2 02/05/2012
# Remove indentation of first li
# Correct date misalignment in French
# Correction of indentation of the script
# Also take quoted strings in titles into account
# Revision 1.1 01/02/2009 03:39
# Split parse & print / Wrapping
# Revision 1.0 31/01/2009 23:35
# Initial Version
use strict;
use warnings;
use HTML::TokeParser::Simple;
use LWP::UserAgent;
use Getopt::Long;
use Text::Wrap;
# Paragraphs printed through Text::Wrap::wrap() are wrapped at 80 columns.
$Text::Wrap::columns = 80;

# State shared by the parsing and printing phases of the script.
my %opts;                      # command-line options: u, i, l, d
my $data;                      # parsed page: {header} and {stories}
my @links;
my $link_index = 1;            # number given to the next external link
my $footer_found = 0;          # set once the first <hr> has been seen
my $stories_index = 0;         # current story (bumped on each <h2>)
my $paragraph_index = 0;       # current paragraph inside the story
my $subparagraph_index = 0;    # current subparagraph (bumped on each </li>)

# Base config
my $default_url = $opts{u} = 'http://www.debian.org/News/weekly/';
my $default_lang = $opts{l} = 'en';
$opts{d} = 0;

# Option parsing
# GetOptions() returns false when the command line is malformed (unknown
# option, missing value); treat that like a missing -i and show the usage
# instead of carrying on with half-parsed options.
my $options_ok = GetOptions(\%opts, 'u=s', 'i=s', 'l=s', 'd');
if (!$options_ok || !defined($opts{i})) {
    print STDERR "Usage: $0 -i issue [-l lang] [-u base_url] [-d]\n";
    print STDERR " -i issue number (i.e.: 2008/17)\n";
    print STDERR " -l language (i.e.: fr). Default value is \"-l $default_lang\"\n";
    print STDERR " -u base_url: DPN common URL. Default value is \"-u $default_url\"\n";
    print STDERR " -d for verbose output\n";
    exit 1;
}

# "use" is executed at compile time, so wrapping it in a runtime "if" never
# made the load conditional.  require + import defers it until -d is given.
if ($opts{d} == 1) {
    require Data::Dumper;    # useful for debug only
    Data::Dumper->import;
}
# HTML file fetching
# The page address is <base_url><issue>/index.<lang>.html; build it once so
# the request and the error message are guaranteed to agree.
my $issue_url = "$opts{u}$opts{i}/index.$opts{l}.html";

my $ua = LWP::UserAgent->new;
$ua->agent("DPNhtml2mail");

my $req = HTTP::Request->new(GET => $issue_url);
my $res = $ua->request($req);

# Nothing can be formatted without the page: abort with the HTTP status.
die "Can't fetch $issue_url ".$res->status_line
    unless $res->is_success;
# Start of parsing / storage
my $p = HTML::TokeParser::Simple->new(\$res->content);

# The text following the first <h1> is stored as the title and split on
# " - " into the newsletter name (dpn) and the issue date.
my $token = $p->get_tag("h1");
my $page_title = $p->get_trimmed_text;
$data->{header}{title} = $page_title;
$data->{header}{url} = "$opts{u}$opts{i}/";
($data->{header}{dpn}, $data->{header}{date}) = split(/ - /, $page_title);

# Localised project name for the mail header; any language without an
# entry falls back to the English name.
my %project_name_for = (
    fr => 'Projet Debian',
    es => 'El proyecto Debian',
    it => 'Il progetto Debian',
);
$data->{header}{project} = exists $project_name_for{$opts{l}}
    ? $project_name_for{$opts{l}}
    : 'The Debian Project';
# Walk every remaining tag of the page and file the text that follows it
# under the current story / paragraph / subparagraph of $data.
my $in_title = 0;    # true between <h2> and </h2>

# Opening and closing marks emitted for <q> and </q>, per language.
# Languages without an entry use plain double quotes.
my %quote_marks_for = (
    fr => [ " « ", " » " ],
    de => [ " »",  "« "  ],
    es => [ " «",  "» "  ],
    it => [ " «",  "» "  ],
);
my @default_quote_marks = ( " \"", "\" " );

# Append a string to the text of the current subparagraph.
my $append_to_text = sub {
    my ($string) = @_;
    $data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $string;
    return;
};

# Append a string to the story title while inside <h2>, otherwise to the
# text of the current subparagraph.
my $append_to_current = sub {
    my ($string) = @_;
    if ($in_title) {
        $data->{stories}[$stories_index]{'title'} .= $string;
    } else {
        $append_to_text->($string);
    }
    return;
};

while (my $token = $p->get_tag) {
    if ($token->is_start_tag('h2')) {
        # A new story starts: skip to the <a> inside the heading and reset
        # the paragraph counters.
        $p->get_tag("a");
        $paragraph_index = 0;
        $subparagraph_index = 0;
        $stories_index++;
        # Get story name
        $in_title = 1;
        $data->{stories}[$stories_index]{'title'} = $p->get_trimmed_text;
    } elsif ($token->is_end_tag('h2')) {
        $in_title = 0;
    } elsif ($token->is_start_tag('a')) {
        # An <a> without href is kept as a numbered link whose target is
        # printed as '-'; check definedness first to avoid an
        # uninitialized-value warning in the pattern match.
        my $href = $token->[1]{'href'};
        if (!defined($href) || $href !~ /^#\w+.*$/) { # Common link
            $append_to_text->($p->get_trimmed_text . " [" . $link_index . "]");
            push(@{$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'links'}},
                { 'index' => $link_index++, 'link' => $href || '-' });
        } else { # First internal links
            $append_to_text->($p->get_trimmed_text);
        }
    } elsif ($token->is_tag('q')) {
        # Replace <q> / </q> by the quotation marks of the target language.
        # The closing mark is now taken from the same table for titles and
        # body text alike (body text used to get an opening mark for es, it
        # and the default language).
        my $marks = $quote_marks_for{$opts{l}} || \@default_quote_marks;
        if ($token->is_start_tag('q')) {
            $append_to_current->($marks->[0] . $p->get_trimmed_text);
        } elsif ($token->is_end_tag('q')) {
            $append_to_current->($marks->[1] . $p->get_trimmed_text);
        }
    } elsif ($token->is_end_tag('p')) {
        # @links is not read anywhere else in this script; it is simply
        # emptied here.
        @links = ();
        $paragraph_index++;
        $append_to_text->($p->get_text);
    } elsif ($token->is_tag('li')) {
        if ($token->is_start_tag('li')) {
            $append_to_text->(" * " . $p->get_trimmed_text);
        } elsif ($token->is_end_tag('li')) {
            # Each list item ends its own subparagraph.
            $append_to_text->("\n" . $p->get_trimmed_text);
            $subparagraph_index++;
        }
    } elsif ($token->is_start_tag('hr')) {
        # The second <hr> ends the parsing.  After the first one, two <p>
        # tags and four further tokens are skipped.
        # NOTE(review): presumably this jumps over the footer introduction;
        # confirm against the page layout.
        last if ($footer_found);
        $p->get_tag('p');
        $p->get_tag('p');
        $p->get_token('p');
        $p->get_token('p');
        $p->get_token('p');
        $p->get_token('p');
        $footer_found = 1;
    } elsif ($token->is_start_tag('ul')) {
        # Nothing to store for the list opening itself.
    } else {
        # Any other tag: keep the text that follows it.
        if ($in_title) {
            $data->{stories}[$stories_index]{'title'} .= $p->get_trimmed_text;
        } else {
            $append_to_text->($p->get_text);
        }
    }
}
# Start of formatting / printing

# Build one header line: $left, padding spaces, $right, so that the whole
# line is $width characters wide as measured by length().  A missing left
# part is printed as empty and the padding never goes below zero.
my $header_line = sub {
    my ($left, $right, $width) = @_;
    $left = '' unless defined $left;
    my $padding = $width - length($left) - length($right);
    $padding = 0 if $padding < 0;
    return $left . (" " x $padding) . $right . "\n";
};

print "------------------------------------------------------------------------\n";
print $header_line->($data->{header}{project}, "http://www.debian.org/", 72);
print $header_line->($data->{header}{dpn}, "debian-publicity\@lists.debian.org", 72);
# The French date line is padded to 74 instead of 72.
# NOTE(review): presumably this compensates for multi-byte characters in the
# French date being counted more than once by length() - confirm.
my $date_width = ($opts{l} eq "fr") ? 74 : 72;
print $header_line->($data->{header}{date}, $data->{header}{url}, $date_width);
print
"------------------------------------------------------------------------\n\n";

foreach my $stories (@{$data->{stories}}) {
    # Story title underlined with dashes of the same length.
    print $stories->{'title'} . "\n" . '-'x(length($stories->{'title'})). "\n" if (defined($stories->{'title'}));
    foreach my $paragraph (@{$stories->{paragraph}}) {
        foreach my $subparagraph (@{$paragraph->{subparagraph}}) {
            # The subparagraph index is not reset between paragraphs, so
            # earlier slots of this array can be empty: skip them instead of
            # running the clean-up on undefined values.
            next unless (defined($subparagraph) && defined($subparagraph->{'text'}));

            # Normalise whitespace and tidy spacing around punctuation.
            $subparagraph->{'text'} =~ s/^\n*//g;
            $subparagraph->{'text'} =~ s/\n/ /g;
            $subparagraph->{'text'} =~ s/\s+/ /g;
            $subparagraph->{'text'} =~ s/ \*/ \*/g;
            $subparagraph->{'text'} =~ s/ \./\./g;
            $subparagraph->{'text'} =~ s/ ,/,/g;
            $subparagraph->{'text'} =~ s/\( /\(/g;
            $subparagraph->{'text'} =~ s/ \)/\)/g;

            # List items and plain text are wrapped the same way.
            if (length($subparagraph->{'text'}) > 1) {
                print wrap("", "", $subparagraph->{'text'});
                print "\n";
            }
        }
        print "\n";
        foreach my $link (@{$paragraph->{'links'}}) {
            # Turn relative links into absolute ones.  The dots are escaped
            # so that only real "../" prefixes match; unescaped, "^../.."
            # also matched any link shaped like "ab/cd" and corrupted it.
            $link->{'link'} =~ s,^\.\./\.\./\.\./\.\.,http://www.debian.org,;
            $link->{'link'} =~ s,^\.\./\.\./\.\.,http://www.debian.org/News,;
            $link->{'link'} =~ s,^\.\./\.\.,http://www.debian.org/News/weekly,;
            print " $link->{'index'} : $link->{'link'}\n";
        }
        print "\n";
    }
}
Reply to: