[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

New version of DPNhtml2mail.pl [was: Possible bug when generating mail version]



Dear all,

I had a look at the DPNhtml2mail.pl script to improve it and, more
particularly, to solve the problem Andrei reported.

I reworked it and changed quite a lot of things inside it, also correcting
some small bugs in the display (mainly extra spacing), with some hacks
dedicated to the French version.

I tested it on the DWN 2012/07 and it looks like the layout is nice for
French and English. I did not test it much with the other languages.

The script is not perfect, since I am new to Perl, and some refactoring
of the code might be useful.

Do not hesitate to report any problem.

Best regards,

Thomas
#!/usr/bin/env perl

# Author: Jean-Edouard Babin, radius in gmail.com
# Author: Thomas Blein, tblein in tblein.eu (2012)

# Todo
# - Debugging needs to be implemented/improved
# - Better removal of newlines (\n)

# History
# Revision 1.2 02/05/2012
# Remove indentation of first li
# Correct date misalignment in French
# Correction of indentation of the script
# Take into account also quoting string in title
# Correction of all the spacing problems

# Revision 1.1 01/02/2009 03:39
# Split parse & print / Wrapping

# Revision 1.0 31/01/2009 23:35
# Initial Version

use strict;
use warnings;

use HTML::TokeParser::Simple;
use LWP::UserAgent;
use Getopt::Long;
use Text::Wrap;

# Wrap generated text at 80 columns.
$Text::Wrap::columns = 80;

# Base config: defaults quoted in the usage text and used when -u / -l
# are not given on the command line.
my $default_url  = 'http://www.debian.org/News/weekly/';
my $default_lang = 'en';

# Command-line options, pre-seeded with their defaults:
#   u => base URL, l => language code, d => debug flag (off).
my %opts = (
    u => $default_url,
    l => $default_lang,
    d => 0,
);

my $data;     # Parsed page: header fields and stories, filled in while parsing
my @links;    # Scratch list; the parser only ever empties it

# Counters maintained while walking the HTML.
my $link_index         = 1;    # Number given to the next external link
my $footer_found       = 0;    # Set once the first <hr> has been seen
my $stories_index      = 0;    # Current story
my $paragraph_index    = 0;    # Current paragraph inside the story
my $subparagraph_index = 0;    # Current subparagraph inside the paragraph

# Option parsing
# -u, -i and -l take a string value; -d is a plain flag.
my $options_ok = GetOptions(\%opts, 'u=s', 'i=s', 'l=s', 'd');

# The issue number is mandatory.  Also stop when GetOptions rejected an
# option (it has already printed its own diagnostic in that case) instead of
# carrying on with a half-parsed command line.
if (!$options_ok || !defined($opts{i})) {
    print STDERR "Usage: $0 -i issue [-l lang] [-u base_url] [-d]\n";
    print STDERR " -i issue number (i.e.: 2008/17)\n";
    print STDERR " -l langage (i.e.: fr). Default value is \"-l $default_lang\"\n";
    print STDERR " -u base_url: DPN common URL. Default value is \"-u $default_url\"\n";
    print STDERR " -d for verbose output\n";
    exit 1;
}

# Load Data::Dumper for debug runs only.  A `use` statement is executed at
# compile time no matter which block it sits in, so the run-time
# `require` + `import` pair is needed to make the load conditional.
if ($opts{d}) {
    require Data::Dumper;
    Data::Dumper->import;
}

# HTML file fetching
# The page is <base_url><issue>/index.<lang>.html.
my $issue_url = "$opts{u}$opts{i}/index.$opts{l}.html";

my $ua = LWP::UserAgent->new;
$ua->agent("DPNhtml2mail");

# To try the script against a fixed page, request e.g.
# "http://www.debian.org/News/2011/20110625.fr.html" here instead.
my $req = HTTP::Request->new(GET => $issue_url);
my $res = $ua->request($req);

# Nothing can be produced without the page: abort with the HTTP status line.
$res->is_success
    or die "Can't fetch $issue_url ".$res->status_line;

# Start of parsing / storage
my $p = HTML::TokeParser::Simple->new(\$res->content);

# Move to the first <h1>; the text that follows it is the page title.
$p->get_tag("h1");
my $page_title = $p->get_trimmed_text;

# The title is split on " - "; the first piece is stored as 'dpn' and the
# second as 'date'.
my ($dpn_name, $issue_date) = split(/ - /, $page_title);

# Name of the project as printed in the mail header, per language code.
my %project_name_for = (
    fr => 'Projet Debian',
    es => 'El proyecto Debian',
    it => 'Il progetto Debian',
);
my $project_name = exists $project_name_for{$opts{l}}
                 ? $project_name_for{$opts{l}}
                 : 'The Debian Project';

$data->{header} = {
    title   => $page_title,
    url     => "$opts{u}$opts{i}/",
    dpn     => $dpn_name,
    date    => $issue_date,
    project => $project_name,
};

# Quote marks written around <q>...</q> content, keyed by language code:
# [ opening, closing ].  The spaces are part of the emitted text.
my %quote_marks_for = (
    fr => [ " « ", " » " ],
    de => [ " »",  "« "  ],
    es => [ " «",  "» "  ],
    it => [ " «",  "» "  ],
);
# Every other language gets plain double quotes.
my ($open_quote, $close_quote) = exists $quote_marks_for{$opts{l}}
    ? @{ $quote_marks_for{$opts{l}} }
    : ( " \"", "\" " );

# True between <h2> and </h2>: text found there extends the story title
# instead of the story body.
my $in_title = 0;

# Append $text to the current subparagraph of the current story.
my $append_to_paragraph = sub {
    my ($text) = @_;
    $data->{stories}[$stories_index]
           {'paragraph'}[$paragraph_index]
           {'subparagraph'}[$subparagraph_index]
           {'text'} .= $text;
    return;
};

# Append $text to the story title while inside <h2>, to the body otherwise.
my $append_to_current = sub {
    my ($text) = @_;
    if ($in_title) {
        $data->{stories}[$stories_index]{'title'} .= $text;
    } else {
        $append_to_paragraph->($text);
    }
    return;
};

# Walk the remaining tags one by one.  After each tag, the text that follows
# it is read from the parser and stored in the title or body it belongs to.
while (my $token = $p->get_tag) {
    if ($token->is_start_tag('h2')) {
        # A new story starts: skip to the <a> inside the heading, reset the
        # paragraph counters and read the story name.
        $p->get_tag("a");
        $paragraph_index = 0;
        $subparagraph_index = 0;
        $stories_index++;
        # Get story name
        $in_title = 1;
        $data->{stories}[$stories_index]{'title'} = $p->get_trimmed_text;
    } elsif ($token->is_end_tag('h2')) {
        $in_title = 0;
    } elsif ($token->is_start_tag('a')) {
        # NOTE(review): $token->[1] is used as the tag's attribute hash, as in
        # the original code - confirm against HTML::TokeParser::Simple.
        my $href = $token->[1]{'href'};
        if (!defined($href) || $href !~ /^#\w+.*$/) {       # Common link
            # Tag the link text with its number and remember the target so
            # it can be listed below the paragraph.  A missing href is
            # listed as '-'.
            $append_to_paragraph->($p->get_trimmed_text . " [".$link_index."]");
            push(@{$data->{stories}[$stories_index]
                          {'paragraph'}[$paragraph_index]
                          {'links'}},
                 { 'index' => $link_index++,
                   'link'  => $href || '-' });
        } else {                                            # Internal links
            # Links to an anchor of the same page keep their text only.
            $append_to_paragraph->($p->get_trimmed_text);
        }
    } elsif ($token->is_tag('q')) {
        # <q> and </q> become the language's opening and closing quote mark.
        # The same pair is used in titles and in body text.
        if ($token->is_start_tag('q')) {
            $append_to_current->($open_quote . $p->get_trimmed_text);
        } elsif ($token->is_end_tag('q')) {
            $append_to_current->($close_quote . $p->get_trimmed_text);
        }
    } elsif ($token->is_end_tag('p')) {
        # End of paragraph: following text goes to the next paragraph slot.
        # The subparagraph index is deliberately left as it is.
        @links = ();
        $paragraph_index++;
        $append_to_paragraph->($p->get_text);
    } elsif ($token->is_tag('li')) {
        if ($token->is_start_tag('li')) {
            # List items are rendered as "  * item".
            $append_to_paragraph->("  * " . $p->get_trimmed_text);
        } elsif ($token->is_end_tag('li')) {
            # Each list item ends its subparagraph.
            $append_to_paragraph->("\n" . $p->get_trimmed_text);
            $subparagraph_index++;
        }
    } elsif ($token->is_start_tag('hr')) {
        # The second <hr> ends the parsing.  After the first one, two <p>
        # tags and four further tokens are skipped.
        # NOTE(review): presumably this skips the page preamble and the
        # second <hr> marks the footer - confirm against a real page.
        last if ($footer_found);
        $p->get_tag('p');
        $p->get_tag('p');
        $p->get_token for 1 .. 4;
        $footer_found = 1;
    } elsif ($token->is_start_tag('ul')) {
        # Nothing to store for the list container itself.
    } else {
        # Any other tag: keep the text that follows it.  Titles use trimmed
        # text, bodies keep the raw text (it is normalised when printing).
        if ($in_title) {
            $append_to_current->($p->get_trimmed_text);
        } else {
            $append_to_current->($p->get_text);
        }
    }
}

# Start of formating / printing

# Horizontal rule framing the mail header.
my $header_rule = "------------------------------------------------------------------------\n";

# Build one header line: $left at the start, $right at the end, padded with
# spaces in between so that the whole line is $width characters long.
my $two_column_line = sub {
    my ($left, $right, $width) = @_;
    return $left . " " x ($width - length($left) - length($right)) . $right . "\n";
};

# The date line is laid out on 74 columns for French and 72 otherwise.
# NOTE(review): per the revision history this corrects a date misalignment
# in French; the underlying cause is not visible here.
my $date_line_width = ($opts{l} eq "fr") ? 74 : 72;

print $header_rule;
print $two_column_line->($data->{header}{project}, "http://www.debian.org/", 72);
print $two_column_line->($data->{header}{dpn}, "debian-publicity\@lists.debian.org", 72);
print $two_column_line->($data->{header}{date}, $data->{header}{url}, $date_line_width);
print $header_rule, "\n";



# Print every story: underlined title, then each paragraph followed by the
# numbered list of links it refers to.
foreach my $stories (@{$data->{stories}}) {
    # Text collected before the first <h2> is stored without a title.
    if (defined($stories->{'title'})) {
        print $stories->{'title'} . "\n" . '-'x(length($stories->{'title'})). "\n";
    }
    foreach my $paragraph (@{$stories->{paragraph}}) {
        foreach my $subparagraph (@{$paragraph->{subparagraph}}) {
            # The subparagraph index is not reset at the end of a paragraph,
            # so a paragraph's list can contain slots that never received any
            # text.  They print nothing; skip them to avoid "uninitialized"
            # warnings.
            next unless defined($subparagraph)
                    and defined($subparagraph->{'text'});

            # Normalise whitespace: drop leading newlines, collapse runs of
            # whitespace, restore the two-space indent of list bullets and
            # remove stray spaces around punctuation and parentheses.
            $subparagraph->{'text'} =~ s/^\n*//g;
            $subparagraph->{'text'} =~ s/\n/ /g;
            $subparagraph->{'text'} =~ s/\s+/ /g;
            $subparagraph->{'text'} =~ s/ \*/  \*/g;
            $subparagraph->{'text'} =~ s/ \./\./g;
            $subparagraph->{'text'} =~ s/ ,/,/g;
            $subparagraph->{'text'} =~ s/\( /\(/g;
            $subparagraph->{'text'} =~ s/ \)/\)/g;

            # Texts of at most one character are treated as empty.
            next unless length($subparagraph->{'text'}) > 1;

            # Continuation lines of a list item are indented under its text.
            my $is_list_item = length($subparagraph->{'text'}) > 3
                && substr($subparagraph->{'text'}, 0, 3) eq "  *";
            my $continuation_indent = $is_list_item ? "    " : "";

            print wrap("", $continuation_indent, $subparagraph->{'text'});
            print "\n";
        }
        print "\n";
        foreach my $link (@{$paragraph->{'links'}}) {
            # Turn site-relative links into absolute URLs, deepest prefix
            # first.  The dots are escaped so that only a literal "../"
            # prefix matches, not any two characters followed by a slash.
            $link->{'link'} =~ s{^\.\./\.\./\.\./\.\.}{http://www.debian.org};
            $link->{'link'} =~ s{^\.\./\.\./\.\.}{http://www.debian.org/News};
            $link->{'link'} =~ s{^\.\./\.\.}{http://www.debian.org/News/weekly};
            print "   $link->{'index'} : $link->{'link'}\n";
        }
        print "\n";
    }
}

Reply to: