Hi everybody, after some discussion with David and Cédric on IRC, here is a new version of DPNhtml2mail.pl. I know there are still some errors when processing some languages: "Wide character in print at ./DPNhtml2mail.pl line 243". I will look into it, but certainly not before the beginning of next week. Feedback is welcome. Regards, Thomas
#!/usr/bin/env perl

# DPNhtml2mail.pl - fetch one issue of the Debian Project News (DPN) as HTML
# and print it on standard output as wrapped plain text, with the links of
# each paragraph listed below it.
#
# Author: Jean-Edouard Babin, radius in gmail.com
# Author: Thomas Blein, tblein in tblein.eu (2012)

# Todo
#  - Debug need to be implemented/improved
#  - Better \n removing

# History
# Revision 1.3 04/05/2012
#   Check to prevent initialisation of substitution error
#   Add empty line after title
#   Correct subsequent indentation for list
#   Remove extra line when no links
#   Calculate length of string in UTF-8 for date and title
#   Space cleaning also in titles
#   Indentation of first line of paragraph in French
# Revision 1.2 02/05/2012
#   Remove indentation of first li
#   Correct date misalignment in French
#   Correction of indentation of the script
#   Take into account also quoting string in title
#   Correction of all the spacing problems
# Revision 1.1 01/02/2009 03:39
#   Split parse & print / Wrapping
# Revision 1.0 31/01/2009 23:35
#   Initial Version

use strict;
use warnings;

use HTML::TokeParser::Simple;
use LWP::UserAgent;
use Getopt::Long;
use Text::Wrap;
use Encode;

$Text::Wrap::columns = 80;

# Width of the mail header block (rule lines and left/right aligned lines).
my $HEADER_WIDTH = 72;

# Text put in front of every list item; continuation lines of a wrapped item
# are indented by the same width so they line up under the item text.
my $BULLET = ' * ';

# Parser state.  $data is the tree filled while parsing and read while
# printing:
#   $data->{header}{title,url,dpn,date,project}
#   $data->{stories}[i]{title}
#   $data->{stories}[i]{paragraph}[j]{subparagraph}[k]{text}
#   $data->{stories}[i]{paragraph}[j]{links}[n]{index,link}
my %opts;
my $data;
my $link_index         = 1;
my $footer_found       = 0;
my $stories_index      = 0;
my $paragraph_index    = 0;
my $subparagraph_index = 0;
my $in_title           = 0;

# Base config
my $default_url  = $opts{u} = 'http://www.debian.org/News/weekly/';
my $default_lang = $opts{l} = 'en';
$opts{d} = 0;

# Localised name of the project shown in the header; other languages fall
# back to the English name.
my %project_name_for = (
    fr => 'Projet Debian',
    es => 'El proyecto Debian',
    it => 'Il progetto Debian',
);

# Text substituted for <q> / </q>, per language.  Written with escapes
# (U+00AB is the left guillemet, U+00BB the right one) so that this file
# stays pure ASCII and does not depend on its own encoding.
my %quotes_for = (
    fr => { open => " \x{AB} ", close => " \x{BB} " },
    de => { open => " \x{BB}",  close => "\x{AB} " },
    es => { open => " \x{AB}",  close => "\x{BB} " },
    it => { open => " \x{AB}",  close => "\x{BB} " },
);
my %default_quotes = ( open => ' "', close => '" ' );

# Print the command line help on STDERR and exit with status 1.
sub usage {
    print STDERR "Usage: $0 -i issue [-l lang] [-u base_url] [-d]\n";
    print STDERR " -i issue number (i.e.: 2008/17)\n";
    print STDERR " -l langage (i.e.: fr). Default value is \"-l $default_lang\"\n";
    print STDERR " -u base_url: DPN common URL. Default value is \"-u $default_url\"\n";
    print STDERR " -d for verbose output\n";
    exit 1;
}

# Normalise the whitespace of a piece of text.
#   $string - text collected from the HTML page
# Returns a copy with leading newlines removed, every run of whitespace
# collapsed to one space, the space before '.', ',' and ')' and after '('
# removed, and one trailing whitespace character removed.
sub space_cleaning {
    my ($string) = @_;
    $string =~ s/\A\n+//;
    $string =~ s/\n/ /g;
    $string =~ s/\s+/ /g;
    $string =~ s/ \././g;
    $string =~ s/ ,/,/g;
    $string =~ s/\( /(/g;
    $string =~ s/ \)/)/g;
    $string =~ s/\s\z//;
    return $string;
}

# Append $chunk to the text of the subparagraph currently being filled.
sub append_text {
    my ($chunk) = @_;
    $data->{stories}[$stories_index]{paragraph}[$paragraph_index]
        {subparagraph}[$subparagraph_index]{text} .= $chunk;
    return;
}

# Append $chunk to the current story title while inside <h2>, otherwise to
# the current subparagraph.
sub append_title_or_text {
    my ($chunk) = @_;
    if ($in_title) {
        $data->{stories}[$stories_index]{title} .= $chunk;
    }
    else {
        append_text($chunk);
    }
    return;
}

# Build one header line: $left flushed left and $right flushed right inside
# $HEADER_WIDTH columns.  When both do not fit they are simply joined.
sub header_line {
    my ($left, $right) = @_;
    my $padding = $HEADER_WIDTH - length($left) - length($right);
    $padding = 0 if $padding < 0;
    return $left . ' ' x $padding . $right . "\n";
}

# Turn a link that climbs out of the issue directory with "../" into an
# absolute URL on www.debian.org.  The dots are escaped on purpose: an
# unescaped '.' matches any character and would rewrite unrelated links.
# Any other link is returned unchanged.
sub absolute_link {
    my ($url) = @_;
    $url =~ s{^\.\./\.\./\.\./\.\.}{http://www.debian.org}
        or $url =~ s{^\.\./\.\./\.\.}{http://www.debian.org/News}
        or $url =~ s{^\.\./\.\.}{http://www.debian.org/News/weekly};
    return $url;
}

# Option parsing
GetOptions(\%opts, 'u=s', 'i=s', 'l=s', 'd') or usage();
usage() if !defined $opts{i};

if ($opts{d}) {
    # Loaded at run time: a "use" inside an "if" is executed at compile time
    # whatever the condition is.
    require Data::Dumper;
}

# Everything handled below is character data, so encode it on the way out.
# This is what prevents "Wide character in print".
binmode(STDOUT, ':encoding(UTF-8)')
    or die "Can't set UTF-8 encoding on STDOUT: $!";

# HTML file fetching
my $issue_url = "$opts{u}$opts{i}/index.$opts{l}.html";
my $ua        = LWP::UserAgent->new;
$ua->agent("DPNhtml2mail");
my $res = $ua->get($issue_url);
if (!$res->is_success) {
    die "Can't fetch $issue_url " . $res->status_line;
}

# Parse characters, not bytes: decoded entities are characters too, and
# mixing them with raw UTF-8 bytes corrupts the output.
my $html = $res->decoded_content;
$html = Encode::decode('UTF-8', $res->content) if !defined $html;

# Start of parsing / storage
my $p = HTML::TokeParser::Simple->new(\$html);

$p->get_tag("h1");
$data->{header}{title} = $p->get_trimmed_text;
$data->{header}{url}   = "$opts{u}$opts{i}/";

# The <h1> text is split on " - " into the issue name and its date.
my ($dpn, $date) = split(/ - /, $data->{header}{title});
$data->{header}{dpn}     = defined $dpn  ? $dpn  : '';
$data->{header}{date}    = defined $date ? $date : '';
$data->{header}{project} = $project_name_for{ $opts{l} } || 'The Debian Project';

while (my $token = $p->get_tag) {
    if ($token->is_start_tag('h2')) {
        # A new story starts: skip to the next <a> and reset the counters.
        $p->get_tag("a");
        $paragraph_index    = 0;
        $subparagraph_index = 0;
        $stories_index++;

        # Get story name
        $in_title = 1;
        $data->{stories}[$stories_index]{'title'} = $p->get_trimmed_text;
    }
    elsif ($token->is_end_tag('h2')) {
        $in_title = 0;
    }
    elsif ($token->is_start_tag('a')) {
        # An <a> without href (undef) is handled like a common link and is
        # listed as '-', without the former uninitialized-value warning.
        my $href = $token->[1]{'href'};
        if (defined $href && $href =~ /^#\w+.*$/) {
            # Internal link: keep the text only
            append_text($p->get_trimmed_text);
        }
        else {
            # Common link: number it and remember its target
            append_text($p->get_trimmed_text . " [" . $link_index . "]");
            push(
                @{ $data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'links'} },
                { 'index' => $link_index++, 'link' => $href || '-' }
            );
        }
    }
    elsif ($token->is_tag('q')) {
        my $quotes = $quotes_for{ $opts{l} } || \%default_quotes;
        if ($token->is_start_tag('q')) {
            append_title_or_text($quotes->{open} . $p->get_trimmed_text);
        }
        elsif ($token->is_end_tag('q')) {
            # The closing mark is now the same in titles and in running text
            # (running text used to get an opening guillemet here).
            append_title_or_text($quotes->{close} . $p->get_trimmed_text);
        }
    }
    elsif ($token->is_end_tag('p')) {
        $paragraph_index++;
        append_text($p->get_text);
    }
    elsif ($token->is_start_tag('li')) {
        append_text($BULLET . $p->get_trimmed_text);
    }
    elsif ($token->is_end_tag('li')) {
        append_text("\n" . $p->get_trimmed_text);
        $subparagraph_index++;
    }
    elsif ($token->is_start_tag('hr')) {
        # Stop at the second <hr>; after the first one, skip two <p> tags
        # and the four tokens that follow.
        last if ($footer_found);
        $p->get_tag('p') for 1 .. 2;
        $p->get_token    for 1 .. 4;
        $footer_found = 1;
    }
    elsif ($token->is_start_tag('ul')) {
        # Nothing to do: the text following <ul> is ignored
    }
    else {
        if ($in_title) {
            $data->{stories}[$stories_index]{'title'} .= $p->get_trimmed_text;
        }
        else {
            append_text($p->get_text);
        }
    }
}

# -d: dump the parsed structure on STDERR so that STDOUT stays usable.
print STDERR Data::Dumper::Dumper($data) if $opts{d};

# Start of formating / printing
my $rule = '-' x $HEADER_WIDTH . "\n";
print $rule;
print header_line($data->{header}{project}, 'http://www.debian.org/');
print header_line($data->{header}{dpn},     'debian-publicity@lists.debian.org');
print header_line($data->{header}{date},    $data->{header}{url});
print $rule, "\n";

foreach my $story (@{ $data->{stories} || [] }) {
    next if !defined $story;

    if (defined($story->{'title'})) {
        my $title = space_cleaning($story->{'title'});
        # The title is character data, so length() is its displayed width.
        print $title . "\n" . '-' x length($title) . "\n\n";
    }

    foreach my $paragraph (@{ $story->{paragraph} || [] }) {
        my $para = $paragraph || {};

        foreach my $subparagraph (@{ $para->{subparagraph} || [] }) {
            next if !defined $subparagraph;
            my $text = $subparagraph->{'text'};
            next if !defined($text) || length($text) <= 1;

            $text = space_cleaning($text);
            if ($text =~ /\A \*/) {
                # List item: hang the following lines under the item text.
                # (The former test compared three characters with " *" and
                # could never succeed.)
                print wrap("", ' ' x length($BULLET), $text);
            }
            elsif ($opts{l} eq "fr") {
                # French: indent the first line of the paragraph
                print wrap(" ", "", $text);
            }
            else {
                print wrap("", "", $text);
            }
            print "\n";
        }

        if (defined($para->{'links'})) {
            print "\n";
            foreach my $link (@{ $para->{'links'} }) {
                my $url = absolute_link($link->{'link'});
                print " $link->{'index'} : $url\n";
            }
        }
        print "\n";
    }
}
Attachment:
signature.asc
Description: Digital signature