Re: New version of DPNhtml2mail.pl

To: debian-publicity@lists.debian.org
Subject: Re: New version of DPNhtml2mail.pl
From: Thomas Blein <dageou@yahoo.fr>
Date: Fri, 4 May 2012 01:04:14 +0200
Message-id: <[🔎] 20120503230414.GE4890@yopbook.tblein.eu>
In-reply-to: <[🔎] jnsgre$bei$1@dough.gmane.org>
References: <20120402141150.GF5952@zouish.org> <20120402204845.GY21443@sid.nuvreauspam> <jld42b$ht3$1@dough.gmane.org> <20120403204849.GD21443@sid.nuvreauspam> <[🔎] 20120502221451.GB4855@yopbook.tblein.eu> <[🔎] jnsgre$bei$1@dough.gmane.org>

Hi everybody,

after some discussion with David and Cédric on IRC, here is a new version
of DPNhtml2mail.pl

I know there are still some errors when processing some languages:
"Wide character in print at ./DPNhtml2mail.pl line 243"

I will look into it, but certainly not before the beginning of next week.

Feedback is welcome,

Regards,

Thomas

#!/usr/bin/env perl

# Author: Jean-Edouard Babin, radius in gmail.com
# Author: Thomas Blein, tblein in tblein.eu (2012)

# Todo
# - Debug need to be implemented/improved
# - Better \n removing

# History
# Revision 1.3 04/05/2012
# Check to prevent initialisation of substitution error
# Add empty line after title
# Correct subsequent indentation for list
# Remove extra line when no links 
# Calculate length of string in UTF-8 for date and title
# Space cleaning also in titles
# Indentation of first line of paragraph in French

# Revision 1.2 02/05/2012
# Remove indentation of first li
# Correct date misalignment in French
# Correction of indentation of the script
# Take into account also quoting string in title
# Correction of all the spacing problems

# Revision 1.1 01/02/2009 03:39
# Split parse & print / Wrapping

# Revision 1.0 31/01/2009 23:35
# Initial Version

use strict;
use warnings;

use HTML::TokeParser::Simple;
use LWP::UserAgent;
use Getopt::Long;
use Text::Wrap;
use Encode;

# Column setting consulted by the Text::Wrap wrap() calls in the printing part.
$Text::Wrap::columns = 80;
# Command line options, filled by GetOptions: u (base url), i (issue),
# l (language) and d (debug flag).
my %opts;
# Hash reference receiving everything extracted from the page:
# $data->{header} and $data->{stories}.
my $data;
# Only ever emptied by the parsing loop; nothing in this script fills it.
my @links;

# Number given to the next external link; incremented each time a link is stored.
my $link_index = 1;
# Set to 1 once the first <hr> has been handled; the next <hr> stops the parsing.
my $footer_found = 0;
# Positions inside $data->{stories}: current story, paragraph of that story
# and subparagraph of that paragraph.
my $stories_index = 0;
my $paragraph_index = 0;
my $subparagraph_index = 0;

# Base config
# The defaults are stored in %opts (so that GetOptions can override them) and
# kept in $default_url / $default_lang for the usage message.
my $default_url  = $opts{u} = 'http://www.debian.org/News/weekly/';
my $default_lang = $opts{l} = 'en';
                   $opts{d} = 0;

# Tidy the whitespace of a piece of text taken from the HTML page.
#
# Takes one string and returns the cleaned copy:
#  - leading newlines are dropped and remaining newlines become spaces;
#  - every run of whitespace is collapsed into a single space;
#  - a space in front of "*" is doubled (list bullets keep a two space indent);
#  - the space before "." "," ")" and the space after "(" are removed;
#  - one trailing whitespace character is removed.
sub space_cleaning {
    my ($string) = @_;

    # Alias $_ to the local copy so that each substitution applies to it.
    for ($string) {
        s/^\n*//g;
        s/\n/ /g;
        s/\s+/ /g;
        s/ \*/  \*/g;
        s/ \./\./g;
        s/ ,/,/g;
        s/\( /\(/g;
        s/ \)/\)/g;
        s/\s$//;
    }

    return $string;
}

# Option parsing
#
# Options are stored in %opts:
#   -i issue     mandatory
#   -l lang      optional
#   -u base_url  optional
#   -d           flag
# The usage text is shown, and the script exits with status 1, when the
# command line cannot be parsed (GetOptions returns false) or when no issue
# was given.
if (!GetOptions(\%opts, 'u=s', 'i=s', 'l=s', 'd') || !defined($opts{i})) {
	print STDERR "Usage: $0 -i issue [-l lang] [-u base_url] [-d]\n";
	print STDERR " -i issue number (i.e.: 2008/17)\n";
	print STDERR " -l langage (i.e.: fr). Default value is \"-l $default_lang\"\n";
	print STDERR " -u base_url: DPN common URL. Default value is \"-u $default_url\"\n";
	print STDERR " -d for verbose output\n";
	exit 1;
}

if ($opts{d} == 1) {
	# "use" is executed at compile time whatever the surrounding condition is;
	# require + import load the module at run time, only when -d was given.
	require Data::Dumper; #useful for debug only
	Data::Dumper->import;
}

# HTML file fetching
my $ua = LWP::UserAgent->new;
$ua->agent("DPNhtml2mail");

# Address of the requested issue in the requested language.
my $page_url = "$opts{u}$opts{i}/index.$opts{l}.html";

my $req = HTTP::Request->new(GET => $page_url);
my $res = $ua->request($req);
if (! $res->is_success) {
	# Nothing can be done without the page: stop and report the HTTP status.
	die "Can't fetch $page_url ".$res->status_line;
}

# Start of parsing / storage
#
# The fetched page is read tag by tag and stored in $data:
#   $data->{header}{title,url,dpn,date,project}
#   $data->{stories}[s]{title}
#   $data->{stories}[s]{paragraph}[p]{subparagraph}[sp]{text}
#   $data->{stories}[s]{paragraph}[p]{links}[n]{index,link}
# $stories_index is incremented before the first title is stored, so entry 0
# of $data->{stories} only receives what is found before the first <h2>.
my $p = HTML::TokeParser::Simple->new(\$res->content);

# The text following the first <h1> is the page title, split on " - " into
# the newsletter name (dpn) and the date.
$p->get_tag("h1");
$data->{header}{title} = $p->get_trimmed_text;
$data->{header}{url}   = "$opts{u}$opts{i}/";
($data->{header}{dpn},$data->{header}{date}) = split(/ - /,$data->{header}{title});

# Project name shown in the mail header, by language.
my %project_name_for = (
	fr => 'Projet Debian',
	es => 'El proyecto Debian',
	it => 'Il progetto Debian',
);
$data->{header}{project} = exists $project_name_for{$opts{l}}
	? $project_name_for{$opts{l}}
	: 'The Debian Project';

# Text put in place of <q> (open) and </q> (close), by language.
my %quote_marks_for = (
	fr => { open => " « ", close => " » " },
	de => { open => " »", close => "« " },
	es => { open => " «", close => "» " },
	it => { open => " «", close => "» " },
);
# Marks used for every language without an entry in the table.
my $default_quote_marks = { open => " \"", close => "\" " };

# True between <h2> and </h2>: text then belongs to the story title.
my $in_title = 0;

while (my $token = $p->get_tag) {
	if ($token->is_start_tag('h2')) {
		# New story: skip to the <a> inside the heading and restart the
		# paragraph counters.
		$p->get_tag("a");
		$paragraph_index = 0;
		$subparagraph_index = 0;
		$stories_index++;
		# Get story name
		$in_title = 1;
		$data->{stories}[$stories_index]{'title'} = $p->get_trimmed_text;
	} elsif ($token->is_end_tag('h2')) {
		$in_title = 0;
	} elsif ($token->is_start_tag('a')) {
		# An <a> without href is handled like an empty href (stored as '-'),
		# without triggering an "uninitialized value" warning in the match.
		my $href = $token->[1]{'href'};
		$href = '' unless defined $href;
		if ($href !~ /^#\w+.*$/) {				# Common link
			$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $p->get_trimmed_text . " [".$link_index."]";
			push(@{$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'links'}}, { 'index' => $link_index++, 'link' => $href || '-' });
		} else {								# First internal links
			$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $p->get_trimmed_text;
		}
	} elsif ($token->is_tag('q')) {
		# <q> becomes the opening mark and </q> the closing mark of the
		# language, in titles and in body text alike, followed by the text
		# found after the tag.
		my $marks  = $quote_marks_for{$opts{l}} || $default_quote_marks;
		my $mark   = $token->is_start_tag('q') ? $marks->{open} : $marks->{close};
		my $quoted = $mark . $p->get_trimmed_text;
		if ($in_title) {
			$data->{stories}[$stories_index]{'title'} .= $quoted;
		} else {
			$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $quoted;
		}
	} elsif ($token->is_end_tag('p')) {
		# End of a paragraph: what follows is stored in the next one.
		@links = ();
		$paragraph_index++;
		$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $p->get_text;
	} elsif ($token->is_tag('li')) {
		if ($token->is_start_tag('li')) {
			# List items are marked with an indented bullet.
			$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= "  * ".$p->get_trimmed_text;
		} elsif ($token->is_end_tag('li')) {
			# Each list item is closed and gets its own subparagraph.
			$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= "\n" . $p->get_trimmed_text;
			$subparagraph_index++;
		}
	} elsif ($token->is_start_tag('hr')) {
		# The second <hr> ends the parsing. After the first one, two <p>
		# tags and four more tokens are skipped.
		last if ($footer_found);
		$p->get_tag('p');
		$p->get_tag('p');
		# NOTE(review): get_token is documented without arguments, so the
		# 'p' given here is presumably ignored - confirm.
		$p->get_token('p');
		$p->get_token('p');
		$p->get_token('p');
		$p->get_token('p');
		$footer_found = 1;
	} elsif ($token->is_start_tag('ul')) {
		# Nothing to store for the start of a list.
	} else {
		# Any other tag: keep the text that follows it.
		if ($in_title) {
			$data->{stories}[$stories_index]{'title'} .= $p->get_trimmed_text;
		} else {
			$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $p->get_text;
		}
	}
}

# Start of formatting / printing

# Return the number of characters of a UTF-8 encoded byte string (0 for an
# undefined value). The argument is copied before being decoded.
sub display_width {
	my ($bytes) = @_;
	return 0 unless defined $bytes;
	return length(Encode::decode("utf-8", "$bytes"));
}

# Build one line of the mail header: $left at the start of the line, $right
# ending at column 72, spaces in between (none when both do not fit).
sub header_line {
	my ($left, $right) = @_;
	$left = '' unless defined $left;
	my $padding = 72 - display_width($left) - length($right);
	$padding = 0 if $padding < 0;
	return $left . " " x $padding . $right . "\n";
}

# Mail header: project / newsletter name / date on the left, addresses on
# the right, between two rules.
print "------------------------------------------------------------------------\n";
print header_line($data->{header}{project}, "http://www.debian.org/");
print header_line($data->{header}{dpn}, "debian-publicity\@lists.debian.org");
print header_line($data->{header}{date}, $data->{header}{url});
print
"------------------------------------------------------------------------\n\n";



foreach my $stories (@{$data->{stories}}) {
	# Title underlined with as many dashes as it has characters, then an
	# empty line.
	if (defined($stories->{'title'})) {
		$stories->{'title'} = space_cleaning($stories->{'title'});
		print $stories->{'title'} . "\n" . ('-' x display_width($stories->{'title'})) . "\n\n";
	}
	foreach my $paragraph (@{$stories->{paragraph}}) {
		foreach my $subparagraph (@{$paragraph->{subparagraph}}) {
			# Texts of at most one character are not printed.
			if (defined($subparagraph->{'text'}) and length($subparagraph->{'text'}) > 1) {
				$subparagraph->{'text'} = space_cleaning($subparagraph->{'text'});

				if ((length($subparagraph->{'text'}) > 3) and ((substr $subparagraph->{'text'}, 0, 3) eq "  *")) {
					# List item: following lines are aligned with the item text.
					print wrap("", "    ", $subparagraph->{'text'});
				} elsif ($opts{l} eq "fr") {
					# French: the first line of a paragraph is indented.
					print wrap("  ", "", $subparagraph->{'text'});
				} else {
					print wrap("", "", $subparagraph->{'text'});
				}
				print "\n";
			}
		}
		# Links of the paragraph, one per line, after an empty line.
		if (defined($paragraph->{'links'})) {
			print "\n";
			foreach my $link (@{$paragraph->{'links'}}) {
				# Turn leading "../" sequences into absolute addresses. The
				# dots are escaped so that only real ".." path elements match,
				# and the longest prefix is tried first.
				$link->{'link'}  =~ s,^\.\./\.\./\.\./\.\.,http://www.debian.org,;
				$link->{'link'}  =~ s,^\.\./\.\./\.\.,http://www.debian.org/News,;
				$link->{'link'}  =~ s,^\.\./\.\.,http://www.debian.org/News/weekly,;
				print "   $link->{'index'} : $link->{'link'}\n";
			}
		}
		print "\n";
	}
}

Attachment: signature.asc
Description: Digital signature

Reply to:

Follow-Ups:
- Re: New version of DPNhtml2mail.pl
  - From: David Prévot <taffit@debian.org>

References:
- New version of DPNhtml2mail.pl [was: Possible bug when generating mail version]
  - From: Thomas Blein <dageou@yahoo.fr>
- Re: New version of DPNhtml2mail.pl
  - From: David Prévot <taffit@debian.org>

Prev by Date: Re: New version of DPNhtml2mail.pl
Next by Date: Re: New version of DPNhtml2mail.pl
Previous by thread: Re: New version of DPNhtml2mail.pl
Next by thread: Re: New version of DPNhtml2mail.pl
Index(es):
- Date
- Thread