Re: New version of DPNhtml2mail.pl

To: debian-publicity@lists.debian.org
Subject: Re: New version of DPNhtml2mail.pl
From: Thomas Blein <tblein@tblein.eu>
Date: Thu, 03 May 2012 03:26:55 +0200
Message-id: <[🔎] 4FA1DEDF.4040402@tblein.eu>
In-reply-to: <[🔎] jnsgre$bei$1@dough.gmane.org>
References: <20120402141150.GF5952@zouish.org> <20120402204845.GY21443@sid.nuvreauspam> <jld42b$ht3$1@dough.gmane.org> <20120403204849.GD21443@sid.nuvreauspam> <[🔎] 20120502221451.GB4855@yopbook.tblein.eu> <[🔎] jnsgre$bei$1@dough.gmane.org>

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi,

Le 03/05/12 01:43, David Prévot a écrit :
> Le 02/05/2012 18:14, Thomas Blein a écrit :
>> # Correction of indentation of the script
> 
> Do you have a version without that change? Or at worst, a way to 
> apply the same changes to the current version: it's pretty hard to
>  figure out what is the actual code edited.

I managed to revert all the indentation correction. Still a lot of
changes but maybe clearer for the diff.
I did this correction of the indentation mainly because it was
difficult for me to follow the code.

Regards,

Thomas
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.9 (Darwin)

iQEcBAEBAgAGBQJPod7eAAoJEPrijPU0jEt7ifIH/A/eDz3o35jKQe6wB+XJTOfv
AgxvIxGzK8hjKc+jxk8t3Xj/FdSb9sMS5xJPDiOuGZu/yCphiqRrOKA2rPCRhSKH
rwEgRBN3rNSOc7AkLhOmst3UNzwNmlTWAr379Q+Ulg2pFUBLsMOgZLDaTPyCgEsu
PR6IqwGoUt5jlh8Cme9hPVrsfHLMCL2P+lhwDRJ6pGDIWDAc/sZIDeKcHw85xqtz
Mn46AKs5NiO48OXdpZtc95/NbuOfzGqFKQLoZJ0h9+1RWyS+ZjLrDNeBRuyYbU4k
UsVCtUah/EINlRWEAwldDmeUTEQG/ccQdslmDa01C77I2gxAhOQNvQ8fm4yKgDk=
=ruZ/
-----END PGP SIGNATURE-----

#!/usr/bin/env perl

# Author: Jean-Edouard Babin, radius in gmail.com
# Author: Thomas Blein, tblein in tblein.eu (2012)

# Todo
# - Debug output needs to be implemented/improved
# - Better \n removing

# History
# Revision 1.2 02/05/2012
# Remove indentation of first li
# Correct date misalignment in French
# Correction of indentation of the script
# Take into account also quoting string in title

# Revision 1.1 01/02/2009 03:39
# Split parse & print / Wrapping

# Revision 1.0 31/01/2009 23:35
# Initial Version

use strict;
use warnings;

use HTML::TokeParser::Simple;
use LWP::UserAgent;
use Getopt::Long;
use Text::Wrap;

# Line width handed to Text::Wrap.
$Text::Wrap::columns = 80;

# Option values, the parsed document structure and a scratch list of links.
my (%opts, $data, @links);

# Counters and flags maintained while walking the HTML document.
my ($link_index, $footer_found) = (1, 0);
my ($stories_index, $paragraph_index, $subparagraph_index) = (0, 0, 0);

# Base config: defaults that are also quoted in the usage message.
my $default_url  = 'http://www.debian.org/News/weekly/';
my $default_lang = 'en';

$opts{u} = $default_url;
$opts{l} = $default_lang;
$opts{d} = 0;

# Option parsing
# Recognised switches: -u base URL, -i issue, -l language, -d debug flag.
# Parsed values land in %opts, overriding the defaults set above.
my $options_ok = GetOptions(\%opts, 'u=s', 'i=s', 'l=s', 'd');

# Print the usage text and stop when the issue is missing, or when
# GetOptions reported a malformed command line (it returns false then).
if (!$options_ok || !defined($opts{i})) {
	print STDERR
		"Usage: $0 -i issue [-l lang] [-u base_url] [-d]\n",
		" -i issue number (i.e.: 2008/17)\n",
		" -l langage (i.e.: fr). Default value is \"-l $default_lang\"\n",
		" -u base_url: DPN common URL. Default value is \"-u $default_url\"\n",
		" -d for verbose output\n";
	exit 1;
}

# Load the debugging helper only when -d was given. A "use" statement is
# executed at compile time whatever "if" surrounds it, so a run-time
# "require" plus an explicit import is needed to make the load conditional.
if ($opts{d}) {
	require Data::Dumper; #useful for debug only
	Data::Dumper->import;
}

# HTML file fetching
# The page address is <base url><issue>/index.<lang>.html.
my $issue_page_url = "$opts{u}$opts{i}/index.$opts{l}.html";

my $ua = LWP::UserAgent->new;
$ua->agent("DPNhtml2mail");

# Issue a plain GET and abort with the HTTP status line if it fails.
my $req = HTTP::Request->new(GET => $issue_page_url);
my $res = $ua->request($req);
die "Can't fetch $issue_page_url ".$res->status_line unless $res->is_success;

# Start of parsing / storage
my $p = HTML::TokeParser::Simple->new(\$res->content);
my $token = $p->get_tag("h1");

# The text that follows the first <h1> is the page title. It is split on
# " - ": the first piece is stored under "dpn", the second under "date".
my $page_title = $p->get_trimmed_text;
my ($dpn_part, $date_part) = split(/ - /, $page_title);

# Project name shown in the banner, per language; any language not listed
# here falls back to the English wording.
my %project_name_for = (
	fr => 'Projet Debian',
	es => 'El proyecto Debian',
	it => 'Il progetto Debian',
);

$data->{header}{title}   = $page_title;
$data->{header}{url}     = "$opts{u}$opts{i}/";
$data->{header}{dpn}     = $dpn_part;
$data->{header}{date}    = $date_part;
$data->{header}{project} = exists $project_name_for{$opts{l}}
	? $project_name_for{$opts{l}}
	: 'The Debian Project';

# True between <h2> and </h2>: text met there belongs to the story title.
my $in_title = 0;

# Opening and closing marks used to render <q>...</q>, per language.
# The guillemets are spelled as explicit UTF-8 byte sequences
# (\xC2\xAB = left guillemet, \xC2\xBB = right guillemet, \xC2\xA0 =
# no-break space) so the output does not depend on how this file is saved.
# The page is handled as raw bytes ($res->content).
# NOTE(review): assumes the fetched pages are UTF-8 encoded - confirm.
my %quote_marks_for = (
	fr => [ " \xC2\xAB\xC2\xA0", "\xC2\xA0\xC2\xBB " ],
	de => [ " \xC2\xBB",         "\xC2\xAB " ],
	es => [ " \xC2\xAB",         "\xC2\xBB " ],
	it => [ " \xC2\xAB",         "\xC2\xBB " ],
);
my @default_quote_marks = ( " \"", "\" " );

# Append text to the subparagraph selected by the current indexes.
my $append_to_paragraph = sub {
	my ($text) = @_;
	$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'subparagraph'}[$subparagraph_index]{'text'} .= $text;
	return;
};

# Append text to the story title while inside <h2>, else to the paragraph.
my $append_to_current = sub {
	my ($text) = @_;
	if ($in_title) {
		$data->{stories}[$stories_index]{'title'} .= $text;
	} else {
		$append_to_paragraph->($text);
	}
	return;
};

# Walk every tag after the <h1>, filling $data->{stories}.
while (my $token = $p->get_tag) {
	if ($token->is_start_tag('h2')) {
		# A new story starts: skip to the next <a> tag, reset the
		# paragraph counters and take the following text as the title.
		$p->get_tag("a");
		$paragraph_index = 0;
		$subparagraph_index = 0;
		$stories_index++;
		# Get story name
		$in_title = 1;
		$data->{stories}[$stories_index]{'title'} = $p->get_trimmed_text;
	} elsif ($token->is_end_tag('h2')) {
		$in_title = 0;
	} elsif ($token->is_start_tag('a')) {
		# An <a> without href is treated like a common link and listed
		# as '-'; testing definedness first avoids a warning.
		my $href = $token->[1]{'href'};
		if (defined($href) and $href =~ /^#\w+.*$/) {	# First internal links
			$append_to_paragraph->($p->get_trimmed_text);
		} else {					# Common link
			$append_to_paragraph->($p->get_trimmed_text . " [".$link_index."]");
			push(@{$data->{stories}[$stories_index]{'paragraph'}[$paragraph_index]{'links'}}, { 'index' => $link_index++, 'link' => $href || '-' });
		}
	} elsif ($token->is_tag('q')) {
		# The same pair of marks is used in titles and in paragraphs.
		my ($opening_mark, $closing_mark) = @{ $quote_marks_for{$opts{l}} || \@default_quote_marks };
		if ($token->is_start_tag('q')) {
			$append_to_current->($opening_mark . $p->get_trimmed_text);
		} elsif ($token->is_end_tag('q')) {
			$append_to_current->($closing_mark . $p->get_trimmed_text);
		}
	} elsif ($token->is_end_tag('p')) {
		# End of paragraph: move on to the next paragraph slot.
		@links = ();
		$paragraph_index++;
		$append_to_paragraph->($p->get_text);
	} elsif ($token->is_tag('li')) {
		if ($token->is_start_tag('li')) {
			$append_to_paragraph->("  * ".$p->get_trimmed_text);
		} elsif ($token->is_end_tag('li')) {
			# Each list item ends its own subparagraph.
			$append_to_paragraph->("\n" . $p->get_trimmed_text);
			$subparagraph_index++;
		}
	} elsif ($token->is_start_tag('hr')) {
		# The second <hr> ends the parsing; after the first one some
		# content is skipped.
		last if ($footer_found);
		$p->get_tag('p');
		$p->get_tag('p');
		# NOTE(review): get_token is documented without a tag argument,
		# so these calls presumably just consume the next four tokens -
		# confirm whether get_tag('p') was intended.
		$p->get_token('p');
		$p->get_token('p');
		$p->get_token('p');
		$p->get_token('p');
		$footer_found = 1;
	} elsif ($token->is_start_tag('ul')) {
		# Nothing to collect on <ul>.
	} else {
		# Any other tag: keep the text that follows it. Titles get the
		# trimmed text, paragraphs the raw text.
		$append_to_current->($in_title ? $p->get_trimmed_text : $p->get_text);
	}
}

# Start of formatting / printing
# Banner: three lines framed by two rules. Each line has a left-hand
# field, a run of spaces, then a right-hand field.
{
	my $banner = $data->{header};
	my $rule   = "------------------------------------------------------------------------\n";

	# The date line is padded against 74 for French and 72 otherwise.
	my $date_line_width = ($opts{l} eq "fr") ? 74 : 72;

	my $project_gap = " " x (72 - length($banner->{project}) - 22);
	my $dpn_gap     = " " x (72 - length($banner->{dpn}) - 33);
	my $date_gap    = " " x ($date_line_width - length($banner->{date}) - length($banner->{url}));

	print $rule;
	print $banner->{project}, $project_gap, "http://www.debian.org/\n";
	print $banner->{dpn}, $dpn_gap, "debian-publicity\@lists.debian.org\n";
	print $banner->{date}, $date_gap, "$banner->{url}\n";
	print $rule, "\n";
}



# Print every story: the title underlined with dashes, then each paragraph
# as wrapped text followed by the list of links it references.
foreach my $stories (@{$data->{stories}}) {
	if (defined($stories->{'title'})) {
		print $stories->{'title'} . "\n" . '-'x(length($stories->{'title'})). "\n";
	}

	foreach my $paragraph (@{$stories->{paragraph}}) {
		foreach my $subparagraph (@{$paragraph->{subparagraph}}) {
			# The subparagraph index is not reset between paragraphs, so
			# a paragraph's list can contain empty leading slots; skip
			# them instead of running substitutions on undef.
			next unless defined($subparagraph) and defined($subparagraph->{'text'});

			# Whitespace clean-up: drop leading newlines, fold newlines
			# and runs of whitespace into single spaces, restore the
			# two-space indent before a bullet, and remove the stray
			# space before "." "," ")" and after "(".
			$subparagraph->{'text'} =~ s/^\n*//g;
			$subparagraph->{'text'} =~ s/\n/ /g;
			$subparagraph->{'text'} =~ s/\s+/ /g;
			$subparagraph->{'text'} =~ s/ \*/  \*/g;
			$subparagraph->{'text'} =~ s/ \./\./g;
			$subparagraph->{'text'} =~ s/ ,/,/g;
			$subparagraph->{'text'} =~ s/\( /\(/g;
			$subparagraph->{'text'} =~ s/ \)/\)/g;

			# Texts of at most one character are not printed.
			if (length($subparagraph->{'text'}) > 1) {
				print wrap("", "", $subparagraph->{'text'});
				print "\n";
			}
		}
		print "\n";

		foreach my $link (@{$paragraph->{'links'}}) {
			# Turn relative links into absolute ones, deepest prefix
			# first. The dots are escaped so that only a literal ".."
			# path segment matches, not any two characters.
			$link->{'link'}  =~ s,^\.\./\.\./\.\./\.\.,http://www.debian.org,;
			$link->{'link'}  =~ s,^\.\./\.\./\.\.,http://www.debian.org/News,;
			$link->{'link'}  =~ s,^\.\./\.\.,http://www.debian.org/News/weekly,;
			print "   $link->{'index'} : $link->{'link'}\n";
		}
		print "\n";
	}
}

Reply to:

References:
- New version of DPNhtml2mail.pl [was: Possible bug when generating mail version]
  - From: Thomas Blein <dageou@yahoo.fr>
- Re: New version of DPNhtml2mail.pl
  - From: David Prévot <taffit@debian.org>

Prev by Date: Re: New version of DPNhtml2mail.pl
Next by Date: Re: New version of DPNhtml2mail.pl
Previous by thread: Re: New version of DPNhtml2mail.pl
Next by thread: Re: New version of DPNhtml2mail.pl
Index(es):
- Date
- Thread