[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

CGI search script.



I don't recall seeing a search script on our website, but I could not
verify that today since it is down for a little while.  In any case, I
wrote up a small CGI script that parses the Packages file and looks
for matching strings, it will output the Package listing, with a URL
for the file, if it finds a match.  It is fairly primitive, but I
thought it might be useful.  Please mail me any suggestions, comments,
or whatevers.


Jim

#!/usr/bin/perl
#
# find_deb.pl -- a simple CGI search engine for Debian packages.
#
# This is a simple CGI search script to find Debian packages via the
# Web. It is meant to be used for simple packages, not really for
# interdependent packages.
#
# Pass the search string via the variable "search_string" in a POST from
# a WWW form.  The variable may be a single word or a perl regular
# expression.  It will search the Packages file and output a list of
# packages that match.
#
# The script uses a list generated from a 'find -name' run from the root
# of the FTP tree to try and match the filename in the Package field. 
# The files in this list  should be following format:
# (note the lack of leading './')
#	
#		pub/debian/binary-i386/admin/acct-5-12.deb      
#		pub/debian/binary-i386/admin/at-2.8a-2.deb      
#		pub/debian/binary-i386/admin/cfengine-1.2.14-1.deb
#
# If a grep through the file list finds a match for '/filename*.deb$', 
# the package name will be a URL to the found file.  You may also match
# the Depends, Recommends, etc. fields, but it is not advised due to the
# interdependencies of Debian packages.
#
# Modify the variables below before you put this script up.  Look at
# sub html_head and sub html_tail for the header/trailer HTML code.
#
#					Jim Robinson <jimr@simons-rock.edu> 
#					March 17, 1996

### Variables ###

$SERVER = 'ftp://ftp.debian.org';		# Our FTP server

$M_URL   = 'http://www.debian.org/';		# Our homepage URL
$M_EMAIL = 'webmaster\@debian.org';		# Our e-mail address.
$TITLE   = 'Debian GNU/Linux Search';		# URL Title
$HEAD    = 'Debian GNU/Linux Search';		# Printed Title.

# The following is the path to the Packages file.
$PACKAGES_PATH='/usr/src/ftp/pub/debian/Packages';
# The following is the path to a "find -name" of the main binary tree.
$FILELIST_PATH='/usr/src/ftp/pub/debian/find-name';

# The following are the package keys we try and find files for.
# It would be safer to use this field...
$CONTAINS_FILENAMES = 'package';
# However, if you use the big CONTAINS_FILENAMES field, make sure you warn
# people that interdependencies are fairly complex, and this search engine 
# will NOT find them all.
#$CONTAINS_FILENAMES = 'package|depends|recommended|optional';

# Names in a field.  Make sure they are in the order you want them
# printed in.
@field_names = ('Package',
	        'Section',
		'Version',
		'Maintainer',
		'Depends',
		'Recommends',
		'Suggests',
		'Conflics',
		'Provides',
		'Essential',
		'Description');

#### END OF VARIABLES ####





#
# Main
#

# Open filelist database.  This is slow -- we should use locate(1) or
# some sort of simple lookup program in C.
if (!open(FILELIST, "$FILELIST_PATH")) {
    html_head();
    print "Sorry, I couldn't open $FILELIST_PATH: ($!)<br>\n";
    html_tail();
    exit(-1);
}
@LIST = <FILELIST>;

# Get our regexp string.
if ($ENV{'REQUEST_METHOD'} eq 'GET' or $ENV{'REQUEST_METHOD'} eq 'POST') {
    ReadParse();
    $string = $in{'search_string'};
} else {
    $string = $ARGV[0];
}

# Find records and output...
html_head();
find_records();
html_tail();





# 
# Functions
#
sub find_records
{
    $count = 0;

    if (!open(PACKAGES, "$PACKAGES_PATH")) {
	print "Sorry, I couldn't open $PACKAGES_PATH: ($!)<br>\n";
	exit(-1);
    }

    while(<PACKAGES>) {
	chomp();		# strip newline

	if ($_ ne '') {		# It is NOT a new package name...
	    $package .= $_. "\n";
	} else {		# It IS a new package name...
	    if(grep(/$string/, $package)) {
		parse_record($package);
	    }
	    undef($package);
	}
    }
}

sub parse_record
{
    local($input) = @_;
    local($pkg, @lines);

    @lines = split(/\n/, $input);

    # Split apart the record into its component key/value pairs.
    # Descriptions are tricky, since they are multi-line.
    foreach $line (@lines) {
	if (($field ne 'Description') and ($line =~ m/^(\w+):\s+(.*)$/)) {
	    # Searched for, and found key/value pairs.
	    $pkg{$1} = $2;
	} elsif ($pkg{'Description'} ne '') {
	    # We are at the end (with the Description).
	    $pkg{'Description'} .= "\n$line";
	}
    }
    find_package_url(%pkg);	# Search for the file in the list.
}

sub find_package_url
{
    local(%pkg) = @_;
    local($file, $key, $path_to_file, $path_to_tree, @matchs, @urls);

    foreach $key (keys %pkg) {
	if ($key =~ m/$CONTAINS_FILENAMES/i) { # Does the key have filenames?
	    
	    undef(@urls);
	    @files = split(/,|\|/, $pkg{$key});

	    foreach $file (@files) {
		$file =~ s/\s//g;

		# Using a call to locate(1) or a specialized lookup
		# program probably be a LOT faster, but anyway...
		# 
		# If we find a match, use the first match and write out
		# The URL string, replacing the filename in the key/value 
		# pair with this string.  This needs improvement.
		if ((@matchs = grep(/\/$file\-.*\.deb$/, @LIST))) {
		    chomp($path_to_file = $matchs[0]);
		    $file = "<a href=\"$SERVER/$path_to_file\">$file</a>";
		} 
		push (@urls, $file);
	    }
	    $pkg{$key} = join(', ', @urls);
	}
    }

    print_output(%pkg);
}

sub print_output
{
    local(%pkg) = @_;

    $pkg{'Maintainer'} =~ s/<(.*)>/\&lt;<a href=\"mailto:$1\";>$1<\/a>\&gt;/g;
    $pkg{'Description'} =~ s/ \./<p>\n/g;
    # We do this ONCE, for the first linefeed.
    $pkg{'Description'} =~ s/\n/<br>\n/; 

    foreach $field (@field_names) {
	if ($pkg{$field} ne '') {
	    print "<b>$field:</b> $pkg{$field}<br>\n";
	}
    }
    print "\n<hr>\n";
}

#
# html_headxs - print out httpd needed info and title information.
#
sub html_head
{
    print "Content-type: text/html\n\n";
    print "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n";
    print "<html> <head>\n";
    print "<title>$TITLE</title>\n";
    print "</head>\n";
    print "\n";
    print "<body>\n";
    print "<h1>$HEAD</h1>\n";
    print "<hr><p>\n";
}

#
# html_tail - print out main site and contact information.
#
sub html_tail
{
    print "<hr>\n";
    print "<address><A HREF=\"$M_URL\">Debian GNU/Linux</A>&lt;<A HREF=\"mailto:$M_EMAIL\";>$M_EMAIL</A>&gt;</address>\n";
    print "</body> </html>\n";
}

#
# ReadParse from cgi-lib.pl - Parse data passed via POST or GET.
#
# S.E.Brenner@bioc.cam.ac.uk
# $Header: /people/seb1005/http/cgi-bin/RCS/cgi-lib.pl,v 1.2 1994/01/10 15:05:40 seb1005 Exp $
#
# Copyright 1993 Steven E. Brenner
# Unpublished work.
# Permission granted to use and modify this library so long as the
# copyright above is maintained, modifications are documented, and
# credit is given
#
sub ReadParse
{
    if (@_) {
	local (*in) = @_;
    }

    local ($i, $loc, $key, $val);

    # Read in text
    if ($ENV{'REQUEST_METHOD'} eq "GET") {
	$in = $ENV{'QUERY_STRING'};
    } elsif ($ENV{'REQUEST_METHOD'} eq "POST") {
	for ($i = 0; $i < $ENV{'CONTENT_LENGTH'}; $i++) {
	    $in .= getc;
	}
    }

    @in = split(/&/,$in);

    foreach $i (0 .. $#in) {
	# Convert plus's to spaces
	$in[$i] =~ s/\+/ /g;

	# Convert %XX from hex numbers to alphanumeric
	$in[$i] =~ s/%(..)/pack("c",hex($1))/ge;

	# Split into key and value.
	$loc = index($in[$i],"=");
	$key = substr($in[$i],0,$loc);
	$val = substr($in[$i],$loc+1);
	# \0 is the multiple separator
	$in{$key} .= '\0' if (defined($in{$key}));
	$in{$key} .= $val;
    }
    return 1; # just for fun
}

Reply to: