Chapter 4: Sources
This chapter lists the sources of LSE.
4.1: LSE-index
#!/usr/bin/perl
use strict;
use Cwd;
use Getopt::Std;
# LSE-index:
# Local Site Search Engine indexer
# Config section
# ==============
# Handlers for file types.
# Each key is a file name suffix; index_file() matches it against the end
# of the file name as a regular expression, so the leading "." matches any
# character. Each value is a shell command whose standard output is read as
# the text of the file; file_handle() replaces the token FULLPATH with the
# full path of the file before the command is started.
my %handlers = (
".htm" => "lynx -dump file://localhost/FULLPATH",
".html" => "lynx -dump file://localhost/FULLPATH",
".txt" => "cat FULLPATH",
".pdf" => "ps2ascii FULLPATH",
);
# Globals
# =======
my %opts; # program options, filled by getopts(): -w, -n WORDS, -m PERCENTAGE
my $dbname; # database name, set from the first command line argument
# Queries section
# ===============
# Write-pipe for queries.
# A single mysql client process is kept open between qry_open() calls and
# is fed one statement per call; qry_finalize() closes it again.
my $started = 0;    # true while the mysql write-pipe is open
my $oqry;           # handle of the mysql write-pipe

# qry_open QUERY
# Send QUERY (without trailing semicolon) to the mysql write-pipe, starting
# the mysql client first when it is not running yet. Dies when the client
# cannot be started or the statement cannot be written.
sub qry_open ($) {
    my $qry = shift;
    if (! $started) {
        # List-form pipe open: no shell is involved, so the database name
        # reaches mysql verbatim whatever characters it contains.
        open ($oqry, "|-", "mysql", $dbname)
            or die ("cannot start open qry: $!");
        $started++;
        # Unbuffer the pipe so that statements reach mysql immediately.
        my $oldh = select ($oqry);
        $| = 1;
        select ($oldh);
        print {$oqry} ("set autocommit=1;\n")
            or die ("cannot write to mysql: $!");
    }
    print {$oqry} ("$qry;\n")
        or die ("cannot write to mysql: $!");
}

# qry_finalize
# Close the mysql write-pipe, if one is open. Closing a pipe waits for the
# client to exit; a failure is reported as a warning because the statements
# have been handed over already and the run can continue.
sub qry_finalize () {
    return unless ($started);
    $started = 0;
    close ($oqry)
        or warn ("mysql write-pipe failed: ",
                 ($! ? $! : "exit status " . ($? >> 8)), "\n");
}
# qry_close QUERY
# Run a single query in its own mysql client and return the results as one
# flat list: the tab-separated columns of every output line, row after row.
# The first output line (the column names) is skipped. Statements without
# output, such as inserts, yield an empty list.
# The query is handed to mysql with -e in a list-form pipe open, so no shell
# sees it and quotes, backticks or dollar signs in it are passed unchanged.
sub qry_close ($) {
    my $qry = shift;
    my @ret;
    open (my $qr, "-|", "mysql", "-e", $qry, $dbname)
        or die ("cannot run qry $qry: $!\n");
    my $nlines = 0;
    LINE: while (my $line = <$qr>) {
        chomp ($line);
        # Skip the header line holding the column names.
        next LINE unless ($nlines++);
        # Limit -1 keeps trailing empty columns, so every row contributes
        # the same number of list elements.
        push (@ret, split (/\t/, $line, -1));
    }
    close ($qr)
        or warn ("qry $qry failed: ",
                 ($! ? $! : "exit status " . ($? >> 8)), "\n");
    return (@ret);
}
# Word handling section
# =====================
my %wordinfo;       # hit counters, keyed by "word:pageid"
my $linelen = 0;    # width of the current progress line printed for -w

# words_reset PAGEID
# Start afresh for a page: drop the collected hit counters, queue the
# removal of the hits stored for the page, and restart the progress line.
sub words_reset ($) {
    my ($page) = @_;
    %wordinfo = ();
    qry_open ("delete from hits where pageid = " . $page);
    $linelen = 0;
}
# words_insert WORD, PAGEID
# Count one occurrence of WORD on page PAGEID, unless WORD is one of the
# comma-separated non-words given with -n. With -w the word is echoed as
# well, prefixed with "+" (counted) or "-" (skipped), on lines that are
# wrapped once they would exceed 75 characters.
sub words_insert ($ $) {
    my ($word, $pageid) = @_;

    # Number of entries in the -n list that equal this word.
    my $skip = 0;
    if ($opts{'n'}) {
        $skip = grep { $word eq $_ } split (",", $opts{'n'});
    }

    $wordinfo{"$word:$pageid"}++ unless ($skip);

    return unless ($opts{'w'});

    if ($linelen + length ($word) > 75) {
        print ("\n");
        $linelen = 0;
    }
    print (($skip ? "-" : "+"), "$word ");
    # The marker and the trailing blank account for the two extra columns.
    $linelen += length ($word) + 2;
}
# words_flush
# Write the collected hit counters to the HITS table and close the query
# pipe. With -m PERCENTAGE the inserts are held back, polling "top" every
# three seconds, for as long as the first mysqld process listed uses more
# CPU than PERCENTAGE.
sub words_flush () {
    print ("Inserting words and hits into database.\n");

    # Check CPU usage of MySQL, if indicated by -m PERCENTAGE.
    if ($opts{'m'}) {
        my $busy;
        do {
            $busy = 0;
            print ("Checking MySQL usage.\n");
            open (my $top, "top -b -n1 |") or die ("Cannot start top\n");
            TPLINE: while (my $row = <$top>) {
                $row =~ s/\n//;
                # Column 9 is taken as the CPU usage and column 12 as the
                # command name.
                my @col = split (" ", $row);
                my ($cpu, $cmd) = @col[8, 11];
                next TPLINE unless ($cmd eq "mysqld");
                print ("MySQL CPU consumption: $cpu\n");
                next TPLINE unless ($cpu > $opts{'m'});
                print ("Consumption too high, waiting..\n");
                $busy = 1;
                last TPLINE;
            }
            close ($top);
            sleep (3) if ($busy);
        } while ($busy);
    }

    # One insert per word/page pair.
    foreach my $key (keys (%wordinfo)) {
        my ($word, $pageid) = split (":", $key);
        my $wordid = word_id ($word);
        qry_open ("insert into hits (wordid, pageid, hits) " .
                  "values ($wordid, $pageid, $wordinfo{$key})");
    }
    qry_finalize ();
}
# word_id WORD
# Return the id of WORD in the WORDS table. Ids are remembered in
# %word_ids; a word that is not in the table yet is inserted first.
# Dies when the word cannot be found even after the insert.
my %word_ids;
sub word_id ($) {
    my ($term) = @_;

    # Seen before in this run?
    my $known = $word_ids{$term};
    return ($known) if ($known ne "");

    # Ask the database, adding the word when it is missing.
    my $lookup = "select id from words where word = '$term'";
    my ($id) = qry_close ($lookup);
    if ($id eq "") {
        qry_close ("insert into words (word) values ('$term')");
        ($id) = qry_close ($lookup);
        die ("Failed to insert word\n") if ($id eq "");
    }

    $word_ids{$term} = $id;
    return ($id);
}
# File handling section
# =====================
# file_handle FULLPATH, HANDLER, PAGEID
# Index one file: run HANDLER (a shell command in which the token FULLPATH
# stands for the file) and count every lower-cased word of its output for
# page PAGEID. The hits stored earlier for the page are dropped first and
# the new ones are flushed to the database at the end.
# Dies when the handler cannot be started; a handler that exits with an
# error only produces a warning.
sub file_handle ($ $ $) {
    my ($fullpath, $handler, $pageid) = @_;

    # The handler is run by the shell, so the path is single-quoted (with
    # embedded single quotes escaped); blanks and shell metacharacters in
    # file names are then taken literally.
    (my $quoted = $fullpath) =~ s/'/'\\''/g;
    $handler =~ s{FULLPATH}{'$quoted'};
    open (my $in, "$handler |")
        or die ("failed to start $handler: $!");

    # Reset the page hits.
    words_reset ($pageid);

    while (my $line = <$in>) {
        $line =~ tr/\r\n//d;
        # Splitting at word boundaries yields words and the runs between
        # them; only parts holding a letter are counted.
        foreach my $part (split (/\b/, $line)) {
            my $word = lc ($part);
            words_insert ($word, $pageid) if ($word =~ /[a-z]/);
        }
    }
    close ($in)
        or warn ("handler $handler failed: ",
                 ($! ? $! : "exit status " . ($? >> 8)), "\n");
    print ("\n") if ($opts{'w'});

    # Now flush the hitinfo into the database.
    words_flush ();
}
# Indexing section
# ================
# File indexer
# index_file FILE, URI
# Index FILE, a name relative to the current directory, when it was never
# indexed or was modified since it was indexed last. The page URL is the
# full path with everything before the last occurrence of URI removed.
# Files without a matching entry in %handlers are reported and left alone;
# after a handler ran, the page stamp is set to the time the run started.
sub index_file ($ $) {
    my ($file, $uri) = @_;
    my $fullpath = getcwd () . "/$file";
    print ("File: $fullpath\n");
    my $modtime = (stat ($fullpath))[9];

    # URI is a literal path part, not a pattern: quote it so that "." or
    # "+" in it cannot match something else.
    my $url = $fullpath;
    $url =~ s/.*\Q$uri\E/$uri/;

    my ($id, $stamp) = pages_lookup ($url);
    if ($id eq "") {
        # Without an id every later statement for this page would be
        # malformed SQL.
        print ("WARNING: page could not be registered, skipping\n");
        return;
    }
    my @loc = localtime ($stamp);
    my $readable = sprintf ("%4.4d-%2.2d-%2.2d",
                            $loc[5] + 1900, $loc[4] + 1, $loc[3]);
    print ("ID: $id, last indexed on $readable ($stamp)\n");
    if ($stamp == 0) {
        print ("Never handled, indexing.\n");
    } elsif ($stamp < $modtime) {
        print ("File modified since last run, will try to index.\n");
    } else {
        print ("Indexing was recent enough, not re-indexing\n");
        return;
    }

    # The stamp is taken before indexing, so a file changed while it is
    # being indexed is picked up again by the next run.
    my $now = time ();
    my $found = 0;
    # Handler keys are matched as regular expressions against the end of
    # the file name.
    foreach my $re (keys (%handlers)) {
        if ($file =~ /$re$/) {
            file_handle ($fullpath, $handlers{$re}, $id);
            $found++;
        }
    }
    if (! $found) {
        print ("WARNING: no handler for file type\n");
    } else {
        qry_close ("update pages set stamp = $now where id = $id");
    }
}
# Directory Indexer
# index_dir LOCALPATH, RELATIVEURI
# Index the plain files in directory LOCALPATH, then descend into each of
# its subdirectories. The working directory is changed to LOCALPATH while
# it is processed and restored afterwards. Dies when a directory cannot
# be entered.
sub index_dir ($ $) {
    my ($localpath, $relativeuri) = @_;
    my $prevdir = getcwd ();

    # Banner between two rules of 78 "=" characters.
    my $rule = "=" x 78;
    print ("\n", $rule, "\nDirectory: $localpath\n", $rule, "\n");

    chdir ($localpath) or die ("cannot cd to $localpath: $!\n");
    $localpath = getcwd ();
    print ("Indexing directory: $localpath\n");

    # Files are handled right away, directories once all files are done.
    my @dirs;
    foreach my $entry (glob ("*")) {
        if (-f $entry) {
            index_file ($entry, $relativeuri);
        }
        if (-d $entry) {
            push (@dirs, $entry);
        }
    }
    foreach my $subdir (@dirs) {
        index_dir ($subdir, $relativeuri);
    }

    chdir ($prevdir) or die ("cannot cd to $prevdir: $!\n");
}
# Pages section
# =============
my %pageinfo;    # "id:stamp" of every known page, keyed by URL

# pages_getinfo
# Read the PAGES table into %pageinfo and report how many pages it holds.
sub pages_getinfo () {
    print ("Getting information on already processed pages.\n");
    # The result is a flat list of id, url, stamp triples.
    my @rows = qry_close ("select id,url,stamp from pages");
    my $count = 0;
    while (@rows) {
        my ($id, $uri, $stamp) = splice (@rows, 0, 3);
        $pageinfo{$uri} = "$id:$stamp";
        $count++;
    }
    print ("$count pages were previously processed\n");
}
# pages_lookup URI
# Look up a page. Return the ID and the stamp as a list.
# If the page wasn't in core yet, then it is added to the db with stamp 0
# and, once the db has given it an ID, to the in-core table as well. The
# ID is empty when the page could not be added.
sub pages_lookup ($) {
    my $uri = shift;

    # Do we have it yet?
    my $val = $pageinfo{$uri};
    return (split (":", $val)) if ($val ne "");

    # Nope.. add it. Quotes and backslashes are doubled so that the URI is
    # one intact SQL string literal whatever the file is called.
    (my $quoted = $uri) =~ s/(['\\])/$1$1/g;
    qry_close ("insert into pages (url, stamp) " .
               "values ('$quoted', 0)");
    my @ret = qry_close ("select id from pages where url = '$quoted'");

    # Remember the page, so that another lookup does not insert it again.
    $pageinfo{$uri} = "$ret[0]:0" if ($ret[0] ne "");
    return ($ret[0], 0);
}
# Show usage
# ==========
# usage
# Terminate the program with the usage text. The text ends in a newline,
# so no file name or line number is appended to it.
sub usage () {
    my @text = (
        "",
        "Usage: LSE-index [-flags] database localdir relativeurl",
        "Flags may be:",
        " -m PERCENTAGE: wait for MySQL until its CPU usage drops",
        " below PERCENTAGE",
        " -n WORD,WORD,WORD: skip these non-words",
        " -w: show words as we go along",
        "The database is the index db.",
        "The local directory is where the indexing will start.",
        "The relative URL is a URL-directory part to each file under",
        "the local directory.",
        "",
    );
    die (join ("\n", @text), "\n");
}
# Main starts here
# ----------------
# Parse the flags, then require an existing start directory and a
# non-empty relative URL; anything else ends in the usage text.
getopts ("wn:m:", \%opts) or usage ();
my ($database, $startdir, $baseuri) = @ARGV;
usage () if (! -d $startdir or $baseuri eq "");
$dbname = $database;
# Unbuffered output, so that progress shows up as it happens.
$| = 1;
pages_getinfo ();
index_dir ($startdir, $baseuri);
4.2: LSE.php
<script language="php">
# Central include file for LSE, the Local Search Engine.
# ------------------------------------------------------
# Defines
# -------
# Operation flags; each one is a separate bit, so they can be OR-ed.
define ("LSE_OR",         1 << 0);  # OR-search: any term may match
define ("LSE_AND",        1 << 1);  # AND-search: all terms must match
define ("LSE_VERBOSE",    1 << 2);  # Verbosity on
define ("LSE_MATCHSTART", 1 << 3);  # Match at start of term
define ("LSE_MATCHMID",   1 << 4);  # Match at middle of term
define ("LSE_DEFAULT",    LSE_OR);  # Default: OR-search
# Search object of LSE. The constructor connects to MySQL and runs the
# search; the hits are then fetched one at a time with nexturl() or
# nextpage(). Errors end the script through error().
class LSE {
    var $qry;          # result resource of the search; false once freed
    var $verbose;      # non-zero when LSE_VERBOSE is set in the flags
    var $limit;        # maximum number of hits handed out
    var $nreturned;    # number of hits handed out so far
    var $beyondlimit;  # true when more hits exist than were handed out
    var $logfile;      # file to which msg() appends; "" means no logging

    # Constructor.
    #   $dbname  database holding the index
    #   $user    MySQL user to connect as (on localhost, no password)
    #   $words   search terms, separated by white space
    #   $flags   OR-ed LSE_* flags
    #   $lim     maximum number of hits to hand out
    #   $logf    log file, "" for no logging
    function LSE ($dbname, $user, $words, $flags = LSE_DEFAULT,
                  $lim = 50, $logf = "") {
        # Remember the parameters.
        $this->verbose = ($flags & LSE_VERBOSE);
        $this->limit = max (0, (int) $lim);
        $this->nreturned = 0;
        $this->beyondlimit = false;
        $this->logfile = $logf;

        # Since the indexer stores words in lower case, we should
        # use lower case too.
        $words = strtolower ($words);

        # Initialize the DB connection.
        $link = mysql_pconnect ("localhost", $user)
            or $this->error ("Failed to connect to MySQL server on " .
                             "localhost as user $user.");
        $this->msg ("Connected to MySQL on localhost as user $user.");
        mysql_select_db ($dbname, $link)
            or $this->error ("Failed to select database $dbname: " .
                             mysql_error ());
        $this->msg ("Selected database: $dbname.");

        # Show what we have in the DB, when verbose.
        if ($this->verbose) {
            list ($pages, $latest, $nwords) = $this->info ();
            $this->msg ($pages . " indexed pages, latest at " .
                        $this->stamptotime ($latest) . ", " .
                        $nwords . " words in the index.");
            $this->msg ("Operating flags: $flags");
            if ($flags & LSE_OR)
                $this->msg ("OR matching is on.");
            if ($flags & LSE_AND)
                $this->msg ("AND matching is on.");
            if ($flags & LSE_VERBOSE)
                $this->msg ("VERBOSITY is on.");
            if ($flags & LSE_MATCHSTART)
                $this->msg ("STARTMATCH is on.");
            if ($flags & LSE_MATCHMID)
                $this->msg ("MIDMATCH is on.");
        }

        # Chop up the words. Empty terms (from repeated blanks) and
        # repeated terms are dropped: they would select nothing, and they
        # would upset the term count that AND matching relies on.
        $fields = array_values (array_unique (
            preg_split ("/\s+/", $words, -1, PREG_SPLIT_NO_EMPTY)));

        # Build up the words selection. Every term is escaped for use in
        # an SQL string; for the LIKE forms the wildcards % and _ in the
        # term itself are escaped too, so that they match literally.
        $conds = array ();
        foreach ($fields as $field) {
            $term = mysql_real_escape_string ($field, $link);
            if ($flags & LSE_MATCHSTART)
                $conds[] = "w.word like '" .
                           addcslashes ($term, "%_") . "%'";
            else if ($flags & LSE_MATCHMID)
                $conds[] = "w.word like '%" .
                           addcslashes ($term, "%_") . "%'";
            else
                $conds[] = "w.word = '$term'";
        }
        if ($conds)
            $wordsel = "(" . implode (" or ", $conds) . ")";
        else
            $wordsel = "(1 = 0)";    # no terms: select nothing
        $this->msg ("Word selection: $wordsel");

        # Build up the total select statement. One row more than the
        # limit is asked for; nexturl() uses it to tell whether there are
        # more hits than it hands out.
        $stmt = "select count(*) as nr, w.word, sum(h.hits) as total, p.url " .
                "from words w " .
                "inner join hits h on (w.id = h.wordid) " .
                "inner join pages p on (p.id = h.pageid) " .
                "where $wordsel " .
                "group by h.pageid ";
        if ($flags & LSE_AND)
            $stmt .= "having nr = " . count ($fields) . " ";
        $stmt .= "order by total desc limit " . ($this->limit + 1);
        $this->msg ("SQL statement: $stmt.");
        $this->qry = mysql_query ($stmt)
            or $this->error ("Bad SQL statement: $stmt");
        $this->msg ("Results set will be limited to " . $this->limit. ".");
    }

    # Number of hits handed out so far.
    function foundhits() {
        return ($this->nreturned);
    }

    # Next hit as array (url, number of hits), or an empty array when the
    # hits are exhausted or the limit is reached. The result set is freed
    # at that point; later calls keep returning an empty array.
    function nexturl () {
        if (! $this->qry)
            return (array ());
        # Row layout: nr, word, total, url.
        $row = mysql_fetch_row ($this->qry);
        if ($row and $this->nreturned < $this->limit) {
            $this->nreturned++;
            return (array ($row[3], $row[2]));
        }
        # A row beyond the limit means that there are more hits.
        if ($row)
            $this->beyondlimit = true;
        mysql_free_result ($this->qry);
        $this->qry = false;
        return (array ());
    }

    # Next hit as array (url, number of hits, page title), or an empty
    # array when there are no more. $base is prepended to the URL to get
    # the local file from which the title is read.
    function nextpage ($base) {
        $hit = $this->nexturl ();
        if (! $hit)
            return (array ());
        list ($url, $n) = $hit;
        # Construct the local filename and try to read it.
        $file = $base . $url;
        $this->msg ("Next page: " . $file . ", " . $n . " hits.");
        return (array ($url, $n, $this->titleof ($file)));
    }

    # First text of an HTML file, without tags and without the contents
    # of script and style blocks: up to $max characters followed by "...",
    # or the whole text when it is not longer than $max. Returns "" for a
    # file without an <html> tag.
    function contentsof ($file, $max = 200) {
        $this->msg ("CONTENTSOF starts.");
        if (! file_exists ($file))
            $this->error ("URL points to non-existing file " . $file);
        if (! ($fp = fopen ($file, "r")) )
            $this->error ("Cannot read " . $file);
        # Extract up to XX chars from the file.
        $total = "";
        $intagblock = 0;
        while (! feof ($fp)) {
            # Append to total.
            $line = rtrim (fgets ($fp, 4096));
            $this->msg ("Got line: " . htmlspecialchars($line));
            $addline = true;
            if ($intagblock) {
                $this->msg ("I am inside a TAG BLOCK.");
                if (preg_match ("/\/script>/i", $line) or
                    preg_match ("/\/style>/i", $line)
                   ) {
                    $intagblock = 0;
                    $line = preg_replace ("/.*\/script>/i", "", $line);
                    $line = preg_replace ("/.*\/style>/i", "", $line);
                    $this->msg ("ENDTAGBLOCK removed, now " .
                                htmlspecialchars($line));
                } else
                    $addline = false;
            }
            if ($addline) {
                # Lines are joined with a blank, except right after a
                # tag delimiter.
                $lastchar = ($total === "") ? "" : substr ($total, -1);
                if ($lastchar != ">" and $lastchar != "<")
                    $total .= " ";
                $total .= $line;
                $this->msg ("TOTAL is now: " . htmlspecialchars($total));
            }
            # Kill <script> ... </script> if we have that,
            # or <style> ... </style>.
            $total = preg_replace ("/<script.*>.*<\/script>/i", "", $total);
            $total = preg_replace ("/<style.*>.*<\/style>/i", "", $total);
            # Are we inside tag block that we should skip?
            if (preg_match ("/<script/i", $total) or
                preg_match ("/<style/i", $total)
               ) {
                $intagblock++;
                $this->msg ("Going into TAG BLOCK mode.");
                $total = preg_replace ("/<script.*/i", "", $total);
                $total = preg_replace ("/<style.*/i", "", $total);
                $this->msg ("TOTAL now: " . htmlspecialchars($total));
            }
            # Reached max yet?
            if (strlen (strip_tags($total)) > $max) {
                fclose ($fp);
                $this->msg ("CONTENTSOF ends.");
                # The tag is matched in any case and with attributes.
                if (preg_match ("/<html[\s>]/i", $total)) {
                    $this->msg ("CONTENTSOF: it is an HTML file.");
                    return (substr (strip_tags($total), 0, $max) . "...");
                } else {
                    $this->msg ("CONTENTSOF: Not HTML!");
                    return ("");
                }
            }
        }
        # File was too short.
        fclose ($fp);
        $this->msg ("CONTENTSOF reached EOF before MAX.");
        if (preg_match ("/<html[\s>]/i", $total)) {
            $this->msg ("CONTENTSOF: it is an HTML file.");
            return (strip_tags($total));
        } else {
            $this->msg ("CONTENTSOF: Not HTML!");
            return ("");
        }
    }

    # Title of an HTML file, without tags; "" when no complete
    # <title>...</title> shows up within the first 50 reads of the file.
    function titleof ($file) {
        if (! file_exists ($file))
            $this->error ("URL points to non-existing file " . $file);
        if (! ($fp = fopen ($file, "r")) )
            $this->error ("Cannot read " . $file);
        # Extract the title from the file.
        $total = "";
        $nlines = 0;
        while (! feof ($fp) and $nlines++ < 50) {
            # Append the current line to the total.
            $line = rtrim (fgets ($fp, 256));
            $lastchar = ($total === "") ? "" : substr ($total, -1);
            if ($lastchar != ">" and $lastchar != "<")
                $total .= " ";
            $total .= $line;
            # Do we have a title spec yet?
            if (preg_match ("/<title>/i", $total) and
                preg_match ("/<\/title>/i", $total)
               ) {
                # Got the title. Extract it!
                $this->msg ("HTML string with title: " .
                            htmlspecialchars($total));
                $title = preg_replace ("/.*<title>(.*)<\/title>.*/i", "$1",
                                       $total);
                $this->msg ("Extracted title: " .
                            htmlspecialchars($title));
                fclose ($fp);
                return (strip_tags ($title));
            }
        }
        fclose ($fp);
        # We did not get a title...
        return ("");
    }

    # True when there are more hits than nexturl() handed out.
    function limitexceeded () {
        return ($this->beyondlimit);
    }

    # Index statistics as array (number of pages, highest page stamp,
    # number of words).
    function info () {
        $q = mysql_query ("select count(*) from pages");
        list ($npages) = mysql_fetch_row ($q);
        mysql_free_result ($q);
        $q = mysql_query ("select max(stamp) from pages");
        list ($maxstamp) = mysql_fetch_row ($q);
        mysql_free_result ($q);
        $q = mysql_query ("select count(*) from words");
        list($nwords) = mysql_fetch_row ($q);
        mysql_free_result ($q);
        return (array ($npages, $maxstamp, $nwords));
    }

    # Format a time stamp as local time, "YYYY-MM-DD HH:MM:SS".
    function stamptotime ($stamp) {
        return (date ("Y-m-d H:i:s", (int) $stamp));
    }

    # Show an error message and end the script.
    function error ($msg) {
        die ("<p>Search engine error:<br>
<i>$msg</i>\n");
    }

    # Append a message to the log file, if there is one, and show it on
    # the page when verbose.
    function msg ($msg) {
        if ($this->logfile) {
            # Mode "a" creates the file when it does not exist yet.
            if ($fp = fopen ($this->logfile, "a")) {
                fputs ($fp, "$msg\n");
                fclose ($fp);
            }
        }
        if ($this->verbose) {
            echo ("<font size=\"1\"> $msg </font><br>\n");
            flush();
        }
    }
}
</script>
4.3: search.php
<!--
Here's a sample form that uses the LSE search indexer.
The form "calls itself" to make re-searching possible.
Also, the form may be called from other site parts.
-->
<script language="php">
# First, let's fetch the form variables and strip possibly offending
# information from them. This is always a good idea!
# The values are taken from the request itself, so the page does not
# depend on PHP registering them as globals; a variable that was not
# submitted (or that is not a plain string) becomes an empty string.
# -----------------------------------------------------------
foreach (array ("logical", "words", "debug", "matchmode") as $formvar) {
    $formval = isset ($_REQUEST[$formvar]) ? $_REQUEST[$formvar] : "";
    if (is_array ($formval))
        $formval = "";
    $$formvar = strip_tags ($formval);
}
unset ($formvar, $formval);
</script>
<html>
<head>
<title>Search Results</title>
<link rel="stylesheet" type="text/css" href="/css/style.css">
</head>
<body>
<h1>Search results</h1>
<hr>
<!--
This form presents the searching or re-searching
functionality. Note that when the form calls itself,
it will substitute previously entered options at the
right fields.
The form asks for the following info:
- HOW do we search: must all words match (logical AND)
or may any of the words match (logical OR);
- WHAT we search for, the list of terms
- HOW we match each term with the index: must the term
match exactly, must it be the start of a word, or may
it occur anywhere in the word. If you know SQL: this
will lead to
word = 'TERM' or
word like 'TERM%' or
word like '%TERM%'
The checkbox for "debugging" is of course optional.
This checkbox is used during development (by me).
-->
<form method="post" action="search.html">
<table>
<tr>
<td> Search terms: </td>
<td>
<select name="logical">
<option value="or"> Any word may match </option>
<option value="and"
<? if ($logical == "and") echo ("selected"); ?> >
All words must match </option>
</select>
of
<input type="text" name="words" size="30"
<? # The terms go into an attribute value: escape quotes and the like,
   # so that they cannot end the attribute and add markup of their own.
   if ($words != "")
       echo ("value=\"" . htmlspecialchars ($words) . "\""); ?> >
</td>
</tr>
<tr>
<td> Match mode: </td>
<td>
<select name="matchmode">
<option value="matchexact"
<? if ($matchmode == "matchexact") echo ("selected"); ?> >
Match terms exactly as stated </option>
<option value="matchstart"
<? if ($matchmode == "matchstart") echo ("selected"); ?> >
Match starts of terms </option>
<option value="matchmid"
<? if ($matchmode == "matchmid") echo ("selected"); ?> >
Match any part of terms </option>
</select>
</td>
</tr>
<tr>
<td> Start searching </td>
<td>
<input type="submit" value="Go!">
<font size="1"> Debugging: </font>
<input type="checkbox" name="debug"
<? if ($debug) echo ("checked"); ?> >
</td>
</tr>
</table>
</form>
<hr>
<script language="php">
# The following code is the PHP interface to the LSE class.
# ---------------------------------------------------------

# No words to search for? Don't fire up LSE then; just finish the page.
if ($words == "") {
    echo ("</body>\n</html>\n");
    exit ();
}

# Start the output for the results. We'll need LSE.php
# for the class definition. The terms are escaped for display.
echo ("Results for <i>" . htmlspecialchars ($words) . "</i>:<p>\n");
flush();
include ("/home/intranet/etc/LSE.php");

# Here, the operation flags get synthesized from the form
# values. The operation flags are OR-ed values, using the
# constants: LSE_VERBOSE to get lots of debugging output,
# LSE_OR or LSE_AND to specify logical "or" or "and"
# matching,
# LSE_MATCHSTART and LSE_MATCHMID: to have search terms
# match start- or midstrings of words.
# The flags start at zero explicitly, so that nothing but the
# form values below can switch a flag on.
$flags = 0;
if ($debug)
    $flags |= LSE_VERBOSE;
if ($logical == "and")
    $flags |= LSE_AND;
else
    $flags |= LSE_OR;
if ($matchmode == "matchstart")
    $flags |= LSE_MATCHSTART;
else if ($matchmode == "matchmid")
    $flags |= LSE_MATCHMID;

# Directory under which the indexed pages live.
$docroot = "/home/intranet/htdocs";

# This is the searching object. The constructor defines what
# we'll search for, and how. The arguments to the constructor are:
# - The database name holding the index;
# - The username to use when connecting;
# - the words to search for, as one string, space-separated;
# - optional: the operation flags. When not given, LSE_OR is
#   the default;
# - the limit for the # of returned rows. When not given,
#   50 is the default.
# - A log file where stuff is appended. Default is an empty string,
#   meaning no logging.
$lse = new LSE ("intranet", "intranet", $words,
                $flags, 50, "/tmp/LSE.log");

# Start outputting the results as a table.
echo ("<table>
<tr>
<td>
<font size=\"1\">
<b>Hits</b>
</font>
</td>
<td>
<font size=\"1\">
<b>Page, URL and first contents</b>
</font>
</td>
</tr>\n");

# Retrieve the results.
# The returned values are the (relative) URL to a page with
# a hit, the number of hits in that page, and the page title.
# The title may be an empty string, when the document doesn't have
# a title or when it's not an HTML document.
while (list ($url, $n, $title) = $lse->nextpage ($docroot)) {
    $contents = $lse->contentsof ($docroot . "/" . $url);
    echo ("<tr>
<td valign=\"top\" align=\"right\">
<font size=\"1\" color=\"red\"> $n </font>
</td>
<td valign=\"top\">
<font size=\"1\">
<a href=\"$url\"><b>$title</b></a>
(<a href=\"$url\">$url</a>) <br>
<i>$contents</i>
</font>
</td>
</tr>\n");
}
echo ("</table>\n");

# All done, the results up to the limit are shown.
# If the limit was exceeded, say so.
if ($lse->limitexceeded ())
    echo ("<p> There are more results. Try limiting your
search by adding words and by matching documents that
contain all of the stated words.\n");
else {
    $nhits = $lse->foundhits ();
    if (! $nhits)
        echo ("<p> Your search didn't produce any hits.
Try adding words and searching for documents that
contain any of the stated words; or try a different
search term.\n");
}

# Some stats. The word count gets a name of its own, so that the
# search terms in $words stay intact.
list ($pages, $latest, $nwords) = $lse->info ();
$l_str = $lse->stamptotime ($latest);
echo ("<p>
<font size=\"1\">
The index holds $nwords words in $pages pages,
last indexed on $l_str.
</font>\n");
# That's all folks!
</script>
</body>
</html>