#!/usr/bin/perl
# Agnews Poster: parser.pl
# This script takes an article in the agnmore style format and adds
# appropriate html tags (paragraphs, writer/contact markup, title,
# etc.). The input file is taken from standard in and the output is
# printed to standard out.
#
# Author: James Dean Palmer
#
# Revision 1: 11-21-96
# Revision 2: 01-15-97
# Revision 3: 10-09-98
# Huge code clean up. Fixes "" bug. Fixes & bug.
# Can parse dates from any year. HTML output is much prettier.
# Generally more forgiving.
# ---------------------------------------------------------------------
# RESTORATION NOTE
# The previous copy of this script had been passed through an HTML
# stripper: every unescaped tag/entity inside the regular expressions,
# replacement strings and print statements was missing, and the file no
# longer compiled.  The markup below is reconstructed from the original
# comments and from the escaped closing tags that survived.  Anything
# that could not be inferred is marked NOTE(review) -- please confirm
# against an intact copy if one exists.
#
# Behavioural fixes made while restoring:
#   * strict/warnings, lexical variables, no reads of stale $1/$2.
#   * CRLF input and leading/trailing blank lines are tolerated.
#   * The dashed-line -> <hr> rule uses /m so it works per line.
#   * URLs are linked in a single pass (no re-matching inside anchors),
#     "https:" is recognised, trailing sentence punctuation is left out.
#   * The whitespace in front of an e-mail address is no longer eaten.
#   * The -30- marker must stand alone (whitespace on both sides), so
#     text such as "25-30-year-old" no longer truncates the article.
#   * The page title is extracted with /s and stripped of markup.
# ---------------------------------------------------------------------
use strict;
use warnings;

# Link target for the "home page" footer line.
# TODO: the original URL was lost together with the anchor tag; set it
# here to turn the footer text back into a link.
my $HOME_PAGE_URL = '';

# Convert the characters that are special in HTML to entities.  The
# incoming article is assumed to contain no HTML tags of its own.
sub _escape_html {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must run first so later entities survive
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Mark up the date (<h3>), the title (<h2>) and the Writer/Contact
# labels (<b>label</b> followed by an opening <i> for the people list).
sub _mark_up_heading {
    my ($doc) = @_;

    # "Writer:", "Writers:" and "Writer(s):" (same for Contact).
    my $label = qr/(?::|s:|\(s\):)/;

    # The first thing in an article is the date, which should always end
    # with a 4 digit year.  Close the date heading after the year and
    # start the title, which should come next.
    # NOTE(review): the opening <h3> was not visible in the damaged
    # source; it is emitted here so that the heading is balanced.
    if ($doc =~ s{(\d\d\d\d)\s*}{$1</h3>\n<h2>}) {
        $doc = '<h3>' . $doc;

        # The title ends with either the writer or the contact.  Any
        # white space between the title and </h2> is dropped here, before
        # the search for the start of the article, so that a blank line
        # after the title is not mistaken for the end of the header.
        unless ($doc =~ s{\s*((?:Writer|Contact)$label)}{</h2>\n$1}) {
            # No writer/contact line: the title is the rest of its line.
            $doc =~ s{(<h2>[^\n]*)}{$1</h2>};
        }
    }

    # Make the labels bold and start italics for the list of people.
    $doc =~ s{(Writer$label)}{<b>$1</b> <i>};
    $doc =~ s{(Contact$label)}{<b>$1</b> <i>};

    return $doc;
}

# Close the italics opened by _mark_up_heading and put a <br> between
# the writer/contact lines.  The header ends at the first blank line.
sub _mark_up_people {
    my ($doc) = @_;

    # The input was escaped, so a <b> can only come from a label.
    return $doc unless $doc =~ /<b>/;

    # Temporary marker for the start of the article text.  It cannot
    # clash with the input because "<" has already been escaped.
    my $marker = '<!-- article -->';
    $doc .= $marker unless $doc =~ s/\n\n/$marker/;

    # End the italics at the next bold label or at the article start.
    $doc =~ s{(<i>.*?)(<b>)}{$1</i>$2}s;
    $doc =~ s{(<i>.*?)(\Q$marker\E)}{$1</i>$2}s;

    # Make sure that italics end tags don't wrap to the next line.
    $doc =~ s{\n</i>}{</i>\n}g;

    # Replace the newlines in the writer/contact block with <br>'s and
    # leave a blank line behind it so the article starts a paragraph.
    $doc =~ s{(<b>.*)\Q$marker\E}{
        my $people = $1;
        $people =~ s/\n/\n<br>/g;
        "$people\n\n\n";
    }se;

    # If the first blank line came before the labels the marker is still
    # there; turn it back into the blank line it replaced.
    $doc =~ s/\Q$marker\E/\n\n/;

    return $doc;
}

# Turn the plain text layout of the article into paragraphs.
sub _mark_up_body {
    my ($doc) = @_;

    # Center -30- at the bottom of the page.  Opinion seems to vary on
    # how -30- is formatted, so trap the common spellings ("-30-",
    # "- 30 -", "---30---") and normalise them.  Everything after the
    # marker is discarded.
    # NOTE(review): the original captured that trailing text in $comment
    # but no use of it is visible in the damaged source.
    $doc =~ s{\s*(?<!\S)-{1,3}\s{0,5}30\s{0,5}-{1,3}(?!\S).*\z}
             {\n<center>-30-</center>\n}s;

    # Paragraph rules:
    # 1. newline + optional white space + newline starts a paragraph
    $doc =~ s{\n\s*\n+\s*}{\n<p>\n}g;
    # 2. newline + 2 white space characters starts a paragraph
    $doc =~ s{\n\s\s}{\n<p>}g;

    # Improve the html spacing: indent every line, but keep <p> in the
    # first column with its text directly behind it.
    # NOTE(review): indent width (2) inferred from the "\s\s" below.
    $doc =~ s/\n/\n  /g;
    $doc =~ s/\s\s<p>/<p>/g;
    $doc =~ s/<p>\s*/<p>/g;

    $doc =~ s/\s+\z//;
    return $doc;
}

# Wrap URLs and e-mail addresses in anchors.
sub _link_addresses {
    my ($doc) = @_;

    # All schemes are handled in one pass so that a later scheme cannot
    # match inside an anchor created for an earlier one.  The last
    # character must not be "." or ":" so sentence punctuation stays
    # outside the link.
    $doc =~ s{(news:[\w.]*\w|(?:https?|file|ftp|wais|gopher|telnet):[\w/.:+\-]*[\w/])}
             {<a href="$1">$1</a>}g;

    # something@something preceded by white space.  This is usually
    # pretty good at recognizing e-mail addresses and discounting other
    # common @ usages.  The leading white space is put back.
    $doc =~ s{(\s)([\w.\-]{2,20}\@[\w.\-]{1,31}\w)}
             {$1<a href="mailto:$2">$2</a>}g;

    return $doc;
}

# Convert an article in the agnmore style format to an HTML fragment.
#   $raw    - the complete article text (undef is treated as empty)
# Returns the list ($title, $body): the plain-text article title (empty
# when no title could be found) and the marked-up article.
sub format_article {
    my ($raw) = @_;
    my $doc = defined $raw ? $raw : '';

    $doc =~ s/\r\n?/\n/g;          # DOS/Mac line ends
    $doc = _escape_html($doc);

    # Change a line of 5 or more dashes to an HR bar.
    $doc =~ s/^-{5,}$/<hr>/gm;

    # Notices provide a way of adding comments for the mailing list
    # recipients.  They are terminated by one blank line and could
    # appear anywhere in the file.  Only the first one is removed; one
    # notice per article is assumed.
    $doc =~ s/(?:\A|\n)notice:.*?\n\n/\n/si;

    $doc =~ s/\A\s+//;
    $doc =~ s/\s+\z//;

    $doc = _mark_up_heading($doc);
    $doc = _mark_up_people($doc);
    $doc = _mark_up_body($doc);
    $doc = _link_addresses($doc);

    # Determine the article title (without any markup added above).
    my $title = '';
    if ($doc =~ m{<h2>(.*?)</h2>}s) {
        $title = $1;
        $title =~ s/<[^>]*>//g;
        $title =~ s/\s+/ /g;
        $title =~ s/\A\s|\s\z//g;
    }

    return ($title, $doc);
}

# Build the complete HTML page around a formatted article.
# NOTE(review): the page skeleton is reconstructed; only the footer text
# and the positions of $title and $article survived in the source.
sub render_page {
    my ($title, $body) = @_;

    my $footer = 'To Texas A&amp;M Agriculture News Home Page';
    $footer = qq{<a href="$HOME_PAGE_URL">$footer</a>}
        if $HOME_PAGE_URL ne '';

    return join '',
        "<html>\n",
        " <head>\n",
        "  <title>$title</title>\n",
        " </head>\n",
        " <body>\n",
        "  $body\n",
        "<hr>\n",
        "<p>\n",
        "  $footer\n",
        " </body>\n",
        "</html>";
}

# Read the article from standard in (or the files named on the command
# line) and print the page to standard out.  Skipped when this file is
# loaded with require/do so the subs above can be reused.
unless (caller) {
    my $raw = join '', <>;
    print render_page(format_article($raw));
}