#!/usr/bin/perl
# Agnews Poster: parser.pl
# This script takes an article in the agnmore style format and adds
# appropriate html tags (paragraphs, writer/contact markup, title,
# etc..).  The input file is taken from standard in and the output is
# printed to standard out.
#
# Author: James Dean Palmer
#
# Revision 1: 11-21-96
# Revision 2: 01-15-97
# Revision 3: 10-09-98
#   Huge code clean up.  Fixes "" bug.  Fixes & bug.
#   Can parse dates from any year.  HTML output is much prettier.
#   Generally more forgiving.
#
# NOTE(review): the copy of this file under review had been run through
# an HTML stripper: every tag literal (<h2>, <b>, <i>, <p>, <br>, <hr>,
# <center>, <a href=...>, the page skeleton) was missing and the entities
# were decoded.  The literals below were restored from the comments and
# from the closing tags that survived.  Confirm them against a pristine
# copy, in particular the home page URL and the "<!-- comment -->" line
# in render_page().

use strict;
use warnings;

# Return a copy of the writer/contact header with a <br> inserted after
# every newline, so each header line is rendered on its own line.
sub _break_header_lines {
    my ($header) = @_;
    $header =~ s/\n/\n<br>/g;
    return $header;
}

# Convert the plain text of one article into an HTML fragment.
#
# Takes:   the raw article text (assumed to contain no html tags).
# Returns: a three element list:
#            1. the marked-up article.  It starts with the date and does
#               NOT include the opening <h3>; render_page() supplies it.
#            2. the article title as plain text ('' when none is found).
#            3. whatever followed the -30- end mark, with white space
#               collapsed ('' when there is no -30-).
sub format_article {
    my ($text) = @_;
    $text = '' unless defined $text;

    my $marker  = '<!-- article -->';    # temporary "body starts here" mark
    my $comment = '';
    my $title   = '';

    # This script assumes the incoming article has no html tags in it.
    # Some ascii characters need to be converted to the html equivalent.
    # The ampersand has to go first or it would re-escape the others.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # Change a line of 5 or more dashes to an HR bar.  /m is required:
    # the whole article is in one string, so without it ^ and $ only
    # match at the very start and end of the article.
    $text =~ s/^-{5,}$/<hr>/mg;

    # Notices provide a way of adding comments for the mailing list
    # recipients.  A notice starts with "NOTICE:" at the beginning of a
    # line and is terminated by one blank line.  Only the first notice is
    # removed (one notice per article is assumed).  A notice on the very
    # first line of the file is accepted as well.
    $text =~ s/(^|\n)notice:.*?\n\n/$1/si;

    # The first thing in an article is the date, which should always end
    # with a 4 digit year.  Close the <h3> after the year and open the
    # <h2> for the title, which should come next.
    $text =~ s{(\d\d\d\d)\s*}{$1</h3>\n<h2>};

    # The title ends with either the writer or the contact.
    $text =~ s{(Writer|Contact)(:|s:|\(s\):)}{</h2>\n$1$2};

    # Make the Writer and Contact labels bold and start italics for the
    # list of people that follows each label.
    $text =~ s{(Writer)(:|s:|\(s\):)}{<b>$1$2</b> <i>};
    $text =~ s{(Contact)(:|s:|\(s\):)}{<b>$1$2</b> <i>};

    # The first blank line marks the start of the article body.
    $text =~ s{\n\n}{$marker};

    # End the italics at either the next bold label or the article mark.
    $text =~ s{(<i>.*?)(<b>)}{$1</i>$2}s;
    $text =~ s{(<i>.*?)(\Q$marker\E)}{$1</i>$2}s;

    # Make sure that italics end tags don't wrap to the next line.
    $text =~ s{\n</i>}{</i>\n}g;

    # Remove any white space between the title and the </h2> tag.
    $text =~ s{\s*(</h2>)}{$1};

    # Put a <br> on every line break inside the writer/contact header,
    # and replace the article mark with three newlines so the paragraph
    # rules below open the first paragraph there.
    $text =~ s{(<b>.*)\Q$marker\E}{ _break_header_lines($1) . "\n\n\n" }se;

    # Center -30- at the bottom of the page.  Opinion seems to vary on
    # how -30- is formatted so trap all cases and normalise them.
    # NOTE(review): this also matches "-30-" inside text such as
    # "10-30-98"; the first match in the article wins.
    $text =~ s{\s*-{1,3}\s{0,5}30\s{0,5}-{1,3}}{\n<center>-30-};

    # Everything after -30- is removed from the page and kept as the
    # comment.  Only read the capture when the substitution succeeded,
    # otherwise $1 would still hold a value from an earlier match.
    if ($text =~ s{-30-(.*)$}{-30-</center>\n}s) {
        $comment = $1;
        $comment =~ s/\s+/ /g;
        $comment =~ s/^ //;
        $comment =~ s/ $//;
    }

    # Scan the entire document and attempt to put paragraph tags where
    # appropriate by using these rules:
    # 1. Change newline + whitespace + newline to a paragraph
    $text =~ s{\n\s*\n+\s*}{\n\n<p>}g;
    # 2. Change newline + 2 whitespaces to a paragraph
    $text =~ s{\n\s\s}{\n<p>}g;

    # Improve the html spacing.
    $text =~ s/\n/\n /g;
    $text =~ s/\s\s<p>/<p>/g;
    $text =~ s/<p>\s*/<p>/g;

    # All the difficult formatting bits have been done.  Now we just
    # need to add some helpful html tags.

    # Check for URLs.  All schemes are handled in a single pass so that a
    # later scheme can never match inside an anchor that was just written.
    # A URL may not end in "." or ":", which keeps the full stop of the
    # surrounding sentence out of the link.
    $text =~ s{\b((?:https?|file|ftp|wais|gopher|telnet):[\w/.:+\-]*[\w/+\-]|news:[\w.]*\w)}
              {<a href="$1">$1</a>}g;

    # Check for something@something preceded by white space.  This is
    # usually pretty good at recognizing email addresses and discounting
    # other common @ usages.  The leading white space is put back so the
    # address does not run into the previous word, and the domain has to
    # end in a word character so a trailing full stop is left out.
    $text =~ s{(\s)([\w.\-]{2,20})\@([\w.\-]{1,31}\w)}
              {$1<a href="mailto:$2\@$3">$2\@$3</a>}g;

    # Determine the article title.  /s lets a title span several lines;
    # tags and line breaks are removed so it is safe inside <title>.
    if ($text =~ m{<h2>(.*?)</h2>}s) {
        $title = $1;
        $title =~ s/<[^>]*>//g;
        $title =~ s/\s+/ /g;
        $title =~ s/^ //;
        $title =~ s/ $//;
    }

    return ($text, $title, $comment);
}

# Wrap a formatted article in the complete html page.
#
# Takes:   the three values returned by format_article().
# Returns: the page as a single string.
sub render_page {
    my ($body, $title, $comment) = @_;
    $body    = '' unless defined $body;
    $title   = '' unless defined $title;
    $comment = '' unless defined $comment;

    # NOTE(review): the link target was lost from the reviewed copy of
    # this file; this URL is a best guess and must be confirmed.
    my $home_url = 'http://agnews.tamu.edu/';

    my @page = (
        "<html>\n",
        " <head>\n",
        "  <title>$title</title>\n",
        " </head>\n",
        " <body>\n",
        "<h3>$body\n",
    );

    # NOTE(review): emitting the post -30- text as an html comment is an
    # assumption; the original output statement for it was lost.
    if (length $comment) {
        $comment =~ s/-{2,}/-/g;    # "--" is not allowed inside a comment
        push @page, " <!-- $comment -->\n";
    }

    push @page,
        "<hr>\n",
        "<p>\n",
        " <a href=\"$home_url\">To Texas A&amp;M Agriculture News Home Page</a>\n",
        " </body>\n",
        "</html>\n";

    return join '', @page;
}

# Read the article from standard in (or the files named on the command
# line) and print the finished page to standard out.
sub main {
    my $raw = join '', <>;
    print render_page(format_article($raw));
    return;
}

# Only run when executed as a script, so the subs can be loaded on their own.
main() unless caller;