#!/usr/bin/perl # arma@mit.edu # Distributed under GPL for SEUL project sub usage { print "$0: SEUL document parser.\n" . "Usage: $0 [-hq] [options] [-o outfile|-O outfile] infile\n" . "Usage: $0 [-hq] [options] [-o outfile|-O outfile] < infile\n\n" . "-h, --help\tHelp (this text)\n" . "-v, --verbose\tVerbose level. 9 is spammy, 0 is quiet (default), 6 is standard\n" . "-q, --quiet\tQuiet mode (suppress warnings)\n" . "-o, --output\tRedirect output to outfile, fail if outfile already exists\n" . "-O, --Output\tRedirect output to outfile, replace if outfile already exists\n" . "-i, --initial\tRead in initial script from scriptfile\n" . "-H, --Handler\tAdd handlerset parameter\n" . "-d, --doctype\tSet doctype variable\n"; exit(0); } # set the libpaths $default_lib = '/home/seul/lib/web/sdoc'; $default_init = 'init_script.pl'; $env_lib = $ENV{'SDOC_LIB'}; $env_init = $ENV{'SDOC_INIT'}; #require "getopts.pl"; #&Getopts("hv:qo:O:i:D:"); #parse command-line options ourselves. Currently, the first # non-option we find ends option parsing. while($ARGV[0] =~ /^-/) { # are we out of option args? $_ = shift; last if (/^--$/); s/^-//; # cut off the - at the beginning checkopts: if(/^h/ || /^-help/) { &usage(); # program exits } elsif(/^-quiet/) { $opt_q = 1; } elsif(/^q/) { $opt_q = 1; s/^q//; goto checkopts; } elsif(/^-verbose/) { $opt_v = shift; } elsif(/^v(.*)/) { $opt_v = ($1 ? $1 : shift); } elsif(/^-initial/) { $opt_i = shift; if (!Load_Initscript($opt_i)) { &Warn("Unable to find initscript '$opt_i', skipping..."); } } elsif(/^i(.*)/) { $opt_i = ($1 ? $1 : shift); if (!Load_Initscript($opt_i)) { &Warn("Unable to find initscript '$opt_i', skipping..."); } } elsif(/^-output/) { $opt_o = shift; } elsif(/^o(.*)/) { $opt_o = ($1 ? $1 : shift); } elsif(/^-Output/) { $opt_O = shift; } elsif(/^O(.*)/) { $opt_O = ($1 ? $1 : shift); } elsif(/^D(.*)/) { my $tmp = ($1 ? $1 : shift); if($tmp =~ /(.+)=(.+)/) { # correctly formed $defines{$1} = $2; } else { &Warn("\nError: Malformed -D option, skipping.\n"); } } elsif(/^H(.*)/) { push(@HandlerSet, ($1 ? $1 : shift)); } elsif(/^-Handler/) { push(@HandlerSet, shift); } elsif(/^d(.*)/) { $doctype = ($1 ? $1 : shift); } elsif(/^-doctype/) { $doctype = shift; } else { print "Error: Unrecognized switch: $_\n\n"; &usage(); } } #print "opt_h is $opt_h\n"; #print "opt_o is $opt_o\n"; #print "opt_O is $opt_O\n"; #print "opt_q is $opt_q\n"; #print "opt_i is $opt_i\n"; #foreach $opt_d (keys %defines) { # print "defines{$opt_d} is $defines{$opt_d}\n"; #} #exit(0); if ($opt_o) { if ($opt_O) { &Warn("Both -o and -O specified, only one allowed. Exiting."); exit(1); } if (-e $opt_o) { &Warn("Non-forced redirection to existing file '$opt_o', exiting."); exit(1); } &Debug(5, "Redirecting output to file $opt_o, replacing."); open(STDOUT, ">$opt_o") || die "Can't redirect stdout"; } elsif ($opt_O) { &Debug(5, "Redirecting output to file $opt_O, overwriting."); open(STDOUT, ">$opt_O") || die "Can't redirect stdout"; } if(!$opt_i) { # load default initscripts if no $opt_i given Load_Initscript("$default_lib/$default_init"); Load_Initscript("$env_lib/$default_init"); } sub Load_Initscript { # loads the script specified as its arg. Returns 0 if failed. my($i) = @_; if (-f "$i") { do "$i"; Debug(5,"Loaded initscript '$i'"); return(1); } elsif (-f "$env_lib/$i") { do "$env_lib/$i"; Debug(5,"Loaded initscript '$env_lib/$i'"); return(1); } elsif (-f "$default_lib/$i") { do "$default_lib/$i"; Debug(5,"Loaded initscript '$default_lib/$i'"); return(1); } return(0); } { my($x); $x = $/; undef $/; $input = <>; # slurp in the whole file at once $/ = $x; # set it back to its default } # &Load_Config("doc-types/$headerparams{'doc-type'}/Config"); # # will create @fieldsrequired, @fieldsoptional, @tagsdefined, @tagsdefinedsingle $endhtml = '/'; # $tagfinished = '\*'; # the symbol that says "don't parse this tag" $Max_Passes = 10; # maximum number of passes the parser will make for($Pass_Number = 0; $Pass_Number < $Max_Passes; $Pass_Number++) { Debug(6,"Initiating pass #$Pass_Number..."); Debug(9,"Text is currently:\n********\n$input\n****"); $Previous_num_succ = $Global_num_succ; $Previous_num_fail = $Global_num_fail; $Global_num_succ = 0; $Global_num_fail = 0; $input = &Parse($input); last if ($Global_num_fail == 0); } # $input = &Final_Parse($input); # clean out those i'm-done tag markers Debug(5,"*********End text:*********"); print $input; exit(0); sub count_open_tags { my($tag, $text) = @_; Debug(9,"Performing countopentags on tag '$tag', text:\n****\n$text\n**"); # if (/ \< \s* $tag (\s|\>)/isx) { if ($text =~ / \< \s* $tag [\s\>] /isx) { Debug(9,"Found a recursive opentag $tag!"); return(1+&count_open_tags($tag, $')); } else { return(0); } } sub Final_Parse { # strips out asterisks my($text) = @_; # Debug("Final Parse called, text:\n"); # $text =~ s|\<$tagfinished|\<|g; # $text =~ s|\<\*|\<|g; # Debug("Final Parse returning, text:\n"); return($text); } sub Parse { # returns ($text) my ($text) = @_; my ($pretext) = ''; # no pun. really. my ($insidetext) = ""; my ($tagpaired) = 0; my ($tagandparams); my ($tag, $params); my ($prelude, $postlude); # find a tag to parse while($text =~ / (.*?) \< (.*?) \> /sx) { # there's a tag in it $pretext .= $1; $tagandparams = $2; $text = $'; # if ($tagandparams =~ /^$tagfinished/) { # not a parsable tag. Skip. # if ($tagandparams =~ /^\*/) { # not a parsable tag. Skip. # Debug("Finished tag '$tagandparams' found, skipped."); # $pretext .= "<$tagandparams>"; # next; # } if ($tagandparams =~ / \s* (\S+) \s* /sx) { $tag = lc($1); $params = $'; # postmatch } else { # no text in tagandparams. It was a <> (maybe with whitespace) &Warn("No text inside tag, treating as normal text"); $pretext .= "<$tagandparams>"; next; } &Debug(7,"Parse found embedded tag '$tag', params '$params'."); if(!$HandlerSingle{$tag}) { # then it's either paired or default # look for a matching endtag my($preinsidetext) = ''; $num_opened_tags = 1; while ($text =~ / (.*?) \< \s* $endhtml \s* $tag \s* \> /isx) { Debug(9,"close-tag $tag found."); # looks hopeful $num_opened_tags--; $insidetext = $1; $text = $'; $num_opened_tags += &count_open_tags($tag, $insidetext); Debug(9,"Num Open Tags after Count is '$num_opened_tags'."); $preinsidetext .= $insidetext; if (!$num_opened_tags) { $tagpaired = 1; $insidetext = $preinsidetext; Debug(8,"Insidetext '$insidetext', tagpaired matched!"); goto matched; } $preinsidetext .= "<$endhtml" . "$tag>"; } # end while $text = $preinsidetext.$text; # back up, # fall through. No endtag. if (defined $HandlerCode{$tag}) { # it's defined to be paired, no endtag. Oops. die "No matched tag '$tag' for defined tag."; } else { # use default. It's ok if there's no inside. # $insidetext = ''; # Warn("Default handler used on unpaired tag '$tag'"); $tagpaired = 0; } } else { # single $insidetext = ''; $tagpaired = 0; } matched: # label from above, finding matched endtag ($prelude, $insidetext, $postlude, $errval) = &handler($tag, $params, $insidetext, $tagpaired); if ($Handler_has_called_Parse) { undef $Handler_has_called_Parse; return($pretext . $insidetext . &Parse($text)); # recurse on endtext } else { # sort the error code if($errval == 0) { $Global_num_succ++; return($pretext . $insidetext . &Parse($text)); # recurse on endtext } elsif ($errval == 1) { $Global_num_fail++; return($pretext . $insidetext . &Parse($text)); # recurse on endtext } elsif ($errval == 2) { $Global_num_succ++; return($pretext . $prelude . &Parse($insidetext) . $postlude . &Parse($text)); # recurse on endtext } elsif ($errval == 3) { $Global_num_fail++; return($pretext . $prelude . &Parse($insidetext) . $postlude . &Parse($text)); # recurse on endtext } } # $insidetext = Parse($insidetext) unless $Handler_has_called_parse; } # no valid tag. Return the text. $text = $pretext . $text; &Debug(8,"No parseable tag in text, returning '$text'."); return ($text); } #verbose takes an argument, from 0 to 9. The higher the argument, #the more spam you want to see. Generally -v 6 is sufficient to give #you a good idea of what's going on. sub Debug { my ($num, $text) = @_; print STDERR "$num:$text\n" if ($num<=$opt_v); } sub Warn { my ($text) = @_; print STDERR "Warning: $text\n" unless $opt_q; } sub handler { local ($tagname, $params, $text, $tagpaired) = @_; local $prelude = ''; local $postlude = ''; my($errval) = 1; # foreach $var (keys %params) { # &Debug("Handler: Param '$var' is val '$params{$var}'."); # } if (defined $HandlerCode{$tagname}) { $errval = eval "$HandlerCode{$tagname}"; &Debug(7,"HANDLER '$tagname' finished, returning '$text', error $errval"); } else { if(defined $HandlerDefault) { $errval = eval "$HandlerDefault"; &Debug(7,"Default HANDLER '$tagname' finished, returning '$text', error $errval"); } else { # No default handler function. Just do something useful. if ($HandlerSingle{$tagname} == $tagpaired) { Warn("tag $tagname is unpaired, but is supposed to be paired"); } $prelude = "<$tagname"; if ($params) { $prelude .= " $params"; } $prelude .= ">"; if($tagpaired) { $postlude = ""; $errval = 2; } else { $text = $prelude; $errval = 0; } &Debug(7,"Inlined default HANDLER '$tagname' finished."); } } Warn "Handler for tag '$tagname' failed: $@" if $@; # $text = do "doc-types/$headerparams{'doc-type'}/$tagname"; return($prelude, $text, $postlude, $errval); } sub Parse_Params { my ($text) = @_; my ($param, $value); my (%paramlist); while ($text =~ / \s* (.*?) \s* = \s* /sx) { $param = lc($1); ($value, $text) = &Parse_Value($'); # postmatch $paramlist{"$param"} = $value; &Debug(8,"Param '$param' = '$value'."); } if ($text) { &Warn("Extra characters '$text' in param list ignored."); } return(%paramlist); } sub Parse_Value { my ($line) = @_; my ($value) = ''; if ($line =~ / ^ \" (.*?) \" /xs) { # then it was a quoted value $value = $1; $line = $'; # postmatch while ($value =~ s/\\$/\"/s) { # if it ends in a backslash, it wasn't a real " if ($line =~ / (.*?) \" /xs) { $value .= $1; $line = $'; # postmatch } else { # no remaining ". They're unmatched. Ack. &Warn("Unmatched quote in value, automatically closing..."); last; # break out of the while loop } } } else { # not a quoted value. Grab til whitespace. if ($line =~ /^(\S+)/s) { $value = $1; $line = $'; # postmatch } else { # woah. No value? &Warn("Parameter with no value, assigning null..."); } } return($value, $line); } sub Register_Single { # registers that this tag is unpaired, but does not # associate code with it my ($tagname) = @_; $HandlerSingle{$tagname} = 1; } sub Register_Handler { # returns 0 on failure, nonzero on success my ($tagname, $filename, $single) = @_; my ($file, $retval); if ($filename =~ /^\//) { # we have an absolute path Debug(8,"loading absolute $filename"); $file = "$filename"; } else { if (-f "$env_lib/$filename") { Debug(8,"loading relative $env_lib/$filename"); $file = "$env_lib/$filename"; $retval = 2; # means got lib } elsif (-f "$default_lib/$filename") { Debug(8,"loading relative $default_lib/$filename"); $file = "$default_lib/$filename"; $retval = 1; # means got default } else { Warn("file $filename could not be found"); return(0); # argh!! } } # $HandlerCode{$tagname} = `cat $file`; $HandlerCode{$tagname} = ReadFile($file); $HandlerSingle{$tagname} = $single; Debug(7,"handler $filename for tag $tagname loaded"); return($retval); # success } sub ReadFile { # opens a file, and returns its content in a string # returns undef if it fails my($filename) = @_; my($mystring, $x); # return(`cat $filename`); open(TMPFILE, "$filename") || return(undef); $x = $/; undef $/; $mystring = ; $/ = $x; # set it back close(TMPFILE); return($mystring); }