#!/usr/bin/perl

# arma@mit.edu
# Distributed under GPL for SEUL project

sub usage {
  print "$0: SEUL document parser.\n" .
       "Usage: $0 [-hq] [options] [-o outfile|-O outfile] infile\n" .
       "Usage: $0 [-hq] [options] [-o outfile|-O outfile] < infile\n\n" .
       "-h, --help\tHelp (this text)\n" .
       "-v, --verbose\tVerbose level. 9 is spammy, 0 is quiet (default), 6 is standard\n" .
       "-q, --quiet\tQuiet mode (suppress warnings)\n" .
       "-o, --output\tRedirect output to outfile, fail if outfile already exists\n" .
       "-O, --Output\tRedirect output to outfile, replace if outfile already exists\n" .
       "-i, --initial\tRead in initial script from scriptfile\n" .
       "-H, --Handler\tAdd handlerset parameter\n" .
       "-d, --doctype\tSet doctype variable\n";
  exit(0);
}

# set the libpaths
$default_lib = '/home/seul/lib/web/sdoc';
$default_init = 'init_script.pl';
$env_lib = $ENV{'SDOC_LIB'};
$env_init = $ENV{'SDOC_INIT'};

#require "getopts.pl";
#&Getopts("hv:qo:O:i:D:");

#parse command-line options ourselves. Currently, the first
# non-option we find ends option parsing.

while($ARGV[0] =~ /^-/) { # are we out of option args?
    $_ = shift;
    last if (/^--$/);
    s/^-//; # cut off the - at the beginning
checkopts:
    if(/^h/ || /^-help/) {
       &usage();
       # program exits
    } elsif(/^-quiet/) {
       $opt_q = 1;
    } elsif(/^q/) {
       $opt_q = 1;
       s/^q//;
       goto checkopts;
    } elsif(/^-verbose/) {
       $opt_v = shift;
    } elsif(/^v(.*)/) {
       $opt_v = ($1 ? $1 : shift);
    } elsif(/^-initial/) {
       $opt_i = shift;
       if (!Load_Initscript($opt_i)) {
          &Warn("Unable to find initscript '$opt_i', skipping...");
       }
    } elsif(/^i(.*)/) {
       $opt_i = ($1 ? $1 : shift);
       if (!Load_Initscript($opt_i)) {
          &Warn("Unable to find initscript '$opt_i', skipping...");
       }
    } elsif(/^-output/) {
       $opt_o = shift;
    } elsif(/^o(.*)/) {
       $opt_o = ($1 ? $1 : shift);
    } elsif(/^-Output/) {
       $opt_O = shift;
    } elsif(/^O(.*)/) {
       $opt_O = ($1 ? $1 : shift);
    } elsif(/^D(.*)/) {
       my $tmp = ($1 ? $1 : shift);
       if($tmp =~ /(.+)=(.+)/) { # correctly formed
          $defines{$1} = $2;          
       } else {
          &Warn("\nError: Malformed -D option, skipping.\n");
       }
    } elsif(/^H(.*)/) {
       push(@HandlerSet, ($1 ? $1 : shift));
    } elsif(/^-Handler/) {
       push(@HandlerSet, shift);
    } elsif(/^d(.*)/) {
       $doctype = ($1 ? $1 : shift);
    } elsif(/^-doctype/) {
       $doctype = shift;
    }
    else {
        print "Error: Unrecognized switch: $_\n\n";
        &usage();
    }
}

#print "opt_h is $opt_h\n";
#print "opt_o is $opt_o\n";
#print "opt_O is $opt_O\n";
#print "opt_q is $opt_q\n";
#print "opt_i is $opt_i\n";
#foreach $opt_d (keys %defines) {
#  print "defines{$opt_d} is $defines{$opt_d}\n";
#}

#exit(0);

if ($opt_o) {
  if ($opt_O) {
     &Warn("Both -o and -O specified, only one allowed. Exiting.");
     exit(1);
  }
  if (-e $opt_o) {
     &Warn("Non-forced redirection to existing file '$opt_o', exiting.");
     exit(1);  
  }
  &Debug(5, "Redirecting output to file $opt_o, replacing.");
  open(STDOUT, ">$opt_o") || die "Can't redirect stdout";
} elsif ($opt_O) {
  &Debug(5, "Redirecting output to file $opt_O, overwriting.");
  open(STDOUT, ">$opt_O") || die "Can't redirect stdout";
}

if(!$opt_i) { # load default initscripts if no $opt_i given
  Load_Initscript("$default_lib/$default_init");
  Load_Initscript("$env_lib/$default_init");
}

sub Load_Initscript { # loads the script specified as its arg. Returns 0 if failed.
  my($i) = @_;

  if (-f "$i") {
     do "$i";
     Debug(5,"Loaded initscript '$i'");
     return(1);
  } elsif (-f "$env_lib/$i") {
     do "$env_lib/$i";
     Debug(5,"Loaded initscript '$env_lib/$i'");
     return(1);
  } elsif (-f "$default_lib/$i") {
     do "$default_lib/$i";
     Debug(5,"Loaded initscript '$default_lib/$i'");
     return(1);
  }

  return(0);

}

{
  my($x);
  $x = $/;
  undef $/;
  $input = <>; # slurp in the whole file at once
  $/ = $x; # set it back to its default
}

#    &Load_Config("doc-types/$headerparams{'doc-type'}/Config");
#      # will create @fieldsrequired, @fieldsoptional, @tagsdefined, @tagsdefinedsingle

$endhtml = '/';
# $tagfinished = '\*'; # the symbol that says "don't parse this tag"
$Max_Passes = 10; # maximum number of passes the parser will make

for($Pass_Number = 0; $Pass_Number < $Max_Passes; $Pass_Number++) {
   Debug(6,"Initiating pass #$Pass_Number...");
   Debug(9,"Text is currently:\n********\n$input\n****");
   $Previous_num_succ = $Global_num_succ;
   $Previous_num_fail = $Global_num_fail;
   $Global_num_succ = 0;
   $Global_num_fail = 0;
   $input = &Parse($input);

   last if ($Global_num_fail == 0);

}

# $input = &Final_Parse($input); # clean out those i'm-done tag markers

Debug(5,"*********End text:*********");
print $input;

exit(0);

sub count_open_tags {
  my($tag, $text) = @_;
  Debug(9,"Performing countopentags on tag '$tag', text:\n****\n$text\n**");
#  if (/ \< \s* $tag (\s|\>)/isx) {
  if ($text =~ / \< \s* $tag [\s\>] /isx) {
     Debug(9,"Found a recursive opentag $tag!");
     return(1+&count_open_tags($tag, $'));
  } else {
     return(0);
  }
}

sub Final_Parse { # strips out asterisks
  my($text) = @_;
#  Debug("Final Parse called, text:\n");
#  $text =~ s|\<$tagfinished|\<|g;
#  $text =~ s|\<\*|\<|g;
#  Debug("Final Parse returning, text:\n");
  return($text);
}

sub Parse { # returns ($text)
  my ($text) = @_;
  my ($pretext) = ''; # no pun. really.
  my ($insidetext) = "";
  my ($tagpaired) = 0;
  my ($tagandparams);
  my ($tag, $params);
  my ($prelude, $postlude);

  # find a tag to parse

  while($text =~ / (.*?) \< (.*?) \> /sx) { # there's a tag in it
     $pretext .= $1;
     $tagandparams = $2;
     $text = $';

#     if ($tagandparams =~ /^$tagfinished/) { # not a parsable tag. Skip.
#     if ($tagandparams =~ /^\*/) { # not a parsable tag. Skip.
#        Debug("Finished tag '$tagandparams' found, skipped.");
#        $pretext .= "<$tagandparams>";
#        next;
#     }

     if ($tagandparams =~ / \s* (\S+) \s* /sx) {
        $tag = lc($1);
        $params = $'; # postmatch
     } else { # no text in tagandparams. It was a <> (maybe with whitespace)
        &Warn("No text inside tag, treating as normal text");
        $pretext .= "<$tagandparams>";
        next;
     }
  
     &Debug(7,"Parse found embedded tag '$tag', params '$params'.");
     
     if(!$HandlerSingle{$tag}) { # then it's either paired or default
           # look for a matching endtag
           my($preinsidetext) = '';
           $num_opened_tags = 1;
           while ($text =~ / (.*?) \< \s* $endhtml \s* $tag \s* \> /isx) {
               Debug(9,"close-tag $tag found.");
               # looks hopeful
               $num_opened_tags--;
               $insidetext = $1;
               $text = $';
               $num_opened_tags += &count_open_tags($tag, $insidetext);
               Debug(9,"Num Open Tags after Count is '$num_opened_tags'.");
               $preinsidetext .= $insidetext;

               if (!$num_opened_tags) {
                  $tagpaired = 1;
                  $insidetext = $preinsidetext;
                  Debug(8,"Insidetext '$insidetext', tagpaired matched!");
                  goto matched;
               }
               $preinsidetext .= "<$endhtml" . "$tag>";
           } # end while
           $text = $preinsidetext.$text; # back up,
           # fall through. No endtag.
      
           if (defined $HandlerCode{$tag}) {
                  # it's defined to be paired, no endtag. Oops.
                  die "No matched tag '$tag' for defined tag.";
           } else { # use default. It's ok if there's no inside.
#                  $insidetext = '';
#		  Warn("Default handler used on unpaired tag '$tag'");
                  $tagpaired = 0;
           }
      } else { # single
        $insidetext = '';
        $tagpaired = 0;
      }

matched: # label from above, finding matched endtag
      ($prelude, $insidetext, $postlude, $errval) =
             &handler($tag, $params, $insidetext, $tagpaired);

      if ($Handler_has_called_Parse) {
         undef $Handler_has_called_Parse;
         return($pretext . $insidetext . &Parse($text)); # recurse on endtext
      } else { # sort the error code
         if($errval == 0) {
           $Global_num_succ++;
           return($pretext . $insidetext . &Parse($text)); # recurse on endtext
         } elsif ($errval == 1) {
           $Global_num_fail++;
           return($pretext . $insidetext . &Parse($text)); # recurse on endtext
         } elsif ($errval == 2) {
           $Global_num_succ++;
           return($pretext . $prelude . &Parse($insidetext) . 
              $postlude . &Parse($text)); # recurse on endtext
         } elsif ($errval == 3) {
           $Global_num_fail++;
           return($pretext . $prelude . &Parse($insidetext) . 
              $postlude . &Parse($text)); # recurse on endtext
         }
      }
#      $insidetext = Parse($insidetext) unless $Handler_has_called_parse;

  }

  # no valid tag. Return the text.
  $text = $pretext . $text;
  &Debug(8,"No parseable tag in text, returning '$text'.");
  return ($text); 

}

#verbose takes an argument, from 0 to 9. The higher the argument,
#the more spam you want to see. Generally -v 6 is sufficient to give
#you a good idea of what's going on. 
sub Debug {
  my ($num, $text) = @_;

  print STDERR "$num:$text\n" if ($num<=$opt_v);

}

sub Warn {
  my ($text) = @_;

  print STDERR "Warning: $text\n" unless $opt_q;

}

sub handler {
  local ($tagname, $params, $text, $tagpaired) = @_;
  local $prelude = '';
  local $postlude = '';
  my($errval) = 1;

#  foreach $var (keys %params) {
#    &Debug("Handler: Param '$var' is val '$params{$var}'.");
#  }

  if (defined $HandlerCode{$tagname}) {
    $errval = eval "$HandlerCode{$tagname}";
    &Debug(7,"HANDLER '$tagname' finished, returning '$text', error $errval");
  } else {
    if(defined $HandlerDefault) {
      $errval = eval "$HandlerDefault";
      &Debug(7,"Default HANDLER '$tagname' finished, returning '$text', error $errval");
    } else {
      # No default handler function. Just do something useful.

      if ($HandlerSingle{$tagname} == $tagpaired) {
         Warn("tag $tagname is unpaired, but is supposed to be paired");
      }

      $prelude = "<$tagname";
      if ($params) {
          $prelude .= " $params";
      }
      $prelude .= ">";
      if($tagpaired) {
        $postlude = "</$tagname>";
        $errval = 2;
      } else {
	$text = $prelude;
        $errval = 0;
      }

      &Debug(7,"Inlined default HANDLER '$tagname' finished.");

    }
  }
  Warn "Handler for tag '$tagname' failed: $@" if $@;

#  $text = do "doc-types/$headerparams{'doc-type'}/$tagname";

  return($prelude, $text, $postlude, $errval);
  
}

sub Parse_Params {
  my ($text) = @_;
  my ($param, $value);
  my (%paramlist);

  while ($text =~ / \s* (.*?) \s* = \s* /sx) {
    $param = lc($1);
    ($value, $text) = &Parse_Value($'); # postmatch
    $paramlist{"$param"} = $value;
    &Debug(8,"Param '$param' = '$value'.");
  }

  if ($text) {
     &Warn("Extra characters '$text' in param list ignored.");
  }

return(%paramlist);

}

sub Parse_Value {
  my ($line) = @_;
  my ($value) = '';

  if ($line =~ / ^ \" (.*?) \" /xs) { # then it was a quoted value
     $value = $1;
     $line = $'; # postmatch     
     while ($value =~ s/\\$/\"/s) { # if it ends in a backslash, it wasn't a real "
           if ($line =~ / (.*?) \" /xs) {
              $value .= $1;
              $line = $'; # postmatch
           } else { # no remaining ". They're unmatched. Ack.
              &Warn("Unmatched quote in value, automatically closing...");
              last; # break out of the while loop
           }
     }
  } else { # not a quoted value. Grab til whitespace.
     if ($line =~ /^(\S+)/s) {
        $value = $1;
        $line = $'; # postmatch
     } else { # woah. No value?
        &Warn("Parameter with no value, assigning null...");
     }
  }

  return($value, $line);

}

sub Register_Single { # registers that this tag is unpaired, but does not
                      # associate code with it
  my ($tagname) = @_;

  $HandlerSingle{$tagname} = 1;

}

sub Register_Handler { # returns 0 on failure, nonzero on success
  my ($tagname, $filename, $single) = @_;
  my ($file, $retval);

  if ($filename =~ /^\//) {
    # we have an absolute path
    Debug(8,"loading absolute $filename");
    $file = "$filename";
  } else {
    if (-f "$env_lib/$filename") {
      Debug(8,"loading relative $env_lib/$filename");
      $file = "$env_lib/$filename";
      $retval = 2; # means got lib
    } elsif (-f "$default_lib/$filename") {
      Debug(8,"loading relative $default_lib/$filename");
      $file = "$default_lib/$filename";
      $retval = 1; # means got default
    } else {
      Warn("file $filename could not be found");
      return(0); # argh!!
    }
  }
#  $HandlerCode{$tagname} = `cat $file`;
  $HandlerCode{$tagname} = ReadFile($file);
  $HandlerSingle{$tagname} = $single;

  Debug(7,"handler $filename for tag $tagname loaded");

  return($retval); # success
}

sub ReadFile {
 # opens a file, and returns its content in a string
 # returns undef if it fails
  my($filename) = @_;
  my($mystring, $x);

# return(`cat $filename`);

  open(TMPFILE, "$filename") || return(undef);
  $x = $/;
  undef $/;
  $mystring = <TMPFILE>;
  $/ = $x; # set it back
  close(TMPFILE);

  return($mystring);

}