#!/usr/bin/perl

# Convert an ej-formatted doc in to a man page
# Input: First argument or standard input
# Output: Standard output

# Tmp dir (not used; as far as I know)
$TMP = $ENV{'HOME'} . "/tmp";
# Make this just /tmp at your own risk.  You have been warned.

if(! -d $TMP ) {
    die "Fatal: Please create a directory entitled " . $TMP . "\n";
    }

# Read in the doc

while(<>){$doc .= $_}

$* = 1; # Match multiple lines

# Get rid of <!-- ... --> comments
$doc =~ s|<\!\-\-.*?\-\->||sg;

# Grab the header
if($doc =~ m|<head>(.*?)</head>|is) {
    $header = $1;
    }
else {
    die "Fatal: Document must have a heading section\n";
    }

# Make sure the header has 
# <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=XXX">
# Where XXX is any character set
if($header !~ 
m|meta\s+http\-equiv\=\"content\-type\"\s+content\=\"text\/html\;\s+charset=|i) 
  {
  print "Please have somthing like this:\n";
  print
    '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=iso-8859-1">';
  print "\n";
  die "Fatal: Header must declare charset\n";
  }

# OK, see if we have a DTWIDTH header in the ej document.  If so, use that
# value to determine how wide to make "dt" values in the formatted man page

if($header =~ m|<dtwidth>(.*?)</dtwidth>|is) {
    $width = $1;
    if($width =~ /\D/) {
        die "Fatal: DTWIDTH tag can only have a numeric argument.\n";
	}
    $DTROFF = ".TP $1";
    }
else {
    # The nroff to convert a DT tag in to
    $DTROFF = ".TP 4";
    }

# OK, the header looks kosher.  Start generating nroff

print '.\" Do *not* edit this file; it was automatically generated by ej2man';
print "\n";
print '.\" Look for a name.ej file with the same name as this filename';
print "\n";
print '.\"' . "\n";
print '.\" Process this file with the following on iso-8859-1 terminals:'."\n";
print '.\" nroff -man -Tlatin1 maradns.8' . "\n";
print '.\" On UTF-8 terminals:' . "\n";
print '.\" nroff -man -Tutf8 maradns.8 | tr \'\020\' \' \'' . "\n";
print '.\"' . "\n";
# Timestamp
$ts = localtime(time());
print '.\" Last updated ' . $ts . "\n";
print '.\"' . "\n";

# OK, see if we have a TH header in the ej document.  If so, add that to the
# man page.  If not, generate a generic TH

if($header =~ m|<th>(.*?)</th>|is) {
    print ".TH $1\n";
    }
else {
    print ".TH \n";
    }

print '.\" We don\'t want hyphenation (it\'s too ugly)' . "\n";
print '.\" We also disable justification when using nroff' . "\n";
print '.\" Due to the way the -mandoc macro works, this needs to be placed'
      . "\n";
print '.\" after the .TH heading' . "\n";
print ".hy 0\n";
print ".if n .na\n";
print '.\"' . "\n";

# Enough of header processing; let's get to the body of the document

# Grab the body
if($doc =~ m|<body>(.*?)</body>|is) {
    $body = $1;
    }
else {
    die "Fatal: Document must have a body section\n";
    }

$body = process_body($body,0);

print($body);

print "\n";

exit(0);

# And this processes the body (we do this way so we can recursively handle 
# those pesky PRE flags)
sub process_body {
   my($body,$inrecurse) = @_;
   my($hack,$rest,$filename);
   my(@parts);

   # The INCLUDE tag
   while($body =~ m|\<include\s+\"([^"]+)\"\s*\>|is) {
       $filename = $1;
       open(FILE,"< $filename") || die "Can not find file $filename\n";
       $hack = "";
       while(<FILE>) {$hack .= $_}
       close(FILE);
       #$hack = process_body($hack);
       $body =~ s|\<include\s+\"([^"]+)\"\s*\>|$hack|is;
       }

   # Get rid of any </?BLOCKQUOTE> tags in bulletted lists; the NROFF macros
   # can not handle nesting 
   if($inrecurse == 0) {
       @parts = split(m|</?ul>|i,$body);
       if($#parts > 0) {
           $body = "";
           for($hack = 0; $hack <= $#parts; $hack++) {
               if($hack % 2 == 0) { # If we are not in a bulleted list
	           $body .= $parts[$hack];
	           if($hack < $#parts) {
	              $body .= "\n<ul>\n";
		      }
                   }
               else {
	           $parts[$hack] =~ s|</?blockquote>||g;
	           $body .= $parts[$hack];
	           $body .= "\n</ul>\n";
	           }
               }
           }
       }

   # The PRE tag
   @parts = split(m|</?pre>|i,$body);
   if($#parts > 0) {
        $body = "";
        for($hack=0;$hack <= $#parts; $hack++) {
           if($hack %2 == 0) { # If we are not in a <pre> section
	       $body .= process_body($parts[$hack],1);
               }
           else {
	       $body .= "\n.nf";
	       $body .= $parts[$hack];
	       $body .= ".fi\n";
	       }
           }
       return($body);
       }

   # Backslashes need to be escaped in *roff source
   $body =~ s/\\/\\\\/g;
   # As do single quotes
   $body =~ s/\'/\\\'/g;

   # The H1 tag
   while($body =~ m|<h1>(.*?)</h1>|is) {
       $hack = $1;
       $hack =~ s/\s+/ /g;
       $hack =~ s/\"\'//g;
       $body =~ s|<h1>(.*?)</h1>|\n.SH "$hack"\n|is;
       }

   # The H2 tag
   while($body =~ m|<h2>(.*?)</h2>|is) {
       $hack = $1;
       $hack =~ s/\s+/ /g;
       $hack =~ s/\"\'//g;
       $body =~ s|<h2>(.*?)</h2>|\n.in -3\n\\fB$hack\\fR\n.PP\n|is;
       }

   # The A tag (and /A closer)
   $body =~ s|</?a[^>]+>||ig;
   $body =~ s|</?a>||ig;

   # The TT tag (and /TT closer)
   $body =~ s|</?tt>||ig;

   # The BLOCKQUOTE tag
   $body =~ s|<blockquote>|\n.RS 4\n|ig;
   $body =~ s|</blockquote>|\n.RE\n|ig;

   # The B tag
   while($body =~ m|<b>(.*?)</b>(\S+)?|is) {
      $hack = $1;
      $rest = $2;
      if($rest =~ /[<>]/) {
         die "ej2man can't handle a tag immediately after a B tag\nthe offending text is $rest\n";
	 }
      if($hack =~ m|<\?i>|) {
         die "No I tags are allowed inside B tags\n";
	 }
      $hack =~ s/\s+/ /g;
      $hack =~ s/\"\'//g;
      $rest =~ s/\"\'//g;
      if($rest) {
        $body =~ s|<b>(.*?)</b>\S+|\n.BR "$hack" "$rest"\n|is;
	}
      else {
        $body =~ s|<b>(.*?)</b>|\n.B "$hack"\n|is;
	}
      }

   # The I tag
   while($body =~ m|<i>(.*?)</i>(\S+)?|is) {
      $hack = $1;
      $rest = $2;
      if($rest =~ /[<>]/) {
         die "ej2man can't handle a tag immediately after a I tag\nthe offending text is $rest\n";
	 }
      if($hack =~ m|<\?b>|) {
         die "No B tags are allowed inside I tags\n";
	 }
      $hack =~ s/\s+/ /g;
      $hack =~ s/\"\'//g;
      $rest =~ s/\"\'//g;
      if($rest) {
        $body =~ s|<i>(.*?)</i>\S+|\n.IR "$hack" "$rest"\n|is;
	}
      else {
        $body =~ s|<i>(.*?)</i>|\n.I "$hack"\n|is;
	}
      }

   # Get rid of any multiple newlines
   $body =~ s/\n(\s*)\n/\n/sg;

   # The P tag
   $body =~ s/<p>\s*/\n\n/ig;

   # The UL and tags (just nuke them)
   $body =~ s/<[du]l>//ig;

   # The LI tag
   $body =~ s/<li>\n?/\n.TP 2\n*\n/ig;

   # The DT and DD tag
   while($body =~ /<dt>(.*?)<dd>\n?/si) {
       $hack = $1;
       $hack =~ s/\s+/ /g;
       $body =~ s/<dt>(.*?)<dd>\n?/\n$DTROFF\n$hack\n/si;
       }

   # The /UL and /DL tag (which we don't ignore)
   $body =~ s|</[du]l>|\n.PP\n|ig;

   # Get rid of leading space; this confuses nroff
   $body =~ s/\n[ \t]+/\n/sg;

   # Get rid of empty lines before a .TP or .PP flag; this never looks nice
   $body =~ s/\n+(\n\.[TP]P)/$1/sg;

   # Get rid of empty lines at the beginning of the segment which come
   # before a .TP; this covers the case of a </pre> before a <li> in the
   # EJ source
   $body =~ s/^\n+(\.[TP]P)/$1/s;

   # Same with empty lines before an .RE flag; this does not look nice
   $body =~ s/^\n+(\.RE)/$1/s;

   # Get rid of multiple empty lines together; this never looks nice
   # when formatted by Nroff
   $body =~ s/\n\n\n+/\n\n/sg;

   # Get rid of empty lines at the end of a segment after a .TP or .PP
   # flag to work around how <pre> tags are handled
   $body =~ s/(\n\.[TP]P)\s*$/$1/s;

   # Put a newline before the .RE flag; this looks nicer
   $body =~ s/(\n\.RE)/\n$1/sg;
   $body =~ s/^(\.RE)/\n$1/sg;

   # Get rid of empty lines after a .RE flag; this does not look nice either
   $body =~ s/(\n\.RE.*?\n)\n+/$1/sg;

   # Put a newline before the .in flag; this looks nicer
   $body =~ s/(\n\.in)/\n$1/sg;

   # Get rid of empty lines after a .TP or .PP flag; this never looks nice
   $body =~ s/(\n\.[TP]P.*?\n)\n+/$1/sg;

   # Get rid of multiple spaces; nroff (unlike EJ) honors them
   $body =~ s/[ \t]+/ /sg;

   # The TABLE tags (TABLE, TD, TR, /TABLE)
   $body =~ s|<table>|.ta +5 +7 +7|ig;
   $body =~ s|<td>|\t|ig;
   # We also process .br tags
   $body =~ s|<[tb]r>\n?|\n.br\n|ig;
   $body =~ s|</table>||ig;

   # Break long lines so the nroff source is more legible
   $body = fmt($body);

   $body;
   }

# This takes a string, and braks any lines longer than 75 columns; otherwise
# it performs no other formatting
# Input: The string to format
# Output: The formatted string

sub fmt {
   my($input) = @_;
   my($place,$lastspace,$column,$linebegin);
  
   $place = $lastspace = $column = $linebegin = 0;

   # Get rid of trailing white space, which confuses this algorithm
   $input =~ s/[ \t]+\n/\n/sg;

   # The core algorithm
   while($place < length($input)) {
       # If we hit a whitespace, remember that this is where the last
       # (previous) space character is
       if(substr($input,$place,1) =~ /[ \t]/) {
           $lastspace = $place;
	   }
       # If we hit the end of a line reset the counters which tell us when
       # to break a line
       if(substr($input,$place,1) =~ /\n/) {
           $column = -1;
	   $linebegin = $lastspace = $place + 1;
	   }
       # This adds the newline as needed.  Note that we do not break
       # lines which start with a .; this means the line has a man macro 
       # and breaking the line will change the formatting of the page
       if($column > 70 && $linebegin != $lastspace && 
          substr($input,$linebegin,1) !~ /\./) {
           substr($input,$lastspace,1,"\n");
	   $place = $lastspace;
	   $column = -1;
	   $linebegin = $lastspace = $place + 1;
	   }
       $column++;
       $place++;
       }

   $input;
   } 

