Impressum convert.pl
Sprache: Shell
|
|
#!/usr/bin/perl -w
#
# Script to convert GAP manual TeX files to HTML
# Usage:
# convert.pl [-csti] [-f <frontpage>] [-n <pkgname>] <doc-dir> [<html-dir>]
#
# Requirements: Perl (might need to edit the first line of this file)
# TtH is not strictly necessary but very desirable to treat
# formulas.
#
# Caveats:
#
# 1. This script assumes that the .toc, .lab and .bbl files are up-to-date
# with the .tex files and will almost certainly fail horribly if they
# are not.
#
# 2. The output files are CxxxSxxx.htm, (not .html) plus chapters.htm,
# theindex.htm and biblio.htm, except when called with the -c option
# (in which case, there are CHAPxxx.htm files instead of CxxxSxxx.htm).
# A (front page) file index.htm is assumed, but not created.
# Not all servers will serve .htm files as HTML without adjustments.
#
# 3. The script assumes that the .tex files comply with GAP conventions,
# including unwritten ones. It tries to follow the behaviour of TeX
# assuming those conventions. The on-line browser attempts to provide
# an ASCII equivalent. See BUGS.
#
# 4. The hierarchy of the HTML manuals assumed is of the following form:
#
# <GAPDIR>/
# doc/
# <main>
# pkg/
# <pkg>/
# htm
#
# for each main manual <main> (in: ref, tut) and each
# package <pkg>. To make inter-linking between manuals work,
# one should generally use the -c option for everything, (or not use
# it for everything). Linking to package manuals from the main
# manual can only be expected to work if the package manuals
# are created using this converter.
#
# 5. Only the manual.lab files for books that are referenced via the
# \UseReferences and \UseGapDocReferences commands in the manual.tex
# file of the book being converted (and the book's own manual.lab
# file, of course) are read. Make sure all the \UseReferences and
# \UseGapDocReferences commands needed are present! (The TeX-produced
# manuals will be missing lots of cross-references also, if some are
# missing.) You will get `Bad link' messages if you have some missing.
#
# Options:
#
# -c file-per-chapter mode: Generates one HTML file CHAPxxx.htm
# for each chapter; sections are level 2 headings and anchors
# CHAPxxx.htm#SECTxxx.
# This is intended for local browsing, especially under MS-DOS.
# It may be used with the -n (package) option.
#
# -f <frontpage>
# Adds a "Top" link to link <frontpage> to each manual page,
# only available if -n option is also used.
#
# -s silent running: Conversational messages are suppressed.
#
# -n <pkgname>
# We are not building the main manual but the one for the
# package <pkgname>. To get cross references to the main library
# right, it assumes that the package is in the right place.
# The -c option may be used with this option.
#
# -i index: Only one index file is produced.
#
# -t tex-math: Runs `tth' (which must be installed on the local system)
# to produce better HTML code for formulae. (It would be possible to
# replace tth by another conversion, for example TeXexplorer, but
# (at least) the line calling `tth' would need to be modified.)
# -u Like -t, but uses `tth -u2' to produce unicode.
#
# <doc-dir> The directory where all the needed .tex, .toc, .lab and .bbl
# files are located.
#
# <html-dir> The directory (which should already exist) in which to put
# the generated .htm files. Defaults to the current directory,
# if omitted.
#
# Example usage:
# convert.pl -n mypkg doc htm # in directory .../pkg/mypkg
# convert.pl -t -n mypkg doc htm # ditto previous + use tth for maths
# convert.pl -t -n mypkg -c doc htm # ditto previous + 1 file per chapter
# convert.pl -t -c ../ref ref # (for Ref manual) in dir .../doc/htm
#
# FEATURES (and intended departures from the TeX behaviour)
# . Now interprets 2nd argument of an \atindex command if it is
# of form @... and ignores the first argument, or otherwise it
# interprets the first argument. Interprets ! as a comma and
# indices output have no sub-headers.
# . The @... component of \> commands is ignored. The assumption
# is that for: \>`...'{...}@{...} the @{...} component is just
# the {...} with font changes.
# . In a \beginitems ... \enditems environment everything is indented
# except for the item headers, rather than just the paragraph
# following the item header.
# . By default, the \beginlist ... \endlist environment is interpreted
# as a compact description list. By adding %unordered or %ordered...
# markup it will be interpreted as either an unordered or ordered
# list respectively (see gapmacro documentation for details).
# . There are spacing differences e.g. \begintt ... \endtt etc.
# environments are not indented.
# . Supports all accents of TeX, in probably the best way currently
# possible with HTML.
# . Treats PseudoInput chapters in the `same' way as Input chapters.
# . With -t switch announces the version of TtH used.
# . Now supports %display{nontex}, %display{nontext} and
# %display{nonhtml} variants of %display environment.
# . References to subsections are now interpreted as one would expect.
#
# BUGS (and known departures from the TeX behaviour)
# . $a.b$ is only interpreted correctly in -t mode.
# . The citation keys that appear are the .bib file keys rather
# than the keys BibTeX constructs with the `alpha' bib-style.
#
# TODO
# . Refine macro_replace subroutine so it can also be used to purge
# 2nd arg of \atindex macros.
# . For -t mode, scan for \def commands in manual.tex and write
# to TTHIN (tthmacros.tex). Should we only look for a block
# demarcated by %mathsmacros ... %endmathsmacros ?
# These \def commands are only intended for such font
# changing commands as: \def\B{{\cal B}} (`tth' provides a
# script-type font).
# . Provide a table environment, if/when a \begintable ...
# \endtable environment is added to gapmacro.tex.
#
#############################################################################
# Check PERL version
#
# $] is the version of the running interpreter as a decimal number; the
# script refuses to start unless that number is greater than 5.
#
$] > 5 or die "Needs perl 5";
# Getopt::Std provides getopts(), which sets the $opt_* option variables.
use Getopt::Std;
#
# Global variables
#
# $dir -- the full pathname of the input directory, including a trailing /
# $odir -- the full pathname of the output directory, including a trailing /
# $opt_c and $opt_s set by getopts()
# @chapters -- the chapters data structure
# IN -- the current input file (outputfiles are handled by select)
# $footer -- the trailer put on every page
# $indexcount -- used within chapters to number the index anchors
# $lastnumchap -- number of last numerically numbered chapter
#
# These match chapter and section lines in a .toc file.
#
# $chapexp captures the chapter number (digits, or one capital letter) as $1
# and the chapter title as $2. $secexp captures the chapter number, the
# section number and the section title as $1, $2 and $3. The final {digits}
# group of either line is matched but not captured (presumably the page
# number -- TODO confirm against the .toc format).
#
$chapexp = '\\\\chapcontents\s+\{((?:\d+|[A-Z]))\}\s*\{(.+)\}\s*\{\d+\}';
$secexp = '\\\\seccontents\s+\{((?:\d+|[A-Z]))\.(\d+)\}\s*\{(.+)\}\s*\{\d+\}';
#$ignoreexp = '\\\\tocstrut|\\\\appno|\\\\seccontents\s+\{\d+\}';
# Number of the last numerically numbered chapter. tonum() adds the position
# of a chapter letter (A, B, ...) to this value, so lettered chapters are
# stored after the numbered ones.
$lastnumchap = 0;
# Variable that is set to 2 inside a nest of \itemitem s of a
# \beginlist ... \endlist environment
#
$listdepth = 0;
# This is augmented each time a line: \Package{...} is read in a manual.tex
# file, so that macro_replace knows to set a {\...} macro in sans-serif.
#
$sharepkg = "";
# The books converted to HTML with this converter
# The values set are: 0 or 1 according to whether or not -c was used.
#
%convertbooks = ();
# This is added to when scanning GAPDoc manuals: getlabs() stores an entry
# for every \GAPDocLabFile{...} line it finds in a manual.lab file.
#
%gapdocbooks = ();
# Types of href label are:
# 0 (non -c books) : C<MMM>S<NNN>.htm
# 1 (-c books) : CHAP<MMM>.htm#SECT<NNN>
# 2 (== $gapdoc) : chap<M>.html#<gapdoc-id>
#
# It would be nice to support subsections properly like GapDoc,
# but this involves creating a subsection data-structure modelled
# on section, which is a mite non-trivial (maybe ... if I find time).
# For now in-text references go to the beginning of the chapter.
#
# BH: it might be easier to use tags based on the name of the function
$gapdoc = 2;
# sansserif:
#
# Used mainly to set GAP in sans serif font. Inside <title> ... </title>
# there should *not* be any tags, since they are not translated there by
# web browsers, and hence sansserif should *not* be applied to anything
# that ends up in the <title> ... </title> field, but *is* quite appropriate
# for the header in the <h1> ... </h1> field at the top of the body of an
# HTML file and anywhere else within the body of an HTML file.
#
sub sansserif {
    # Returns $name wrapped in a <font> element that selects a sans-serif
    # face. The result contains markup, so it is only suitable for the body
    # of a page and must not be placed inside <title> ... </title>.
    my ($name) = @_;
    # The element has to be complete: without the opening <font face="
    # and the closing </font> the browser would show the face list and a
    # stray quote as literal text.
    return "<font face=\"Gill Sans,Helvetica,Arial\">$name</font>";
}
# booktitle_body:
#
# This is for generating the title of a document that goes in the
# <h1> ... </h1> field at the top of the body, as opposed to the title
# that goes in the <title> ... </title> field which should be unembellished.
#
sub booktitle_body {
    # Returns $bktitle with the first occurrence of each name listed in
    # @prog_or_pkg replaced by that name set in sans serif (see sansserif).
    # Names are processed in the order given.
    my ($bktitle, @prog_or_pkg) = @_;
    foreach my $name (@prog_or_pkg) {
        # An empty pattern would make s/// reuse the last successful
        # regex, so there is nothing sensible to do for an empty name.
        next unless defined $name && length $name;
        my $styled = sansserif($name);
        # \Q...\E: the name is literal text, not a pattern, so characters
        # such as + . ( ) in a package name must not act as regex syntax.
        $bktitle =~ s/\Q$name\E/$styled/;
    }
    return $bktitle;
}
#
# used to standardize section names for use as hash indices.
#
sub canonize {
    # Builds the hash key for a section name: the letters A-Z are folded
    # to lower case, then every whitespace character and every backslash
    # is removed. The caller's string is not modified.
    my ($label) = @_;
    for ($label) {
        tr/A-Z/a-z/;
        s/[\s\\]+//g;
    }
    return $label;
}
# kanonize:
#
# Returns a copy of the argument in which every `\ ' (backslash followed by
# a space) has become a single space and, after that, every `!' has become
# a space.
#
sub kanonize {
    my ($text) = @_;
    $text =~ s/\\ / /g;
    $text =~ tr/!/ /;
    return $text;
}
# def_section_by_name:
#
# Records in %sections_by_name that the label $sec refers to chapter
# $chapno, section $secno, subsection $ssecno, together with $name.
# The hash key is the canonized label (see canonize).
#
# Defining a label again with the same three numbers is a no-op. Defining
# it with different numbers prints a warning on STDERR, flags the key in
# %redefined_secname and overwrites the earlier entry.
#
sub def_section_by_name {
    my ($sec, $chapno, $secno, $ssecno, $name) = @_;
    # Key on the label that was passed in. Reading $1 here instead would
    # pick up whatever the caller's last successful match left behind,
    # which is the label only when the caller happens to pass $1 itself.
    my $secname = canonize($sec);
    my $known = $sections_by_name{$secname};
    if (defined $known) {
        # Same place as before: nothing to record.
        return if $known->{chapnum} eq $chapno
               && $known->{secnum}  eq $secno
               && $known->{ssecnum} eq $ssecno;
        print STDERR "Section: \"$secname\" already defined as: ",
                     "$known->{chapnum}.",
                     "$known->{secnum}.",
                     "$known->{ssecnum}\n";
        print STDERR "Now being redefined as: $chapno.$secno.$ssecno\n";
        $redefined_secname{$secname} = 1;
    }
    $sections_by_name{$secname}
        = {chapnum => $chapno,
           secnum  => $secno,
           ssecnum => $ssecno,
           name    => $name};
    # print STDERR "Defined section \"$secname\": $chapno.$secno.$ssecno $name\n";
}
sub tonum { # Needed since chanu may be A,B,... for appendices
    # Maps a chapter designator to a number: a designator containing a
    # digit is returned as it is, anything else is taken to be a letter
    # and is numbered after $lastnumchap (A -> $lastnumchap + 1, ...).
    my ($chanu) = @_;
    if ($chanu =~ /\d+/) {
        return $chanu;
    }
    my $letter_offset = ord($chanu) - ord('A');
    return $lastnumchap + $letter_offset + 1;
}
# getchaps:
#
# Scan the .tex and .toc files to get chapter names and numbers,
# section names and numbers and associated filenames.
# Loads up chapters and sections_by_name.
#
sub getchaps {
# Fills @chapters from ${dir}manual.toc and ${dir}manual.tex.
# Dies if either file cannot be opened, or if the .toc file repeats a
# chapter or section number or lists a section of an unknown chapter.
#
# Pass 1: manual.toc supplies chapter and section names and numbers.
open( TOC, "<${dir}manual.toc" )
|| die "Can't open ${dir}manual.toc.\n You can " .
"create the .toc file by doing: tex manual (at least once).\n";
my ($chap,$sec,$chapno,$chap_as_sec,$chapnam,$chanu);
while (<TOC>) {
if ( /$chapexp/o ) {
# Chapter line: $1 is the chapter number or letter, $2 the title.
$chapnam = $2;
$chanu = $1;
# Only numeric chapters advance $lastnumchap; tonum() uses it to place
# lettered chapters after the numbered ones.
$lastnumchap = $chanu if ( $chanu =~ /\d+/ );
# remove `(preliminary)' part that messes everything up
$chapnam =~ s/ \(preliminary\)//g;
$chap = {name => $chapnam,
number => $chanu};
# The chapter itself is also stored as its own section number 0.
$chap_as_sec = {name => $chapnam,
chapnum => $chanu,
secnum => 0,
chapter => $chap};
$chap->{sections}[0] = $chap_as_sec;
defined ($chapters[tonum $chanu]) && die "chapter number repeated";
$chapters[tonum $chanu] = $chap;
} elsif ( /$secexp/o ) {
# Section line: $1 is the chapter, $2 the section number, $3 the title.
# The chapter must already have been seen.
defined ($chapters[tonum $1])
|| die "section $2:$3 in unknown chapter $1";
defined ($chapters[tonum $1]{sections}[$2])
&& die "section number repeated";
$sec = {name => $3,
secnum => $2,
chapnum => $1,
chapter => $chapters[tonum $1]};
$chapters[tonum $1]{sections}[$2] = $sec;
# this would produce warnings from empty chapters. Thus ignore.
# } elsif ( $_ !~ /$ignoreexp/o ) {
# print STDERR "Bad line: $_";
}
}
close TOC;
# Pass 2: manual.tex supplies the file name of each chapter, in order.
open (TEX, "<${dir}manual.tex") || die "Can't open ${dir}manual.tex";
$chapno = 0;
while (<TEX>) {
# Matches \Input{file} or \PseudoInput{file}{..}{..} with no % earlier on
# the line. $1 is "" or "Pseudo", $2 the file name; $4 and $5 are the two
# further brace groups when they are present.
if ( /^[^%]*\\(|Pseudo)Input\{([^}]+)\}(\{([^}]+)\}\{([^}]+)\})?/ ) {
# A missing or unreadable chapter file is reported but is not fatal.
if (not -f "$dir$2.tex" or not -r "$dir$2.tex") {
print STDERR "Chapter file $2.tex does not exist in $dir\n";
}
if ($1 eq "") {
# \Input: the chapter record is expected to exist from pass 1; only
# its file name is added.
$chapters[++$chapno]{file} = $2;
} else {
# \PseudoInput: the chapter record is built here, named after the
# third brace group ($5), and numbered by its position in manual.tex.
$chapnam = $5;
$chanu = ++$chapno;
$lastnumchap = $chanu;
$chap = {name => $chapnam,
number => $chanu};
$chap_as_sec = {name => $chapnam,
chapnum => $chanu,
secnum => 0,
ssecnum => 0,
chapter => $chap};
# When the second and third brace groups differ, the chapter is also
# registered as a label and $4 is handed to add_to_index.
# NOTE(review): $book, add_to_index and htm_fname are not defined in
# this part of the file -- confirm they are set up before getchaps runs.
if ($4 ne $5) {
def_section_by_name("$book:$chapnam", $chanu, 0, 0, canonize $chapnam);
add_to_index(htm_fname($opt_c,$chanu,0, 0, ""),
$4, $chap_as_sec, 0);
}
$chap->{sections}[0] = $chap_as_sec;
defined($chapters[$chanu]) && die "chapter number repeated";
$chapters[$chanu] = $chap;
$chapters[$chanu]{file} = $2;
}
}
}
close TEX;
}
# getlabs:
#
# Reads ${bkdir}manual.lab ($bkdir must end in a /).
#   \setcitlab lines      are skipped (bibliography labels are not taken
#                         from this file),
#   \GAPDocLabFile{book}  marks book in %gapdocbooks,
#   \makelabel{..}{c.s.ss}[{name}]
#                         is passed on to def_section_by_name, with 0 for
#                         a missing section or subsection number and ""
#                         for a missing name,
#   anything else         is reported on STDERR and ignored.
#
# A manual.lab file that cannot be opened is reported on STDERR and is
# otherwise not an error: the sub returns without defining anything.
#
sub getlabs {
    my ($bkdir) = @_;
    my $labfile = "${bkdir}manual.lab";
    my $lab;
    unless (open($lab, '<', $labfile)) {
        # Report on STDERR (the selected output handle may be a page that
        # is being written) and do not try to read from the failed handle.
        print STDERR "Can't open $labfile: $!\n";
        return;
    }
    while (my $line = <$lab>) {
        if ($line =~ /\\setcitlab/) {
            next; # We don't get the bibliography labels from here
        } elsif ($line =~ /\\GAPDocLabFile\s*\{([^}]+)\}/) {
            $gapdocbooks{$1} = 1;
            print STDERR "GapDoc books: ",
                         join(", ", sort keys %gapdocbooks), "\n";
        } elsif ($line =~ /\\makelabel\s*\{([^}]+)\}\s*\{(\w+)(\.(\d+))?(\.(\d+))?\}\{([^}]+)\}/) {
            # Label with a name in a third brace group.
            def_section_by_name($1, $2, (defined($3) ? $4 : 0),
                                (defined($5) ? $6 : 0), $7);
        } elsif ($line =~ /\\makelabel\s*\{([^}]+)\}\s*\{(\w+)(\.(\d+))?(\.(\d+))?\}/) {
            # Label without a name.
            def_section_by_name($1, $2, (defined($3) ? $4 : 0),
                                (defined($5) ? $6 : 0), "");
        } else {
            chomp $line;
            print STDERR "Ignored line: $line\n... in ${bkdir}manual.lab\n";
        }
    }
    close $lab;
}
#
# Mainly diagnostic, prints the chapters data structure. Also
# checks that each section has the correct back reference to its
# chapter
#
sub printchaps {
    # Prints one line per defined chapter in the argument list and one
    # line per defined section of it, to the currently selected output
    # handle. A section whose {chapter} entry is not the chapter it is
    # listed under is followed by a " loop problem" line.
    my @chapters = @_;
    foreach my $chapter (@chapters) {
        defined($chapter) or next;
        print "Chapter $chapter->{number} $chapter->{name} $chapter->{file}\n";
        foreach my $section (@{$chapter->{sections}}) {
            defined($section) or next;
            print " Section $section->{chapnum}.$section->{secnum} $section->{name}\n";
            print " loop problem\n" unless $section->{chapter} eq $chapter;
        }
    }
}
# Printed at the bottom of every page.
$footer = "\n" . sansserif( "GAP 4 manual " . `date +"%B %Y"` ) .
" | |