Ziele Untersuchung
mit Columbo Integrität von
Datenbanken Interaktion und
Portierbarkeit Ergonomie der
Schnittstellen

Angebot Produkte Projekt Beratung

Mittel Analytik Modellierung Sprachen Algebra Logik Hardware Denken Kreativität

Zusammenhänge Gesellschaft Wirtschaft Branche Firma


products/sources/formale Sprachen/GAP/pkg/gapdoc/lib/ (Algebra von RWTH Aachen Version 4.15.1^©) Datei vom 21.1.2024 mit Größe 39 kB

Quelle XMLParser.gi Sprache: unbekannt

#############################################################################
##
#W XMLParser.gi GAPDoc Frank Lübeck
##
##
#Y Copyright (C) 2000, Frank Lübeck, Lehrstuhl D für Mathematik,
#Y RWTH Aachen
##
## The files XMLParser.g{d,i} contain a non-validating XML parser and some
## utilities.
##

BindGlobal("EMPTYCONTENT", 0);
BindGlobal("XMLPARSERFLAGS", rec());
BindGlobal("NAMECHARS",
 ## here ':' is missing since it will probably become reserved
 ## for name space syntax in future XML
 Set(List(Concatenation([45,46], [48..57], [58], [65..90],
 [95], [97..122]), CHAR_INT))
 );

## two helper functions for parsing
# next successive characters not in delim (resp. enddelim) - default WHITESPACE
# arg: str, pos[, delim[, enddelim]]
BindGlobal("GetWord", function(arg)
 local str, pos, delim, enddelim, len, pos2;
 str := arg[1];
 pos := arg[2];
 if Length(arg)>2 then
 delim := arg[3];
 if Length(arg)>3 then
 enddelim := arg[4];
 else
 enddelim := delim;
 fi;
 else
 delim := WHITESPACE;
 enddelim := WHITESPACE;
 fi;

 len := Length(str);

 while pos <= len and str[pos] in delim do
 pos := pos + 1;
 od;
 pos2 := pos;
 while pos2 <= len and not str[pos2] in enddelim do
 pos2 := pos2 + 1;
 od;
 if pos2>len then
 return fail;
 else
 return [pos, pos2-1];
 fi;
end);

# first position after pos with character outside chars
BindGlobal("GetChars", function(str, pos, chars)
 local len;
 len := Length(str);
 while pos <= len and str[pos] in chars do
 pos := pos + 1;
 od;
 if pos > len then
 return fail;
 else
 return pos;
 fi;
end);

# returns for string, position: [line number, [begin..end position of
# line]] (range without the '\n')
BindGlobal("LineNumberStringPosition", function(str, pos)
 local p, nl, l;
 p := 0;
 l := 0;
 nl := 0;
 while p<>fail and p < pos do
 l := p;
 p := Position(str, '\n', p);
 nl := nl+1;
 od;
 if p=fail then
 p := Length(str)+1;
 fi;

 if pos = p then
 nl := nl - 1;
 fi;
 return [nl, [l+1..p-1]];
end);

# printing of error message for non-well formed XML document,
# also shows some text around position of error.
XMLPARSEORIGINS := false;
BindGlobal("ParseError", function(str, pos, comment)
 local Show, nl, ShowOrigin, r, badline, i, off;

 # for examination of error
 Show := function()
 Pager(rec(lines := str, start := nl[1]));
 end;
 ShowOrigin := function()
 if XMLPARSEORIGINS <> false then
 Pager(rec(lines := StringFile(r[1]), start := r[2]));
 else
 Show();
 fi;
 end;
 if InfoLevel(InfoXMLParser) > 0 then
 if XMLPARSERFLAGS.Encoding <> "UTF-8" then
 # we have an 8 bit encoding and must compute offset for original
 # position
 off := Number([1..pos], function(i) local c; c := INT_CHAR(str[i]);
 return c > 127 and c < 192; end);
 else
 off := 0;
 fi;
 # this is in UTF-8 document
 nl := LineNumberStringPosition(str, pos);
 if XMLPARSEORIGINS <> false then
 # need offset since in original encoding
 r := OriginalPositionDocument(XMLPARSEORIGINS, pos-off);
 fi;
 Print("XML Parse Error: Line ", nl[1]);
 Print(" Character ", pos-nl[2][1]+1, "\n");
 if XMLPARSEORIGINS <> false then
 Print("Original file: ", r[1], ", line number ", r[2],".\n");
 fi;
 badline := str{nl[2]};
 # to be perfect, consider current TERM encoding, ignore for now
 Print("-----------\n", badline, "\n");
 # this uses the same non-' ' whitespace to get the '^' at the right position
 for i in [1..pos-nl[2][1]] do
 if not badline[i] in WHITESPACE then
 badline[i] := ' ';
 fi;
 od;
 Print(badline{[1..pos-nl[2][1]]});
 Print("^", "\n-----------\n", comment, "\n!!! Type 'Show();' to watch the",
 " input string in pager - starting with\n line containing error !!!\n");
 if XMLPARSEORIGINS <> false then
 Print("Or 'ShowOrigin();' to look it up in its source file.\n");
 fi;
 fi;
 Error();
end);

## a container to collect named entities for the parser
BindGlobal("ENTITYDICT", rec());
## the default XML entities
BindGlobal("ENTITYDICT_default", rec(
lt := "<",
gt := ">",
amp := "&",
apos := "'",
quot := """) );
## the predefined entities of the GAPDoc package
## in LaTeX we use some saved boxes defined in our preamble for
## \, ~, ^, {, } because we think that the \texttt versions of these
## characters look better (than mathmode chars or accents without letter)
# (although this is a general XML parser, we make it convenient for
# GAPDoc documents)
BindGlobal("ENTITYDICT_GAPDoc", rec(
# compatibility entities, no longer needed by GAPDoc >= 1.0
tamp := "&",
tlt := "<",
tgt := ">",
hash := "#",
dollar := "$",
percent := "%",
tilde := "~",
bslash := "\\",
obrace := "{",
cbrace := "}",
uscore := "_",
circum := "^",

nbsp := " ",
copyright := "©",
ndash := "–",
GAP := "<Package>GAP</Package>",
GAPDoc := "<Package>GAPDoc</Package>",
TeX := "<Alt Only='LaTeX'>{\\TeX}</Alt><Alt Not='LaTeX'>TeX</Alt>",
LaTeX := "<Alt Only='LaTeX'>{\\LaTeX}</Alt><Alt Not='LaTeX'>LaTeX</Alt>",
BibTeX := "<Alt Only='LaTeX'>Bib{\\TeX}</Alt><Alt Not='LaTeX'>BibTeX</Alt>",
MeatAxe := "<Package>MeatAxe</Package>",
XGAP := "<Package>XGAP</Package>",
CC := "ℂ",
ZZ := "ℤ",
NN := "ℕ",
PP := "ℙ",
QQ := "ℚ",
HH := "ℍ",
RR := "ℝ",
 )
);

## Parsing and resolving an entity, the needed substitution text for
## non-character entities must be bound in ENTITYDICT.
## -- assuming str[pos-1] = '&'
## -- returns pseudo-element (Char)EntityValue with content the result string
## -- character entities are just substituted and returned as string
## -- the replacement for other entities is reparsed for recursive
## substitution
InstallGlobalFunction(GetEnt, function(str, pos)
 local d, i, ch, pos1, nam, doc, res, ent;
 # character entity
 if str[pos] = '#' then
 d := "";
 if str[pos+1] = 'x' then
 i := pos + 2;
 while str[i] <> ';' do
 Add(d, str[i]);
 i := i+1;
 od;
 d := NumberDigits(d, 16);
 else
 i := pos+1;
 while str[i] <> ';' do
 Add(d, str[i]);
 i := i+1;
 od;
 d := NumberDigits(d, 10);
 fi;
 # must consider this as unicode, translate it to UTF-8
 res := rec(name := "CharEntityValue", next := i+1,
 content := Encode(Unicode([d]), "UTF-8"));
 return res;
 fi;
 # else replace and reparse for recursive entity replacements
 pos1 := Position(str, ';', pos-1);
 if pos1=pos then
 ParseError(str, pos, "empty entity name not allowed");
 elif pos1 = fail then
 ParseError(str, pos, "no semicolon in entity reference");
 fi;
 nam := str{[pos..pos1-1]};
 if not IsBound(ENTITYDICT.(nam)) then
 # XXX error or better going on here?
## ParseError(str, pos, "don't know entity name");
 Info(InfoXMLParser, 1, "#W WARNING: Entity with name `", nam,
 "' not known!\n#W", " (Specify in <!DOCTYPE ...> tag or ",
 "in argument to parser!)\n");
 doc := Concatenation("UNKNOWNEntity(", nam, ")");
 else
 doc := ENTITYDICT.(nam);
 fi;
 i := 1;
 res := "";
 while i <= Length(doc) do
 if doc[i] <> '&' or (i<Length(doc) and doc[i+1] <> '#') then
 Add(res, doc[i]);
 i := i+1;
 else
 ent := GetEnt(doc, i+1);
 Append(res, ent.content);
 i := ent.next;
 fi;
 od;
 return rec(name := "EntityValue", content := res, next := pos1+1);
end);

## reading a start tag including attribute values
# returns rec(name := elementname,
# attributes := rec( attributename1 := attributevalue1, ...)
# content := EMPTYCONTENT or [] (to be filled recursively)
# next := positon in string after start tag )
# Special handling of case pos=1: the element name is not parsed but assumed
# to be WHOLEDOCUMENT; this way a complete document can be put in one pseudo
# element of this name.
# assuming str[pos-1] = '<' and str[pos]<>'/'
InstallGlobalFunction(GetSTag, function(str, pos)
 local res, pos2, start, attr, atval, delim, a, ent;
 res := rec(attributes := rec());
 # a small hack that allows to call GetElement with a whole document
 # after appending "</WHOLEDOCUMENT>"
 if pos=1 then
 res.name := "WHOLEDOCUMENT";
 res.next := 1;
 res.content := [];
 res.input := ShallowCopy(str);
 return res;
 fi;
 # name of element
 pos2 := GetChars(str, pos, NAMECHARS);
 if pos2=fail then
 ParseError(str, pos, "documents ends in element name");
 fi;
 if pos2=pos then
 ParseError(str, pos, "tag must start with name \'<name ...\'");
 fi;
 res.name := str{[pos..pos2-1]};
 # look for attributes or end of tag
 pos := GetChars(str, pos2, WHITESPACE);
 if pos=fail then
 ParseError(str, pos2, "document ends in tag");
 fi;
 while not str[pos] in "/>" do
 if not str[pos-1] in WHITESPACE then
 ParseError(str, pos-1, Concatenation("there must be white space ",
 "before attribute name"));
 fi;
 pos2 := GetChars(str, pos, NAMECHARS);
 if pos2=fail then
 ParseError(str, pos, "document ends in attribute name");
 fi;
 if pos2=pos then
 ParseError(str, pos, "attribute must have non-empty name");
 fi;
 # reading attribute value
 attr := str{[pos..pos2-1]};
## if not (str[pos2] = '=' and str[pos2+1] in "\"'") then
## ParseError(str, pos2, Concatenation("attribute must be specified ",
## "in form \'attr=\"text\"\'"));
## fi;
## delim := str[pos2+1];
 # can be white space around =
 pos2 := GetChars(str, pos2, WHITESPACE);
 if pos2 = fail or str[pos2] <> '=' then
 ParseError(str, pos2, "expecting '=' for attribute value");
 fi;
 pos2 := GetChars(str, pos2+1, WHITESPACE);
 if pos2 = fail or not str[pos2] in "\"'" then
 ParseError(str, pos2, "expecting quotes for attribute value");
 fi;
 delim := str[pos2];
 atval := "";
 pos2 := pos2 + 1;
 while str[pos2] <> delim do
 # we allow attr='fkjf"fafds' as well, see AnnStd 2.3
 pos2 := GetWord(str, pos2, "", "<&\"'");
 if pos2=fail then
 ParseError(str, pos, "document ends in attribute value");
 fi;
 # must allow &xyz; for entity resolution as well
 if not str[pos2[2]+1] = delim then
 if str[pos2[2]+1] = '&' then
 ent := GetEnt(str, pos2[2]+2);
 Append(atval, str{[pos2[1]..pos2[2]]});
 start := pos2[2]+2;
 pos2 := ent.next;
 if ent.name = "CharEntityValue" then
 Append(atval, ent.content);
 else
 # now ent.content may still contain some character entities, but
 # no '<' and so no markup
 if '<' in ent.content then
 ParseError(str, start,
 "entity replacement in attribute value cannot contain '<'");
 fi;
 ent := GetElement(Concatenation(ent.content,"</WHOLEDOCUMENT>"),1);
 if IsString(ent.content) then
 Append(atval, ent.content);
 else
 for a in ent.content do
 Append(atval, a.content);
 od;
 fi;
 fi;
 elif str[pos2[2]+1] in "\"'" then
 Append(atval, str{[pos2[1]..pos2[2]+1]});
 pos2 := pos2[2]+2;
 else
 ParseError(str, pos2[2]+1, "non valid character in attribute value");
 fi;
 else
 Append(atval, str{[pos2[1]..pos2[2]]});
 pos2 := pos2[2]+1;
 fi;
 od;
 res.attributes.(attr) := atval;
 pos2 := pos2+1;
 pos := GetChars(str, pos2, WHITESPACE);
 if pos=fail then
 ParseError(str, pos2, "document ends in tag");
 fi;
 od;
 if str[pos] = '/' then
 res.content := EMPTYCONTENT;
 pos := pos+1;
 else
 res.content := [];
 fi;
 if not str[pos] = '>' then
 ParseError(str, pos, "expecting end of tag \'>\' here");
 fi;
 res.next := pos+1;
 return res;
end);

## reading an end tag,
## returns rec( name := elementname,
## next := first position after this end tag)
# assuming str{[pos-2,pos-1]} = "</"
InstallGlobalFunction(GetETag, function(str, pos)
 local res, pos2;
 res := rec();
 # name of element
 pos2 := GetChars(str, pos, NAMECHARS);
 if pos2=fail then
 ParseError(str, pos, "documents ends in element name");
 fi;
 if pos2=pos then
 ParseError(str, pos, "end tag must start with name \'</name ...\'");
 fi;
 res.name := str{[pos..pos2-1]};
 pos := pos2;
 pos2 := GetChars(str, pos, WHITESPACE);
 if pos2=fail then
 ParseError(str, pos, "documents ends inside end tag");
 fi;
 if str[pos2] <> '>' then
 ParseError(str, pos2, "expecting end of tag \'>\' here");
 fi;
 res.next := pos2+1;
 return res;
end);

## reading an element: start tag, content (with recursive calls of
## GetElement) and end tag
# returns record explained before GetSTag, but with .content component
# filled
# assuming str[pos-1] = '<' and str[pos] in NAMECHARS
# (in this function we read entity definitions inside a <!DOCTYPE declaration)
InstallGlobalFunction(GetElement, function(str, pos)
 local res, r, s, pos2, lev, dt, p, nam, val, el, tmp;
 res := GetSTag(str,pos);
 res.start := pos - 1;
 # case of empty element
 if res.content = EMPTYCONTENT then
 res.stop := res.next - 1;
 return res;
 fi;
 pos := res.next;
 while true do
 if str[pos] = '&' then
 # resolve entity
 r := GetEnt(str, pos+1);
 pos := r.next;
 if r.name = "CharEntityValue" then
 # consider as PCDATA
 r.name := "PCDATA";
 Add(res.content, r);
 else
 # we have to parse the result
 s := Concatenation(r.content, "</WHOLEDOCUMENT>");
 r := GetElement(s, 1);
 Append(res.content, r.content);
 fi;
 elif str[pos] = '<' then
 if str[pos+1] = '?' then
 # processing instruction (PI), we repeat it literally
 pos2 := PositionSublist(str, "?>", pos+2);
 if pos2=fail then
 ParseError(str, pos+2, "document ends within processing instruction");
 fi;
 tmp := str{[pos+2..pos2-1]};
 Add(res.content, rec(name := "XMLPI", content := tmp));
 # check for encoding information
 if Length(tmp) > 3 and tmp{[1..4]} = "xml " then
 tmp := Concatenation(tmp, "/>");
 tmp := GetElement(tmp, 3);
 if IsBound(tmp.attributes.encoding) then
 tmp := tmp.attributes.encoding;
 if UNICODE_RECODE.NormalizedEncoding(tmp) = fail then
 Error("Cannot parse document in encoding ", tmp, "\n");
 fi;
 XMLPARSERFLAGS.Encoding := UNICODE_RECODE.NormalizedEncoding(tmp);
 # if not in UTF-8 encoding we recode rest of the document now
 if XMLPARSERFLAGS.Encoding <> "UTF-8" then
 Info(InfoGAPDoc, 1, "#I recoding input from ",
 XMLPARSERFLAGS.Encoding, " to UTF-8 . . .\n");
 tmp := Encode(Unicode(str{[pos..Length(str)]},
 XMLPARSERFLAGS.Encoding), "UTF-8");
 str{[pos..pos-1+Length(tmp)]} := tmp;
 fi;
 fi;
 fi;
 pos := pos2+2;
 elif str[pos+1] = '!' then
 if str[pos+2] = '-' and str[pos+3] = '-' then
 ## comment
 # here we ignore the restriction that inside comment
 # no "--" is allowed.
 pos2 := PositionSublist(str, "-->", pos+4);
 if pos2=fail then
 ParseError(str,pos+4, "document ends within comment");
 fi;
 Add(res.content, rec(name := "XMLCOMMENT",
 content := str{[pos+4..pos2-1]}));
 pos := pos2+3;
 elif str[pos+2] = 'D' and str{[pos+3..pos+8]} = "OCTYPE" and
 str[pos+9] in WHITESPACE then
 ## <!DOCTYPE ....
 ## end of this tag is matching ">"
 ## we have to read ENTITY declarations
 pos2 := pos+10;
 lev := 0;
 while str[pos2] <> '>' or lev > 0 do
 if str[pos2] = '<' then
 lev := lev+1;
 elif str[pos2] = '>' then
 lev := lev-1;
 fi;
 pos2 := pos2+1;
 if pos2>Length(str) then
 ParseError(str,pos+10, "document ends within DOCTYPE tag");
 fi;
 od;
 dt := rec(name := "XMLDOCTYPE",
 content := str{[pos+10..pos2-1]});
 ## convenience for parsing GAPDoc document, here we add the
 ## GAPDoc defined entities automatically
 pos := PositionSublist(dt.content, "gapdoc.dtd");
 if pos <> fail and dt.content[pos-1] in "'\"/" then
 for p in RecNames(ENTITYDICT_GAPDoc) do
 ENTITYDICT.(p) := ENTITYDICT_GAPDoc.(p);
 od;
 fi;
 Add(res.content, dt);
 ## parse entity declarations in here (no good error checking)
 pos := PositionSublist(dt.content, "<!ENTITY");
 while pos <> fail do
 p := GetWord(dt.content, pos+8);
 nam := dt.content{[p[1]..p[2]]};
 # value enclosed in ".." or '..'
 p := p[2]+1;
 while dt.content[p] in WHITESPACE do
 p := p + 1;
 od;
 p := [p+1];
 Add(p, Position(dt.content, dt.content[p[1]-1], p[1])-1);
 val := dt.content{[p[1]..p[2]]};
 ENTITYDICT.(nam) := val;
 pos := PositionSublist(dt.content, "<!ENTITY", p[2]);
 od;
 pos := pos2+1;
 elif str[pos+2] = '[' and str{[pos+3..pos+8]} = "CDATA[" then
 ## <![CDATA[ everything is verbose text until "]]>"
 pos2 := PositionSublist(str, "]]>", pos+9);
 if pos2=fail then
 ParseError(str,pos+10, "document ends within CDATA text");
 fi;
 if pos2>pos+9 then
 Add(res.content, rec(name := "PCDATA",
 content := str{[pos+9..pos2-1]}));
 fi;
 pos := pos2+3;
 else
 ParseError(str, pos, "unknown \"<!\"-tag");
 fi;
 elif str[pos+1] = '/' then
 ## end tag, must be the right one corresponding to the
 ## current element
 el := GetETag(str, pos+2);
 if res.name <> el.name then
 ParseError(str, pos, Concatenation("wrong end tag, expecting \"</",
 res.name, ">\" (starts line ",
 String(LineNumberStringPosition(str, res.start)[1]), ")"));
 else
 res.stop := el.next - 1;
 res.next := el.next;
 break;
 fi;
 elif not str[pos+1] in NAMECHARS then
 ParseError(str, pos+1, "not allowed character after '<'");
 else
 ## a new element starts, call GetElement recursively
 el := GetElement(str, pos+1);
 Add(res.content, el);
 pos := el.next;
 fi;
 else
 pos2 := GetWord(str, pos, "", "<&");
 if pos2 = fail then
 ParseError(str, pos, "document ends before end of current element");
 fi;
 if pos2[2] >= pos then
 Add(res.content, rec(name := "PCDATA",
 content := str{[pos..pos2[2]]}));
 fi;
 pos := pos2[2]+1;
 fi;
 od;
 return res;
end);

## the user function for parsing an XML document stored in a string,
## adds end tag for pseudo element WHOLEDOCUMENT (see before GetSTag)
## and calls GetElement

## <#GAPDoc Label="ParseTreeXMLString">
## <ManSection >
## <Func Arg="str[, srcinfo][, entitydict]" Name="ParseTreeXMLString" />
## <Func Arg="fname[, entitydict]" Name="ParseTreeXMLFile" />
## <Returns>a record which is root of a tree structure</Returns>
## <Description>
## The first function parses an XML-document stored in string <A>str</A>
## and returns the document in form of a tree.
##
## The optional argument <A>srcinfo</A> must have the same format
## as in <Ref Func="OriginalPositionDocument" />. If it is given then
## error messages refer to the original source of the text with the
## problem.
##
## With the optional argument <A>entitydict</A> named entities can be
## given to the parser, for example entities which are defined in the
## <C>.dtd</C>-file (which is not read by this parser). The standard
## XML-entities do not need to be provided, and for &GAPDoc; documents
## the entity definitions from <C>gapdoc.dtd</C> are automatically
## provided. Entities in the document's <C><!DOCTYPE</C> declaration
## are parsed and also need not to be provided here. The argument
## <A>entitydict</A> must be a record where each component name is an entity
## name (without the surrounding & and ;) to which is assigned its
## substitution string.
##
## The second function is just a shortcut for <C>ParseTreeXMLString(
## StringFile(</C><A>fname</A><C>), ... )</C>, see <Ref Func="StringFile"/>.
## 
##
## After these functions return the list of named entities which were known
## during the parsing can be found in the record <C>ENTITYDICT</C>. 
##
## A node in the result tree corresponds to an XML element, or to some
## parsed character data. In the first case it looks as follows:
##
## <Listing Type="Example Node">
## rec( name := "Book",
## attributes := rec( Name := "EDIM" ),
## content := [ ... list of nodes for content ...],
## start := 312,
## stop := 15610,
## next := 15611 )
## </Listing>
##
## This means that <C><A>str</A>{[312..15610]}</C> looks like
## <C><Book Name="EDIM"> ... content ... </Book></C>.
##
## The leaves of the tree encode parsed character data as in the
## following example:
##
## <Listing Type="Example Node">
## rec( name := "PCDATA",
## content := "text without markup " )
## </Listing>
##
## This function checks whether the XML document is <Emph>well
## formed</Emph>, see <Ref Chap="XMLvalid" /> for an explanation.
## If an error in the XML structure is found, a break loop is
## entered and the text around the position where the problem starts
## is shown. With <C>Show();</C> one can browse the original input
## in the <Ref BookName="Ref" Func="Pager" />, starting with the
## line where the error occurred.
##
## All entities are resolved when they are either entities defined
## in the &GAPDoc; package (in particular the standard XML entities)
## or if their definition is included in the <C><!DOCTYPE ..></C>
## tag of the document.
##
## Note that <Ref Func="ParseTreeXMLString" /> does not parse
## and interpret the corresponding document type definition (the
## <C>.dtd</C>-file given in the <C><!DOCTYPE ..></C> tag). Hence
## it also does not check the <Emph>validity</Emph> of the document
## (i.e., it is no <Emph>validating XML parser</Emph>).
##
## If you are using this function to parse a &GAPDoc; document
## you can use <Ref Func="CheckAndCleanGapDocTree" /> for some
## validation and additional checking of the document structure.
##
## </Description>
## </ManSection>
## <#/GAPDoc>
##
InstallGlobalFunction(ParseTreeXMLString, function(arg)
 local str, ents, res, a;
 # artificial end tag to wrap document in one element
 str := Concatenation(arg[1], "</WHOLEDOCUMENT>");
 # default encoding is UTF-8, may be changed if we find a <?xml ...
 # processing instruction
 XMLPARSERFLAGS.Encoding := "UTF-8";
 if Length(arg) > 1 and IsList(arg[2]) then
 XMLPARSEORIGINS := arg[2];
 else
 XMLPARSEORIGINS := false;
 fi;
 # reset ENTITYDICT
 for a in RecNames(ENTITYDICT) do
 Unbind(ENTITYDICT.(a));
 od;
 for a in RecNames(ENTITYDICT_default) do
 ENTITYDICT.(a) := ENTITYDICT_default.(a);
 od;
 # maybe load more entities from last argument
 if Length(arg) > 1 and IsRecord(arg[Length(arg)]) then
 ents := arg[Length(arg)];
 for a in RecNames(ents) do
 ENTITYDICT.(a) := ents.(a);
 od;
 fi;
 res := GetElement(str, 1);
 res.input := ShallowCopy(arg[1]);
 if XMLPARSEORIGINS <> false then
 res.inputorigins := XMLPARSEORIGINS;
 fi;
 return res;
end);
InstallGlobalFunction(ParseTreeXMLFile, function(arg)
 arg := ShallowCopy(arg);
 arg[1] := StringFile(arg[1]);
 return CallFuncList(ParseTreeXMLString, arg);
end);

## Print document tree structure (without the PCDATA entries)

## <#GAPDoc Label="DisplayXMLStructure">
## <ManSection >
## <Func Arg="tree" Name="DisplayXMLStructure" />
## <Description>
## This utility displays the tree structure of an XML document as it
## is returned by <Ref Func="ParseTreeXMLString" /> (without the
## <C>PCDATA</C> leaves).
##
## Since this is usually quite long the result is shown using the
## <Ref BookName="ref" Func="Pager" />.
## </Description>
## </ManSection>
## <#/GAPDoc>
##
InstallGlobalFunction(DisplayXMLStructure, function(doc)
 local NL, prs, app, str;
 str := "";
 NL := "\n";
 app := function(arg)
 local i;
 for i in [2..Length(arg)] do
 Append(arg[1], arg[i]);
 od;
 end;
 prs := function(doc, indent)
 local a, c, indentnext;
 if doc.name = "PCDATA" then
 return;
 fi;
 if IsBound(doc.count) then
 c := String(doc.count);
 else
 c := "";
 fi;
 app(str, indent, c, " ", doc.name, NL);
 if IsBound(doc.attributes) then
 for a in NamesOfComponents(doc.attributes) do
 app(str, indent," #",a,":",doc.attributes.(a), NL);
 od;
 fi;
 if doc.content = EMPTYCONTENT then
 app(str, indent, " # empty element\n");
 elif IsString(doc.content) then
## ??? too much output
## Print(indent, " # data\n");
 else
 for a in doc.content do
 indentnext := Concatenation(indent, " ");
 prs(a, indentnext);
 od;
 fi;
 end;
 prs(doc, "");
 Page(str);
end);

## apply a function to all nodes of a parse tree

## <#GAPDoc Label="ApplyToNodesParseTree">
## <ManSection >
## <Func Arg="tree, fun" Name="ApplyToNodesParseTree" />
## <Func Arg="tree" Name="AddRootParseTree" />
## <Func Arg="tree" Name="RemoveRootParseTree" />
## <Description>
## The function <Ref Func="ApplyToNodesParseTree" /> applies a
## function <A>fun</A> to all nodes of the parse tree <A>tree</A>
## of an XML document returned by <Ref Func="ParseTreeXMLString"
## />.
##
## The function <Ref Func="AddRootParseTree" /> is an application of
## this. It adds to all nodes a component <C>.root</C> to which
## the top node tree <A>tree</A> is assigned. These components can be
## removed afterwards with <Ref Func="RemoveRootParseTree" />.
## </Description>
## </ManSection>
## <#/GAPDoc>
##
InstallGlobalFunction(ApplyToNodesParseTree, function(r, f)
 local ff;
 ff := function(rr)
 local a;
 if IsList(rr.content) and not IsString(rr.content) then
 for a in rr.content do
 f(a);
 ff(a);
 od;
 fi;
 end;
 f(r);
 ff(r);
end);

## This is useful for things like indexing where one should have
## access to the root of the document tree during the whole processing
InstallGlobalFunction(AddRootParseTree, function(r)
 ApplyToNodesParseTree(r, function(a) a.root := r; end);
end);

## And this throws away the links
InstallGlobalFunction(RemoveRootParseTree, function(r)
 ApplyToNodesParseTree(r, function(a) Unbind(a.root); end);
end);

## <#GAPDoc Label="StringXMLElement">
## <ManSection >
## <Func Arg="tree" Name="StringXMLElement" />
## <Returns>a list <C>[string, positions]</C></Returns>
## <Description>
##
## The argument <A>tree</A> must have a format of a node in the parse tree
## of an XML document as returned by <Ref Func="ParseTreeXMLString"/>
## (including the root node representing the full document). This function
## computes a pair <C>[string, positions]</C> where <C>string</C> contains
## XML code which is equivalent to the code which was parsed to get
## <A>tree</A>. And <C>positions</C> is a list of lists of four numbers
## <C>[eltb, elte, contb, conte]</C>. There is one such list for each XML
## element occuring in <C>string</C>, where <C>eltb</C> and <C>elte</C> are
## the begin and end position of this element in <C>string</C> and where
## <C>contb</C> and <C>conte</C> are begin and end position of the content
## of this element, or both are <C>0</C> if there is no content.
##
## Note that parsing XML code is an irreversible task, we can only expect
## to get equivalent XML code from this function. But parsing the resulting
## <C>string</C> again and applying <Ref Func="StringXMLElement"/> again
## gives the same result. See the function <Ref Func="EntitySubstitution"/>
## for back-substitutions of entities in the result.
## </Description>
## </ManSection>
## <#/GAPDoc>
# args: r[, count, pos, str] count, pos, str is for use within recursion
StringXMLElement := function(arg)
 local r, str, pos, p, tmp, att, a;
 if Length(arg) = 1 then
 r := arg[1];
 str := StringXMLElement(r, [], "");
 # revert the WHOLEDOCUMENT trick of the parser
 if IsRecord(r) and r.name = "WHOLEDOCUMENT" then
 str[1] := str[1]{[16..Length(str[1])-16]};
 str[2] := str[2] - 15;
 str[2][Length(str[2])] := [1, Length(str[1]), 1, Length(str[1])];
 for a in str[2] do
 if a[3] = -15 then
 a[3] := 0;
 a[4] := 0;
 fi;
 od;
 fi;
 return str;
 fi;
 # now we are in the recursion
 r := arg[1];
 pos := arg[2];
 str := arg[3];

 if IsRecord(r) then
 if r.name = "PCDATA" then
 return StringXMLElement(r.content, pos, str);
 elif r.name = "XMLPI" then
 p := Length(str)+1;
 Append(str, Concatenation("<?", r.content, "?>"));
 Add(pos, [p, Length(str), 0, 0]);
 return [str, pos];
 elif r.name = "XMLDOCTYPE" then
 p := Length(str)+1;
 Append(str, Concatenation("<!DOCTYPE ", r.content, ">"));
 Add(pos, [p, Length(str), 0, 0]);
 return [str, pos];
 elif r.name = "XMLCOMMENT" then
 p := Length(str)+1;
 Append(str, Concatenation(""));
 Add(pos, [p, Length(str), 0, 0]);
 return [str, pos];
 fi;
 fi;
 if IsString(r) then
 r := SubstitutionSublist(r, "&", "&");
 r := SubstitutionSublist(r, "<", "<");
 if Length(r) > 0 then
 Append(str, r);
 fi;
 return [str, pos];
 fi;
 p := [Length(str)+1];
 Append(str, "<");
 Append(str, r.name);
 for att in RecNames(r.attributes) do
 Add(str, ' ');
 Append(str, att);
 Append(str, "=\"");
 tmp := SubstitutionSublist(r.attributes.(att), "\"", """);
 tmp := SubstitutionSublist(tmp, "&", "&");
 tmp := SubstitutionSublist(tmp, "<", "<");
 if Length(tmp)>0 then
 fi;
 Append(str, tmp);
 Append(str, "\"");
 od;
 if r.content = 0 then
 Append(str, "/>");
 Add(pos, [p[1], Length(str), 0, 0]);
 return [str, pos];
 fi;
 Add(str, '>');
 p[3] := Length(str)+1;
 if IsString(r.content) then
 StringXMLElement(r.content, pos, str);
 else
 for a in r.content do
 StringXMLElement(a, pos, str);
 od;
 fi;
 p[4] := Length(str);
 Append(str, "</");
 Append(str, r.name);
 Add(str, '>');
 p[2] := Length(str);
 Add(pos, p);
 return [str, pos];
end;

## <#GAPDoc Label="EntitySubstitution">
## <ManSection >
## <Func Arg="xmlstring, entities" Name="EntitySubstitution" />
## <Returns>a string</Returns>
## <Description>
## The argument <A>xmlstring</A> must be a string containing XML
## code or a pair <C>[string, positions]</C> as returned by <Ref
## Func="StringXMLElement"/>. The argument <A>entities</A> specifies entity
## names (without the surrounding <A>&</A> and <C>;</C>) and their
## substitution strings, either a list of pairs of strings or as a record
## with the names as components and the substitutions as values.
##
## This function tries to substitute non-intersecting parts of
## <C>string</C> by the given entities. If the <C>positions</C> information
## is given then only parts of the document which allow a valid
## substitution by an entity are considered. Otherwise a simple text
## substitution without further check is done. 
##
## Note that in general the entity resolution in XML documents is a
## complicated and non-reversible task. But nevertheless this utility may
## be useful in not too complicated situations.
## </Description>
## </ManSection>
## <#/GAPDoc>
EntitySubstitution := function(xmlstr, entities)
 local posinfo, entities2, check, subs, pos, npos, new, res, off, a;
 if not IsString(xmlstr) then
 posinfo := xmlstr[2];
 xmlstr := xmlstr[1];
 else
 posinfo := fail;
 fi;
 if IsRecord(entities) then
 entities := List(RecNames(entities), f-> [f, entities.(f)]);
 fi;
 # parse and rewrite entities
 entities2 := List(entities, a-> [a[1], StringXMLElement(
 ParseTreeXMLString(a[2]))[1]]);
 # checks if beginning and end of a substring are in the content of the
 # same element (if this information is available)
 check := function(b, e)
 local pb, a;
 if posinfo = fail then
 return true;
 fi;
 pb := [-1];
 for a in posinfo do
 if a[1] <= b and a[1] > pb[1] and a[2] >= e then
 pb := a;
 fi;
 od;
 if b = pb[1] and e = pb[2] then
 return true;
 fi;
 for a in posinfo do
 if a <> pb and a[1] > pb[1] and a[2] < pb[2] then
 if not Intersection([b..e],[a[1]..a[2]]) in [[], [a[1]..a[2]]] then
 return false;
 fi;
 fi;
 od;
 return true;
 end;

 subs := [];
 for a in entities2 do
 if not a[1] in ["lt", "gt", "amp", "apos", "quot"] then
 pos := 0;
 while pos <> fail do
 npos := PositionSublist(xmlstr, a[2], pos);
 if npos <> fail and check(npos, npos-1+Length(a[2])) then
 new := [npos, npos-1+Length(a[2]), a];
 if ForAll(subs, b-> b[1] > new[2] or b[2] < new[1]) then
 Add(subs, new);
 fi;
 pos := new[2];
 else
 pos := npos;
 fi;
 od;
 fi;
 od;
 Sort(subs);
 if Length(subs) > 0 then
 res := xmlstr{[1..subs[1][1]-1]};
 off := 0;
 res := "";
 for a in subs do
 Append(res, xmlstr{[off+1..a[1]-1]});
 Append(res, Concatenation("&", a[3][1], ";"));
 off := a[2];
 od;
 Append(res, xmlstr{[off+1..Length(xmlstr)]});
 xmlstr := res;
 fi;
 return xmlstr;
end;

## <#GAPDoc Label="GetTextXMLTree">
## <ManSection >
## <Func Arg="tree" Name="GetTextXMLTree" />
## <Returns>a string</Returns>
## <Description>
## The argument <A>tree</A> must be a node of a parse tree of some
## XML document, see <Ref Func="ParseTreeXMLFile"/>.
## This function collects the content of this and all included elements
## recursively into a string.
## </Description>
## </ManSection>
## <#/GAPDoc>
# extract and collect text in elements recursively
InstallGlobalFunction(GetTextXMLTree, function(r)
 local res, fun;
 res := "";
 fun := function(r)
 if IsString(r.content) then
 Append(res, r.content);
 fi;
 end;
 ApplyToNodesParseTree(r, fun);
 return res;
end);

## <#GAPDoc Label="XMLElements">
## <ManSection >
## <Func Arg="tree, eltnames" Name="XMLElements" />
## <Returns>a list of nodes</Returns>
## <Description>
## The argument <A>tree</A> must be a node of a parse tree of some
## XML document, see <Ref Func="ParseTreeXMLFile"/>.
## This function returns a list of all subnodes of <A>tree</A> (possibly
## including <A>tree</A>) of elements with name given in the list of strings
## <A>eltnames</A>. Use <C>"PCDATA"</C> as name for leave nodes which contain
## the actual text of the document. As an abbreviation <A>eltnames</A> can also
## be a string which is then put in a one element list.
## </Description>
## </ManSection>
## <#/GAPDoc>
# return list of nodes of elements with name in 'eltnames' from XML tree r
InstallGlobalFunction(XMLElements, function(r, eltnames)
 local res, fun;
 if IsString(eltnames) then
 eltnames := [eltnames];
 fi;
 res := [];
 fun := function(r)
 if r.name in eltnames then
 Add(res, r);
 fi;
 end;
 ApplyToNodesParseTree(r, fun);
 return res;
end);

## <#GAPDoc Label="XMLValidate">
## <ManSection >
## <Func Arg="doc, dtdpath" Name="XMLValidate" />
## <Returns><K>fail</K> or a record</Returns>
## <Description>
## The argument <A>doc</A> must be a string which is an XML document, and
## <A>dtdpath</A> is a path containing the corresponding DTD file.
## 
## The function returns <K>fail</K> if the program <C>xmllint</C> cannot
## be called.
## 
## Otherwise the document is validated via the external
## program <C>xmllint</C> via the function <Ref BookName="IO"
## Func="IO_PipeThroughWithError"/>, and its resulting record is returned.
## </Description>
## </ManSection>
## <#/GAPDoc>
# avoid possible error message
if not IsBound(IO_PipeThroughWithError) then
 IO_PipeThroughWithError := fail;
fi;
InstallGlobalFunction(XMLValidate, function(str, dtdpath)
 local xmllint, p;
 # find xmllint
 xmllint := Filename(DirectoriesSystemPrograms(), "xmllint");
 if xmllint = fail then
 Info(InfoWarning, 1, "Warning: XMLValidate needs 'xmllint' to be installed.");
 return fail;
 fi;
 if not IsBound(IO_PipeThroughWithError) then
 Info(InfoWarning, 1, "Warning: XMLValidate needs 'IO_PipeThroughWithError' from the 'IO' package.");
 return fail;
 fi;
 p := IO_PipeThroughWithError(xmllint,
 ["--noout", "--path", dtdpath, "--valid", "-" ], str);
 return p;
end);
if IO_PipeThroughWithError=fail then
 Unbind(IO_PipeThroughWithError);
fi;

## <#GAPDoc Label="ValidateGAPDoc">
## <ManSection >
## <Func Arg="doc" Name="ValidateGAPDoc" />
## <Returns><K>fail</K>, <K>true</K> or a record</Returns>
## <Description>
## The argument <A>doc</A> must be a string which is a GAPDoc XML
## document or a pair of a string and list as returned by <Ref
## Func="ComposedDocument"/> with argument <A>info</A> set to <K>true</K>.
## 
## The function returns <K>fail</K> in case of a problem.
## 
## Otherwise the document is validated using <Ref Func="XMLValidate"/>. If
## the validation was successful this function returns <K>true</K>. In the
## case of validation errors some information is printed and the result of
## <Ref Func="XMLValidate"/> is returned.
## <Log>
## gap> fn := Filename(DirectoriesPackageLibrary("gapdoc", ""),
## > "3k+1/3k+1.xml");;
## gap> doc := ComposedDocument("GAPDoc", "", fn, [], true);;
## gap> doc[1][220] := 't';;
## gap> check := ValidateGAPDoc(doc);;
## ValidateGAPDoc found problems:
## Line 11: parser error : Opening and ending tag mismatch
## source position: /opt/gap/pkg/GAPDoc-1.6.4/3k+1/3k+1.xml, line 11
## </Log>
## </Description>
## </ManSection>
## <#/GAPDoc>
InstallGlobalFunction(ValidateGAPDoc, function(str)
 local posinfo, dtdpath, p, lineposs, err, orig, r;

 # check if arg has info on original positions
 if IsList(str) and Length(str) > 1 and IsString(str[1]) then
 posinfo := str[2];
 str := str[1];
 else
 posinfo := false;
 fi;

 # path to gapdoc.dtd
 dtdpath := Filename(DirectoriesPackageLibrary("GAPDoc", ""), "");
 if dtdpath = fail then
 Info(InfoWarning, 1, "Warning: ValidateGAPDoc: need to be run in GAP with GAPDoc.");
 return fail;
 fi;
 p := XMLValidate(str, dtdpath);
 if p = fail then
 Info(InfoWarning, 1, "Warning: ValidateGAPDoc: cannot call 'XMLValidate'");
 return fail;
 fi;
 if p.status.WEXITSTATUS = 0 then
 return true;
 fi;
 if posinfo <> false then
 lineposs := Positions(str, '\n');
 fi;
 err := SplitString(p.err, "\n", "");
 err := List(err, r-> SplitString(r, "", ":"));
 err := Filtered(err, r-> Length(r) > 3 and Int(r[2]) <> fail);
 for r in err do
 r[2] := Int(r[2]);
 od;
 if Length(err) > 0 then
 Print("ValidateGAPDoc found problems:\n");
 for r in err do
 Print("Line ", r[2], ": ", r[3], ":", r[4], "\n");
 if posinfo <> false then
 orig := OriginalPositionDocument(posinfo, lineposs[r[2]]);
 Print(" source position: ", orig[1],", line ",orig[2],"\n");
 fi;
 od;
 fi;
 return p;
end);