After some more digging I found out that the DTD entity resolution
mechanism prefixes the system ID with the path of the parent directory,
whereas parameter or general entities do not get that treatment.
class DTDResolver(etree.Resolver):
    def resolve(self,system_id,public_id,context):
        print( f"*** SYSTEM {system_id} PUBLIC {public_id}" )
        return super().resolve(system_id,public_id,context)
/home/em/Workbench/beautifulsoup> ./dtdbug.py
*** SYSTEM parts.ent PUBLIC None
*** SYSTEM /data/home/em/Workbench/buch.dtd PUBLIC -//Testing//DTD Buch//DE
*** SYSTEM kapitel1.xml PUBLIC None
Traceback (most recent call last):
File "./dtdbug.py", line 16, in <module>
tree = etree.fromstring( doc, parser )
File "src/lxml/etree.pyx", line 3235, in lxml.etree.fromstring
File "src/lxml/parser.pxi", line 1876, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1764, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
File "<string>", line 5
lxml.etree.XMLSyntaxError: failed to load external entity "/data/home/em/Workbench/buch.dtd", line 5, column 3
/home/em/Workbench/beautifulsoup>
However, at least I can work around that by using an explicit catalog.xml:
> XML_CATALOG_FILES=catalog.xml ./dtdbug.py
*** SYSTEM parts.ent PUBLIC None
*** SYSTEM /data/home/em/Workbench/buch.dtd PUBLIC -//Testing//DTD Buch//DE
*** SYSTEM kapitel1.xml PUBLIC None
>
It still gets the wrong system ID, but it no longer throws exceptions.
After some more digging I found out that the DTD entity resolution
mechanism prefixes the system ID with the path of the parent directory,
whereas parameter or general entities do not get that treatment.
class DTDResolver(etree.Resolver):
    def resolve(self,system_id,public_id,context):
        print( f"*** SYSTEM {system_id} PUBLIC {public_id}" )
        return super().resolve(system_id,public_id,context)
doc = open("rama.xml","rb").read()
parser = etree.XMLParser(dtd_validation=True,load_dtd=True)
parser.resolvers.add( DTDResolver() )
tree = etree.fromstring( doc, parser )
/home/em/Workbench/beautifulsoup> ./dtdbug.py
*** SYSTEM parts.ent PUBLIC None
*** SYSTEM /data/home/em/Workbench/buch.dtd PUBLIC -//Testing//DTD Buch//DE
*** SYSTEM kapitel1.xml PUBLIC None
Traceback (most recent call last):
File "./dtdbug.py", line 16, in <module>
tree = etree.fromstring( doc, parser )
File "src/lxml/etree.pyx", line 3235, in lxml.etree.fromstring
File "src/lxml/parser.pxi", line 1876, in lxml.etree._parseMemoryDocument
File "src/lxml/parser.pxi", line 1764, in lxml.etree._parseDoc
File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
File "<string>", line 5
lxml.etree.XMLSyntaxError: failed to load external entity "/data/home/em/Workbench/buch.dtd", line 5, column 3
/home/em/Workbench/beautifulsoup>
However, at least I can work around that by using an explicit catalog.xml:
<?xml version="1.0"?>
<!DOCTYPE catalog PUBLIC "-//OASIS//DTD XML Catalogs V1.0//EN"
"file:///usr/share/xml/schema/xml-core/catalog.dtd">
<catalog xmlns="urn:oasis:names:tc:entity:xmlns:xml:catalog">
<public publicId="-//Testing//DTD Buch//DE" uri="buch.dtd"/>
<system systemId="parts.ent" uri="parts.ent"/>
<system systemId="kapitel1.xml" uri="kapitel1.xml"/>
<system systemId="kapitel2.xml" uri="kapitel2.xml"/>
</catalog>
> XML_CATALOG_FILES=catalog.xml ./dtdbug.py
*** SYSTEM parts.ent PUBLIC None
*** SYSTEM /data/home/em/Workbench/buch.dtd PUBLIC -//Testing//DTD Buch//DE
*** SYSTEM kapitel1.xml PUBLIC None
>
It still gets the wrong system ID, but it no longer throws exceptions.