From 7f631b112e55051cb551a8d78823da1c0e2313a7 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Mon, 19 Apr 2021 16:53:33 -0700 Subject: [PATCH] lxml's HTML parser adds html/body tags, remove them if present... This causes issues w/ the atom blog as it puts html tags in the middle of an html document where they shouldn't be.. --- encthenet_plugins.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/encthenet_plugins.py b/encthenet_plugins.py index 86f9502..08b9db2 100644 --- a/encthenet_plugins.py +++ b/encthenet_plugins.py @@ -21,7 +21,24 @@ def rellinktoabs(context, value): # prefix them w/ the content_url i.attrib['href'] = content_url + i.attrib['href'] - return etree.tostring(html, encoding='unicode', method='html') + res = etree.tostring(html, encoding='unicode', method='html') + + # lxml.HTML wraps the html w/ html/body tags, strip them + # if present + + startstr = '<html><body>' + endstr = '</body></html>' + + startpos = 0 + endpos = None + if res.startswith(startstr): + startpos = len(startstr) + if res.endswith(endstr): + endpos = -len(endstr) + + res = res[startpos:endpos] + + return res # mostly copied from hyde.ext.templates.jinja.py Markdown # and using docs from: