reinit the tree, so we can have metadata

author: V3n3RiX <venerix@redcorelinux.org> 2017-10-09 18:53:29 +0100
committer: V3n3RiX <venerix@redcorelinux.org> 2017-10-09 18:53:29 +0100
commit: 4f2d7949f03e1c198bc888f2d05f421d35c57e21 (patch)
tree: ba5f07bf3f9d22d82e54a462313f5d244036c768 /dev-python/html5lib/files
2 files changed, 167 insertions, 0 deletions
diff --git a/dev-python/html5lib/files/html5lib-0.9999999-lxml-3.5.0-backport.patch b/dev-python/html5lib/files/html5lib-0.9999999-lxml-3.5.0-backport.patch
new file mode 100644
index 000000000000..fecfab9a4fb4
--- /dev/null
+++ b/dev-python/html5lib/files/html5lib-0.9999999-lxml-3.5.0-backport.patch
@@ -0,0 +1,117 @@
+From 46046c0f7125911ff8205f09a7574573bb953105 Mon Sep 17 00:00:00 2001
+From: Geoffrey Sneddon <geoffers@gmail.com>
+Date: Mon, 23 Nov 2015 15:17:07 +0000
+Subject: [PATCH 1/3] Make lxml tree-builder coerce comments to work with lxml
+ 3.5.
+
+---
+ html5lib/ihatexml.py                | 2 ++
+ html5lib/treebuilders/etree_lxml.py | 2 +-
+ 2 files changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
+index 0fc7930..b5b2e98 100644
+--- a/html5lib/ihatexml.py
++++ b/html5lib/ihatexml.py
+@@ -225,6 +225,8 @@ def coerceComment(self, data):
+             while "--" in data:
+                 warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
+                 data = data.replace("--", "- -")
++            if data.endswith("-"):
++                data += " "
+         return data
+ 
+     def coerceCharacters(self, data):
+diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
+index 35d08ef..17007e3 100644
+--- a/html5lib/treebuilders/etree_lxml.py
++++ b/html5lib/treebuilders/etree_lxml.py
+@@ -189,7 +189,7 @@ class TreeBuilder(_base.TreeBuilder):
+ 
+     def __init__(self, namespaceHTMLElements, fullTree=False):
+         builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
+-        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
++        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
+         self.namespaceHTMLElements = namespaceHTMLElements
+ 
+         class Attributes(dict):
+
+From 1c22e1ce93dd4acc81a66cfa03cf9720fbd741c7 Mon Sep 17 00:00:00 2001
+From: Geoffrey Sneddon <geoffers@gmail.com>
+Date: Mon, 23 Nov 2015 15:35:21 +0000
+Subject: [PATCH 2/3] fixup! Make lxml tree-builder coerce comments to work
+ with lxml 3.5.
+
+---
+ html5lib/ihatexml.py                | 1 +
+ html5lib/treebuilders/etree_lxml.py | 7 ++++---
+ 2 files changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
+index b5b2e98..5a81a12 100644
+--- a/html5lib/ihatexml.py
++++ b/html5lib/ihatexml.py
+@@ -226,6 +226,7 @@ def coerceComment(self, data):
+                 warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
+                 data = data.replace("--", "- -")
+             if data.endswith("-"):
++                warnings.warn("Comments cannot contain end in a dash", DataLossWarning)
+                 data += " "
+         return data
+ 
+diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
+index 17007e3..c6c981f 100644
+--- a/html5lib/treebuilders/etree_lxml.py
++++ b/html5lib/treebuilders/etree_lxml.py
+@@ -54,7 +54,7 @@ def _getChildNodes(self):
+ def testSerializer(element):
+     rv = []
+     finalText = None
+-    infosetFilter = ihatexml.InfosetFilter()
++    infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
+ 
+     def serializeElement(element, indent=0):
+         if not hasattr(element, "tag"):
+@@ -257,7 +257,7 @@ def _getData(self):
+             data = property(_getData, _setData)
+ 
+         self.elementClass = Element
+-        self.commentClass = builder.Comment
++        self.commentClass = Comment
+         # self.fragmentClass = builder.DocumentFragment
+         _base.TreeBuilder.__init__(self, namespaceHTMLElements)
+ 
+@@ -344,7 +344,8 @@ def insertRoot(self, token):
+ 
+         # Append the initial comments:
+         for comment_token in self.initial_comments:
+-            root.addprevious(etree.Comment(comment_token["data"]))
++            comment = self.commentClass(comment_token["data"])
++            root.addprevious(comment._element)
+ 
+         # Create the root document and add the ElementTree to it
+         self.document = self.documentClass()
+
+From 235a6d7ac7e0a3e2b431766e051094c2d3110ba3 Mon Sep 17 00:00:00 2001
+From: Geoffrey Sneddon <geoffers@gmail.com>
+Date: Mon, 23 Nov 2015 15:42:12 +0000
+Subject: [PATCH 3/3] fixup! Make lxml tree-builder coerce comments to work
+ with lxml 3.5.
+
+---
+ html5lib/ihatexml.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
+index 5a81a12..5da5d93 100644
+--- a/html5lib/ihatexml.py
++++ b/html5lib/ihatexml.py
+@@ -226,7 +226,7 @@ def coerceComment(self, data):
+                 warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
+                 data = data.replace("--", "- -")
+             if data.endswith("-"):
+-                warnings.warn("Comments cannot contain end in a dash", DataLossWarning)
++                warnings.warn("Comments cannot end in a dash", DataLossWarning)
+                 data += " "
+         return data
+ 
diff --git a/dev-python/html5lib/files/html5lib-0.9999999-python3.6-sanitizer-re.patch b/dev-python/html5lib/files/html5lib-0.9999999-python3.6-sanitizer-re.patch
new file mode 100644
index 000000000000..2fbef2ad0b9e
--- /dev/null
+++ b/dev-python/html5lib/files/html5lib-0.9999999-python3.6-sanitizer-re.patch
@@ -0,0 +1,50 @@
+--- a/html5lib/sanitizer.py
++++ b/html5lib/sanitizer.py
+@@ -203,7 +203,7 @@
+             for attr in self.attr_val_is_uri:
+                 if attr not in attrs:
+                     continue
+-                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
++                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
+                                        unescape(attrs[attr])).lower()
+                 # remove replacement characters from unescaped characters
+                 val_unescaped = val_unescaped.replace("\ufffd", "")
+@@ -228,7 +228,7 @@
+                                          ' ',
+                                          unescape(attrs[attr]))
+             if (token["name"] in self.svg_allow_local_href and
+-                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
++                'xlink:href' in attrs and re.search(r'^\s*[^#\s].*',
+                                                     attrs['xlink:href'])):
+                 del attrs['xlink:href']
+             if 'style' in attrs:
+@@ -257,16 +257,16 @@
+ 
+     def sanitize_css(self, style):
+         # disallow urls
+-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
++        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+ 
+         # gauntlet
+-        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
++        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+             return ''
+-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
++        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+             return ''
+ 
+         clean = []
+-        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
++        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
+             if not value:
+                 continue
+             if prop.lower() in self.allowed_css_properties:
+@@ -275,7 +275,7 @@
+                                                 'padding']:
+                 for keyword in value.split():
+                     if keyword not in self.acceptable_css_keywords and \
+-                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
++                            not re.match(r"^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
+                         break
+                 else:
+                     clean.append(prop + ': ' + value + ';')
author	V3n3RiX <venerix@redcorelinux.org>	2017-10-09 18:53:29 +0100
committer	V3n3RiX <venerix@redcorelinux.org>	2017-10-09 18:53:29 +0100
commit	4f2d7949f03e1c198bc888f2d05f421d35c57e21 (patch)
tree	ba5f07bf3f9d22d82e54a462313f5d244036c768 /dev-python/html5lib/files