root / env / lib / python2.7 / site-packages / django / utils / html_parser.py @ 1a305335
History | View | Annotate | Download (4.27 KB)
1 |
import HTMLParser as _HTMLParser |
---|---|
2 |
import re |
3 |
import sys |
4 |
|
5 |
# Interpreter version used to decide whether the stdlib parser must be patched.
current_version = sys.version_info

# The patched parser is selected on 2.x interpreters older than 2.7.3 and on
# 3.x interpreters older than 3.2.3; any other version uses the stdlib class.
use_workaround = (
    current_version < (2, 7, 3)
    or (3, 0) <= current_version < (3, 2, 3)
)
11 |
|
12 |
if not use_workaround: |
13 |
HTMLParser = _HTMLParser.HTMLParser |
14 |
else:
|
15 |
class HTMLParser(_HTMLParser.HTMLParser):
    """
    Patched version of stdlib's HTMLParser with patch from:
        http://bugs.python.org/issue670664

    While a CDATA content element is open, ``cdata_tag`` holds its
    lower-cased name; end tags that do not close that element are handed
    to ``handle_data()`` as literal text instead of being treated as markup.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the stdlib parser and start outside CDATA mode.

        Any positional or keyword arguments are forwarded unchanged to the
        stdlib constructor, so this class accepts whatever its base accepts.
        """
        _HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        # Lower-cased name of the currently open CDATA element, or None.
        self.cdata_tag = None

    def set_cdata_mode(self, tag):
        """Enter CDATA mode for the element named *tag*.

        Uses the stdlib module's ``interesting_cdata`` pattern when the
        module provides one; otherwise compiles a case-insensitive pattern
        matching only the end tag of *tag*.
        """
        cdata_tag = tag.lower()
        try:
            self.interesting = _HTMLParser.interesting_cdata
        except AttributeError:
            # Escape the name so characters such as '.' are matched
            # literally rather than as regex metacharacters.
            self.interesting = re.compile(
                r'</\s*%s\s*>' % re.escape(cdata_tag), re.I)
        self.cdata_tag = cdata_tag

    def clear_cdata_mode(self):
        """Leave CDATA mode and restore the normal scanning pattern."""
        self.interesting = _HTMLParser.interesting_normal
        self.cdata_tag = None

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag that begins at ``self.rawdata[i]``.

        Returns the index just past the tag, or the negative value reported
        by ``check_for_whole_start_tag()`` when the tag is not complete yet.
        Dispatches to ``handle_startendtag()`` for tags ending in ``/>`` and
        to ``handle_starttag()`` otherwise; in the latter case CDATA mode is
        entered when the tag is listed in ``CDATA_CONTENT_ELEMENTS``.
        """
        # NOTE(review): inside a class named HTMLParser this attribute is
        # name-mangled to _HTMLParser__starttag_text; presumably that is the
        # same attribute the stdlib base class uses -- verify before renaming.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = _HTMLParser.tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()

        while k < endpos:
            m = _HTMLParser.attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif (attrvalue[:1] in ('\'', '"')
                    and attrvalue[:1] == attrvalue[-1:]):
                # Only quoted values are stripped of their quotes and
                # unescaped; unquoted values are stored exactly as written.
                attrvalue = self.unescape(attrvalue[1:-1])
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # Pass the tag name so only its own end tag leaves CDATA mode.
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag that begins at ``self.rawdata[i]``.

        Returns -1 when no ``>`` follows position *i*; otherwise returns the
        index just past that ``>``.  In CDATA mode, malformed end tags and
        end tags for any element other than ``cdata_tag`` are reported via
        ``handle_data()``.  Otherwise ``handle_endtag()`` is called with the
        lower-cased tag name and CDATA mode is cleared.
        """
        rawdata = self.rawdata
        assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
        match = _HTMLParser.endendtag.search(rawdata, i + 1)  # >
        if not match:
            return -1
        j = match.end()
        match = _HTMLParser.endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_tag is not None:
                # Not a well-formed end tag: inside CDATA it is just text.
                self.handle_data(rawdata[i:j])
                return j
            # NOTE(review): error() is presumably expected to raise here;
            # if it returned, ``match`` would still be None below -- verify.
            self.error("bad end tag: %r" % (rawdata[i:j],))

        tag = match.group(1).strip()
        if self.cdata_tag is not None and tag.lower() != self.cdata_tag:
            # End tag of some other element inside CDATA: literal text.
            self.handle_data(rawdata[i:j])
            return j

        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j
|