2 # -*- coding: utf-8 -*-
9 # r"""[^"'<>]*(?:"[^"]*"[^"'<>]*|'[^']*'[^"'<>]*)*(?:>|(?=<)|$(?!\n))"""
11 # r'<!(?:--[^-]*-(?:[^-]+-)*?-(?:[^>-]*(?:-[^>-]+)*?)??)*(?:>|$(?!\n)|--.*$)'
a:* -> any attribute of the "a" tag
*:style -> the "style" attribute of any tag
def __init__(self, rule, allow_list, deny_list):
    """
    Build a tag filter from an evaluation rule and allow/deny lists.

    Each list entry is either an element rule (no ":", e.g. "a") or an
    attribute rule of the form "elem:attr" (e.g. "a:href").

    @param rule: filtering rule. DENY_ALLOW or ALLOW_DENY.

    @param allow_list: allowed tag/attribute's list.
    @type allow_list: sequence

    @param deny_list: denied tag/attribute's list.
    @type deny_list: sequence
    """
    # _check() reads self.rule to choose the evaluation order.
    self.rule = rule

    # Private copies: later changes to the caller's sequences must not
    # alter the filter. list() accepts any iterable, not only sliceable ones.
    self.allow_list = list(allow_list)
    self.deny_list = list(deny_list)

    # _create_filtering_rule() returns (attribute map, element list).
    # Parse the private copies so a one-shot iterable is consumed only once.
    allow_tuple = self._create_filtering_rule(self.allow_list)
    deny_tuple = self._create_filtering_rule(self.deny_list)

    self.allow_attributes, self.allow_elements = allow_tuple
    self.deny_attributes, self.deny_elements = deny_tuple
def _create_filtering_rule(self, rule_list):
    """
    Split a rule list into attribute rules and element rules.

    An entry without ":" is an element rule and is collected as-is.
    An entry of the form "elem:attr" is an attribute rule; either side
    may be "*".

    @param rule_list: tag/attribute rules
    @type rule_list: sequence

    @return: tuple (attr_map, elem_list). attr_map maps an element name
             to the list of attribute names given for it; elem_list is
             the list of element rules in input order.

    @raise ValueError: if an entry contains ":" but is not of the form
                       "elem:attr".
    """
    rule_regex = re.compile(r"^(\w*|\*):(\w*|\*)$")
    elem_list = []
    attr_map = {}
    for item in rule_list:
        if ":" not in item:  # element rule
            elem_list.append(item)
            continue

        # attribute rule
        match_obj = rule_regex.search(item)
        if match_obj is None:
            # Fail with a clear message instead of an AttributeError on
            # None.group() for entries such as "a:b:c".
            raise ValueError("invalid attribute rule: %r" % (item,))
        elem, attr = match_obj.groups()
        # NOTE(review): the reviewed listing is missing a few lines between
        # the parse above and the map update below -- confirm that no
        # normalisation step (e.g. for an empty side) has been lost.
        attr_map.setdefault(elem, []).append(attr)
    return (attr_map, elem_list)
def apply(self, str):
    """
    Apply the filter rule to a string and return the filtered copy.

    The string is split into tag-like chunks and the text between them.
    Chunks that look like a complete tag ("<...>") are passed to
    filter_tag(); everything else is passed to quote().

    @param str: target string
    @type str: string

    @return: filtered copy of str
    """
    # The capture group makes split() keep the tag chunks in its result.
    # A chunk starts at "<" and ends at ">", just before the next "<", or
    # at the end of the string; quoted attribute values may contain "<"/">".
    regex_split = re.compile(
        r"""(<[^"'<>]*(?:"[^"]*"[^"'<>]*|'[^']*'[^"'<>]*)*(?:>|(?=<)|$(?!\n)))"""
    )
    regex_tag = re.compile(r"""^<.*>$""")

    # Collect the pieces and join once instead of growing a string
    # by concatenation inside the loop.
    pieces = []
    for term in regex_split.split(str):
        if regex_tag.search(term):
            pieces.append(self.filter_tag(term))
        else:
            pieces.append(self.quote(term))
    return "".join(pieces)
def filter_tag(self, str):
    """
    Filter a single tag.

    If the element is rejected by the element rules, the whole tag is
    returned escaped via quote(). Otherwise the tag is rebuilt with only
    the attributes that pass the attribute rules.

    Attribute rules registered for the element and those registered for
    "*" (any element) are both applied.

    @param str: one tag, e.g. '<a href="URL">' or '</a>'
    @type str: string

    @return: the filtered tag, or the escaped input when the element is
             rejected or the input is not a recognisable tag
    """
    match_obj = re.search(r"^<(/{0,1}\s*\w+)\s*(.*)>", str)
    if match_obj is None:
        # Not a recognisable tag (e.g. "<>" or "<!-- ... -->"): neutralise
        # it rather than letting it through unfiltered.
        return self.quote(str)

    raw_tag = match_obj.group(1)
    is_closing = raw_tag.startswith("/")
    # Strip as well, so "</ b>" is looked up as "b", not " b".
    tag = raw_tag.replace("/", "").strip()

    # tag filtering
    # NOTE(review): names are compared case-sensitively, so "<SCRIPT>" is
    # not matched by a "script" rule -- confirm how rule lists are cased
    # before normalising here.
    if self._check(tag, self.allow_elements, self.deny_elements):
        return self.quote(str)

    if is_closing:
        # Keep the slash; a closing tag carries no attributes.
        return "</" + tag + ">"

    # attribute filtering
    allow_list = (self.allow_attributes.get(tag, [])
                  + self.allow_attributes.get("*", []))
    deny_list = (self.deny_attributes.get(tag, [])
                 + self.deny_attributes.get("*", []))

    # One match per attribute: a name, optionally followed by "=" and a
    # double-quoted, single-quoted or bare value. Quoted values may
    # contain whitespace; valueless attributes (e.g. "disabled") are
    # accepted. Anything else (stray quotes, a trailing "/") is dropped.
    attr_regex = re.compile(
        r"""([^\s"'<>/=]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'<>]+))?"""
    )
    new_list = [
        attr_match.group(0)
        for attr_match in attr_regex.finditer(match_obj.group(2))
        if not self._check(attr_match.group(1), allow_list, deny_list)
    ]

    if new_list:
        return "<" + tag + " " + " ".join(new_list) + ">"
    return "<" + tag + ">"
def _check(self, str, allow_list, deny_list):
    """
    Decide whether a tag or attribute name is rejected by the filter.

    A name matches a list when it is in the list or the list contains "*".
    How the two matches combine depends on self.rule:

      - ALLOW_DENY: rejected unless allowed, and rejected whenever denied.
      - DENY_ALLOW: rejected only when denied and not also allowed.

    @param str: tag or attribute name
    @type str: string

    @param allow_list: names that are allowed
    @type allow_list: sequence

    @param deny_list: names that are denied
    @type deny_list: sequence

    @return: True if the name must be filtered out, False if it may pass
    """
    is_allow = (str in allow_list) or ("*" in allow_list)
    is_deny = (str in deny_list) or ("*" in deny_list)
    if self.rule == ALLOW_DENY:
        return (not is_allow) or is_deny
    if self.rule == DENY_ALLOW:
        return is_deny and (not is_allow)
    # NOTE(review): an unrecognised rule value rejects nothing; this is
    # presumed to match the original fall-through -- confirm.
    return False
def quote(self, str):
    """
    Escape HTML special characters in a string.

    "&" is replaced first so that the ampersands introduced by the "<"
    and ">" replacements are not escaped a second time.

    @param str: target string
    @type str: string

    @return: copy of str with "&", "<" and ">" replaced by "&amp;",
             "&lt;" and "&gt;"
    """
    str_ret = str.replace("&", "&amp;")
    str_ret = str_ret.replace("<", "&lt;")
    str_ret = str_ret.replace(">", "&gt;")
    return str_ret
146 #alist = ["a", "b", "br", "p", "a:href" ]
148 #filter = HTMLTagFilter(DENY_ALLOW, alist, dlist)
149 #str = """hoge > hoge < hoge<a href="URL" style="<"><b>test</b><br>"""
152 # print "allow-elem:"
153 # for item in filter.allow_elements:
156 # for item in filter.deny_elements:
159 # print "allow-attr:"
160 # for elem in filter.allow_attributes:
161 # for attr in filter.allow_attributes[elem]:
162 # print "%s : %s" % (elem, attr)
164 # for elem in filter.deny_attributes:
165 # for attr in filter.deny_attributes[elem]:
166 # print "%s : %s" % (elem, attr)
170 #print "Input: %s" % str
171 #print filter.apply(str)