root / env / lib / python2.7 / site-packages / django / utils / regex_helper.py @ 1a305335
History | View | Annotate | Download (12.3 KB)
1 |
"""
|
---|---|
2 |
Functions for reversing a regular expression (used in reverse URL resolving).
|
3 |
Used internally by Django and not intended for external use.
|
4 |
|
5 |
This is not, and is not intended to be, a complete reg-exp decompiler. It
|
6 |
should be good enough for a large class of URLS, however.
|
7 |
"""
|
8 |
|
9 |
# Mapping of an escape character to a representative of that class. So, e.g.,
|
10 |
# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore
|
11 |
# this sequence. Any missing key is mapped to itself.
|
12 |
ESCAPE_MAPPINGS = { |
13 |
"A": None, |
14 |
"b": None, |
15 |
"B": None, |
16 |
"d": u"0", |
17 |
"D": u"x", |
18 |
"s": u" ", |
19 |
"S": u"x", |
20 |
"w": u"x", |
21 |
"W": u"!", |
22 |
"Z": None, |
23 |
} |
24 |
|
25 |
class Choice(list): |
26 |
"""
|
27 |
Used to represent multiple possibilities at this point in a pattern string.
|
28 |
We use a distinguished type, rather than a list, so that the usage in the
|
29 |
code is clear.
|
30 |
"""
|
31 |
|
32 |
class Group(list): |
33 |
"""
|
34 |
Used to represent a capturing group in the pattern string.
|
35 |
"""
|
36 |
|
37 |
class NonCapture(list): |
38 |
"""
|
39 |
Used to represent a non-capturing group in the pattern string.
|
40 |
"""
|
41 |
|
42 |
def normalize(pattern): |
43 |
"""
|
44 |
Given a reg-exp pattern, normalizes it to a list of forms that suffice for
|
45 |
reverse matching. This does the following:
|
46 |
|
47 |
(1) For any repeating sections, keeps the minimum number of occurrences
|
48 |
permitted (this means zero for optional groups).
|
49 |
(2) If an optional group includes parameters, include one occurrence of
|
50 |
that group (along with the zero occurrence case from step (1)).
|
51 |
(3) Select the first (essentially an arbitrary) element from any character
|
52 |
class. Select an arbitrary character for any unordered class (e.g. '.'
|
53 |
or '\w') in the pattern.
|
54 |
(5) Ignore comments and any of the reg-exp flags that won't change
|
55 |
what we construct ("iLmsu"). "(?x)" is an error, however.
|
56 |
(6) Raise an error on all other non-capturing (?...) forms (e.g.
|
57 |
look-ahead and look-behind matches) and any disjunctive ('|')
|
58 |
constructs.
|
59 |
|
60 |
Django's URLs for forward resolving are either all positional arguments or
|
61 |
all keyword arguments. That is assumed here, as well. Although reverse
|
62 |
resolving can be done using positional args when keyword args are
|
63 |
specified, the two cannot be mixed in the same reverse() call.
|
64 |
"""
|
65 |
# Do a linear scan to work out the special features of this pattern. The
|
66 |
# idea is that we scan once here and collect all the information we need to
|
67 |
# make future decisions.
|
68 |
result = [] |
69 |
non_capturing_groups = [] |
70 |
consume_next = True
|
71 |
pattern_iter = next_char(iter(pattern))
|
72 |
num_args = 0
|
73 |
|
74 |
# A "while" loop is used here because later on we need to be able to peek
|
75 |
# at the next character and possibly go around without consuming another
|
76 |
# one at the top of the loop.
|
77 |
try:
|
78 |
ch, escaped = pattern_iter.next() |
79 |
except StopIteration: |
80 |
return zip([u''], [[]]) |
81 |
|
82 |
try:
|
83 |
while True: |
84 |
if escaped:
|
85 |
result.append(ch) |
86 |
elif ch == '.': |
87 |
# Replace "any character" with an arbitrary representative.
|
88 |
result.append(u".")
|
89 |
elif ch == '|': |
90 |
# FIXME: One day we'll should do this, but not in 1.0.
|
91 |
raise NotImplementedError |
92 |
elif ch == "^": |
93 |
pass
|
94 |
elif ch == '$': |
95 |
break
|
96 |
elif ch == ')': |
97 |
# This can only be the end of a non-capturing group, since all
|
98 |
# other unescaped parentheses are handled by the grouping
|
99 |
# section later (and the full group is handled there).
|
100 |
#
|
101 |
# We regroup everything inside the capturing group so that it
|
102 |
# can be quantified, if necessary.
|
103 |
start = non_capturing_groups.pop() |
104 |
inner = NonCapture(result[start:]) |
105 |
result = result[:start] + [inner] |
106 |
elif ch == '[': |
107 |
# Replace ranges with the first character in the range.
|
108 |
ch, escaped = pattern_iter.next() |
109 |
result.append(ch) |
110 |
ch, escaped = pattern_iter.next() |
111 |
while escaped or ch != ']': |
112 |
ch, escaped = pattern_iter.next() |
113 |
elif ch == '(': |
114 |
# Some kind of group.
|
115 |
ch, escaped = pattern_iter.next() |
116 |
if ch != '?' or escaped: |
117 |
# A positional group
|
118 |
name = "_%d" % num_args
|
119 |
num_args += 1
|
120 |
result.append(Group(((u"%%(%s)s" % name), name)))
|
121 |
walk_to_end(ch, pattern_iter) |
122 |
else:
|
123 |
ch, escaped = pattern_iter.next() |
124 |
if ch in "iLmsu#": |
125 |
# All of these are ignorable. Walk to the end of the
|
126 |
# group.
|
127 |
walk_to_end(ch, pattern_iter) |
128 |
elif ch == ':': |
129 |
# Non-capturing group
|
130 |
non_capturing_groups.append(len(result))
|
131 |
elif ch != 'P': |
132 |
# Anything else, other than a named group, is something
|
133 |
# we cannot reverse.
|
134 |
raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch) |
135 |
else:
|
136 |
ch, escaped = pattern_iter.next() |
137 |
if ch not in ('<', '='): |
138 |
raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch) |
139 |
# We are in a named capturing group. Extra the name and
|
140 |
# then skip to the end.
|
141 |
if ch == '<': |
142 |
terminal_char = '>'
|
143 |
# We are in a named backreference.
|
144 |
else:
|
145 |
terminal_char = ')'
|
146 |
name = [] |
147 |
ch, escaped = pattern_iter.next() |
148 |
while ch != terminal_char:
|
149 |
name.append(ch) |
150 |
ch, escaped = pattern_iter.next() |
151 |
param = ''.join(name)
|
152 |
# Named backreferences have already consumed the
|
153 |
# parenthesis.
|
154 |
if terminal_char != ')': |
155 |
result.append(Group(((u"%%(%s)s" % param), param)))
|
156 |
walk_to_end(ch, pattern_iter) |
157 |
else:
|
158 |
result.append(Group(((u"%%(%s)s" % param), None))) |
159 |
elif ch in "*?+{": |
160 |
# Quanitifers affect the previous item in the result list.
|
161 |
count, ch = get_quantifier(ch, pattern_iter) |
162 |
if ch:
|
163 |
# We had to look ahead, but it wasn't need to compute the
|
164 |
# quanitifer, so use this character next time around the
|
165 |
# main loop.
|
166 |
consume_next = False
|
167 |
|
168 |
if count == 0: |
169 |
if contains(result[-1], Group): |
170 |
# If we are quantifying a capturing group (or
|
171 |
# something containing such a group) and the minimum is
|
172 |
# zero, we must also handle the case of one occurrence
|
173 |
# being present. All the quantifiers (except {0,0},
|
174 |
# which we conveniently ignore) that have a 0 minimum
|
175 |
# also allow a single occurrence.
|
176 |
result[-1] = Choice([None, result[-1]]) |
177 |
else:
|
178 |
result.pop() |
179 |
elif count > 1: |
180 |
result.extend([result[-1]] * (count - 1)) |
181 |
else:
|
182 |
# Anything else is a literal.
|
183 |
result.append(ch) |
184 |
|
185 |
if consume_next:
|
186 |
ch, escaped = pattern_iter.next() |
187 |
else:
|
188 |
consume_next = True
|
189 |
except StopIteration: |
190 |
pass
|
191 |
except NotImplementedError: |
192 |
# A case of using the disjunctive form. No results for you!
|
193 |
return zip([u''], [[]]) |
194 |
|
195 |
return zip(*flatten_result(result)) |
196 |
|
197 |
def next_char(input_iter): |
198 |
"""
|
199 |
An iterator that yields the next character from "pattern_iter", respecting
|
200 |
escape sequences. An escaped character is replaced by a representative of
|
201 |
its class (e.g. \w -> "x"). If the escaped character is one that is
|
202 |
skipped, it is not returned (the next character is returned instead).
|
203 |
|
204 |
Yields the next character, along with a boolean indicating whether it is a
|
205 |
raw (unescaped) character or not.
|
206 |
"""
|
207 |
for ch in input_iter: |
208 |
if ch != '\\': |
209 |
yield ch, False |
210 |
continue
|
211 |
ch = input_iter.next() |
212 |
representative = ESCAPE_MAPPINGS.get(ch, ch) |
213 |
if representative is None: |
214 |
continue
|
215 |
yield representative, True |
216 |
|
217 |
def walk_to_end(ch, input_iter): |
218 |
"""
|
219 |
The iterator is currently inside a capturing group. We want to walk to the
|
220 |
close of this group, skipping over any nested groups and handling escaped
|
221 |
parentheses correctly.
|
222 |
"""
|
223 |
if ch == '(': |
224 |
nesting = 1
|
225 |
else:
|
226 |
nesting = 0
|
227 |
for ch, escaped in input_iter: |
228 |
if escaped:
|
229 |
continue
|
230 |
elif ch == '(': |
231 |
nesting += 1
|
232 |
elif ch == ')': |
233 |
if not nesting: |
234 |
return
|
235 |
nesting -= 1
|
236 |
|
237 |
def get_quantifier(ch, input_iter): |
238 |
"""
|
239 |
Parse a quantifier from the input, where "ch" is the first character in the
|
240 |
quantifier.
|
241 |
|
242 |
Returns the minimum number of occurences permitted by the quantifier and
|
243 |
either None or the next character from the input_iter if the next character
|
244 |
is not part of the quantifier.
|
245 |
"""
|
246 |
if ch in '*?+': |
247 |
try:
|
248 |
ch2, escaped = input_iter.next() |
249 |
except StopIteration: |
250 |
ch2 = None
|
251 |
if ch2 == '?': |
252 |
ch2 = None
|
253 |
if ch == '+': |
254 |
return 1, ch2 |
255 |
return 0, ch2 |
256 |
|
257 |
quant = [] |
258 |
while ch != '}': |
259 |
ch, escaped = input_iter.next() |
260 |
quant.append(ch) |
261 |
quant = quant[:-1]
|
262 |
values = ''.join(quant).split(',') |
263 |
|
264 |
# Consume the trailing '?', if necessary.
|
265 |
try:
|
266 |
ch, escaped = input_iter.next() |
267 |
except StopIteration: |
268 |
ch = None
|
269 |
if ch == '?': |
270 |
ch = None
|
271 |
return int(values[0]), ch |
272 |
|
273 |
def contains(source, inst): |
274 |
"""
|
275 |
Returns True if the "source" contains an instance of "inst". False,
|
276 |
otherwise.
|
277 |
"""
|
278 |
if isinstance(source, inst): |
279 |
return True |
280 |
if isinstance(source, NonCapture): |
281 |
for elt in source: |
282 |
if contains(elt, inst):
|
283 |
return True |
284 |
return False |
285 |
|
286 |
def flatten_result(source): |
287 |
"""
|
288 |
Turns the given source sequence into a list of reg-exp possibilities and
|
289 |
their arguments. Returns a list of strings and a list of argument lists.
|
290 |
Each of the two lists will be of the same length.
|
291 |
"""
|
292 |
if source is None: |
293 |
return [u''], [[]] |
294 |
if isinstance(source, Group): |
295 |
if source[1] is None: |
296 |
params = [] |
297 |
else:
|
298 |
params = [source[1]]
|
299 |
return [source[0]], [params] |
300 |
result = [u'']
|
301 |
result_args = [[]] |
302 |
pos = last = 0
|
303 |
for pos, elt in enumerate(source): |
304 |
if isinstance(elt, basestring): |
305 |
continue
|
306 |
piece = u''.join(source[last:pos])
|
307 |
if isinstance(elt, Group): |
308 |
piece += elt[0]
|
309 |
param = elt[1]
|
310 |
else:
|
311 |
param = None
|
312 |
last = pos + 1
|
313 |
for i in range(len(result)): |
314 |
result[i] += piece |
315 |
if param:
|
316 |
result_args[i].append(param) |
317 |
if isinstance(elt, (Choice, NonCapture)): |
318 |
if isinstance(elt, NonCapture): |
319 |
elt = [elt] |
320 |
inner_result, inner_args = [], [] |
321 |
for item in elt: |
322 |
res, args = flatten_result(item) |
323 |
inner_result.extend(res) |
324 |
inner_args.extend(args) |
325 |
new_result = [] |
326 |
new_args = [] |
327 |
for item, args in zip(result, result_args): |
328 |
for i_item, i_args in zip(inner_result, inner_args): |
329 |
new_result.append(item + i_item) |
330 |
new_args.append(args[:] + i_args) |
331 |
result = new_result |
332 |
result_args = new_args |
333 |
if pos >= last:
|
334 |
piece = u''.join(source[last:])
|
335 |
for i in range(len(result)): |
336 |
result[i] += piece |
337 |
return result, result_args
|
338 |
|