Project

General

Profile

Statistics
| Branch: | Revision:

root / env / lib / python2.7 / site-packages / django / utils / regex_helper.py @ 1a305335

History | View | Annotate | Download (12.3 KB)

1
"""
2
Functions for reversing a regular expression (used in reverse URL resolving).
3
Used internally by Django and not intended for external use.
4

5
This is not, and is not intended to be, a complete reg-exp decompiler. It
6
should be good enough for a large class of URLS, however.
7
"""
8

    
9
# Mapping of an escape character to a representative of that class. So, e.g.,
10
# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore
11
# this sequence. Any missing key is mapped to itself.
12
ESCAPE_MAPPINGS = {
13
    "A": None,
14
    "b": None,
15
    "B": None,
16
    "d": u"0",
17
    "D": u"x",
18
    "s": u" ",
19
    "S": u"x",
20
    "w": u"x",
21
    "W": u"!",
22
    "Z": None,
23
}
24

    
25
class Choice(list):
26
    """
27
    Used to represent multiple possibilities at this point in a pattern string.
28
    We use a distinguished type, rather than a list, so that the usage in the
29
    code is clear.
30
    """
31

    
32
class Group(list):
33
    """
34
    Used to represent a capturing group in the pattern string.
35
    """
36

    
37
class NonCapture(list):
38
    """
39
    Used to represent a non-capturing group in the pattern string.
40
    """
41

    
42
def normalize(pattern):
43
    """
44
    Given a reg-exp pattern, normalizes it to a list of forms that suffice for
45
    reverse matching. This does the following:
46

47
    (1) For any repeating sections, keeps the minimum number of occurrences
48
        permitted (this means zero for optional groups).
49
    (2) If an optional group includes parameters, include one occurrence of
50
        that group (along with the zero occurrence case from step (1)).
51
    (3) Select the first (essentially an arbitrary) element from any character
52
        class. Select an arbitrary character for any unordered class (e.g. '.'
53
        or '\w') in the pattern.
54
    (5) Ignore comments and any of the reg-exp flags that won't change
55
        what we construct ("iLmsu"). "(?x)" is an error, however.
56
    (6) Raise an error on all other non-capturing (?...) forms (e.g.
57
        look-ahead and look-behind matches) and any disjunctive ('|')
58
        constructs.
59

60
    Django's URLs for forward resolving are either all positional arguments or
61
    all keyword arguments. That is assumed here, as well. Although reverse
62
    resolving can be done using positional args when keyword args are
63
    specified, the two cannot be mixed in the same reverse() call.
64
    """
65
    # Do a linear scan to work out the special features of this pattern. The
66
    # idea is that we scan once here and collect all the information we need to
67
    # make future decisions.
68
    result = []
69
    non_capturing_groups = []
70
    consume_next = True
71
    pattern_iter = next_char(iter(pattern))
72
    num_args = 0
73

    
74
    # A "while" loop is used here because later on we need to be able to peek
75
    # at the next character and possibly go around without consuming another
76
    # one at the top of the loop.
77
    try:
78
        ch, escaped = pattern_iter.next()
79
    except StopIteration:
80
        return zip([u''],  [[]])
81

    
82
    try:
83
        while True:
84
            if escaped:
85
                result.append(ch)
86
            elif ch == '.':
87
                # Replace "any character" with an arbitrary representative.
88
                result.append(u".")
89
            elif ch == '|':
90
                # FIXME: One day we'll should do this, but not in 1.0.
91
                raise NotImplementedError
92
            elif ch == "^":
93
                pass
94
            elif ch == '$':
95
                break
96
            elif ch == ')':
97
                # This can only be the end of a non-capturing group, since all
98
                # other unescaped parentheses are handled by the grouping
99
                # section later (and the full group is handled there).
100
                #
101
                # We regroup everything inside the capturing group so that it
102
                # can be quantified, if necessary.
103
                start = non_capturing_groups.pop()
104
                inner = NonCapture(result[start:])
105
                result = result[:start] + [inner]
106
            elif ch == '[':
107
                # Replace ranges with the first character in the range.
108
                ch, escaped = pattern_iter.next()
109
                result.append(ch)
110
                ch, escaped = pattern_iter.next()
111
                while escaped or ch != ']':
112
                    ch, escaped = pattern_iter.next()
113
            elif ch == '(':
114
                # Some kind of group.
115
                ch, escaped = pattern_iter.next()
116
                if ch != '?' or escaped:
117
                    # A positional group
118
                    name = "_%d" % num_args
119
                    num_args += 1
120
                    result.append(Group(((u"%%(%s)s" % name), name)))
121
                    walk_to_end(ch, pattern_iter)
122
                else:
123
                    ch, escaped = pattern_iter.next()
124
                    if ch in "iLmsu#":
125
                        # All of these are ignorable. Walk to the end of the
126
                        # group.
127
                        walk_to_end(ch, pattern_iter)
128
                    elif ch == ':':
129
                        # Non-capturing group
130
                        non_capturing_groups.append(len(result))
131
                    elif ch != 'P':
132
                        # Anything else, other than a named group, is something
133
                        # we cannot reverse.
134
                        raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch)
135
                    else:
136
                        ch, escaped = pattern_iter.next()
137
                        if ch not in ('<', '='):
138
                            raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch)
139
                        # We are in a named capturing group. Extra the name and
140
                        # then skip to the end.
141
                        if ch == '<':
142
                            terminal_char = '>'
143
                        # We are in a named backreference.
144
                        else:
145
                            terminal_char = ')'
146
                        name = []
147
                        ch, escaped = pattern_iter.next()
148
                        while ch != terminal_char:
149
                            name.append(ch)
150
                            ch, escaped = pattern_iter.next()
151
                        param = ''.join(name)
152
                        # Named backreferences have already consumed the
153
                        # parenthesis.
154
                        if terminal_char != ')':
155
                            result.append(Group(((u"%%(%s)s" % param), param)))
156
                            walk_to_end(ch, pattern_iter)
157
                        else:
158
                            result.append(Group(((u"%%(%s)s" % param), None)))
159
            elif ch in "*?+{":
160
                # Quanitifers affect the previous item in the result list.
161
                count, ch = get_quantifier(ch, pattern_iter)
162
                if ch:
163
                    # We had to look ahead, but it wasn't need to compute the
164
                    # quanitifer, so use this character next time around the
165
                    # main loop.
166
                    consume_next = False
167

    
168
                if count == 0:
169
                    if contains(result[-1], Group):
170
                        # If we are quantifying a capturing group (or
171
                        # something containing such a group) and the minimum is
172
                        # zero, we must also handle the case of one occurrence
173
                        # being present. All the quantifiers (except {0,0},
174
                        # which we conveniently ignore) that have a 0 minimum
175
                        # also allow a single occurrence.
176
                        result[-1] = Choice([None, result[-1]])
177
                    else:
178
                        result.pop()
179
                elif count > 1:
180
                    result.extend([result[-1]] * (count - 1))
181
            else:
182
                # Anything else is a literal.
183
                result.append(ch)
184

    
185
            if consume_next:
186
                ch, escaped = pattern_iter.next()
187
            else:
188
                consume_next = True
189
    except StopIteration:
190
        pass
191
    except NotImplementedError:
192
        # A case of using the disjunctive form. No results for you!
193
        return zip([u''],  [[]])
194

    
195
    return zip(*flatten_result(result))
196

    
197
def next_char(input_iter):
198
    """
199
    An iterator that yields the next character from "pattern_iter", respecting
200
    escape sequences. An escaped character is replaced by a representative of
201
    its class (e.g. \w -> "x"). If the escaped character is one that is
202
    skipped, it is not returned (the next character is returned instead).
203

204
    Yields the next character, along with a boolean indicating whether it is a
205
    raw (unescaped) character or not.
206
    """
207
    for ch in input_iter:
208
        if ch != '\\':
209
            yield ch, False
210
            continue
211
        ch = input_iter.next()
212
        representative = ESCAPE_MAPPINGS.get(ch, ch)
213
        if representative is None:
214
            continue
215
        yield representative, True
216

    
217
def walk_to_end(ch, input_iter):
218
    """
219
    The iterator is currently inside a capturing group. We want to walk to the
220
    close of this group, skipping over any nested groups and handling escaped
221
    parentheses correctly.
222
    """
223
    if ch == '(':
224
        nesting = 1
225
    else:
226
        nesting = 0
227
    for ch, escaped in input_iter:
228
        if escaped:
229
            continue
230
        elif ch == '(':
231
            nesting += 1
232
        elif ch == ')':
233
            if not nesting:
234
                return
235
            nesting -= 1
236

    
237
def get_quantifier(ch, input_iter):
238
    """
239
    Parse a quantifier from the input, where "ch" is the first character in the
240
    quantifier.
241

242
    Returns the minimum number of occurences permitted by the quantifier and
243
    either None or the next character from the input_iter if the next character
244
    is not part of the quantifier.
245
    """
246
    if ch in '*?+':
247
        try:
248
            ch2, escaped = input_iter.next()
249
        except StopIteration:
250
            ch2 = None
251
        if ch2 == '?':
252
            ch2 = None
253
        if ch == '+':
254
            return 1, ch2
255
        return 0, ch2
256

    
257
    quant = []
258
    while ch != '}':
259
        ch, escaped = input_iter.next()
260
        quant.append(ch)
261
    quant = quant[:-1]
262
    values = ''.join(quant).split(',')
263

    
264
    # Consume the trailing '?', if necessary.
265
    try:
266
        ch, escaped = input_iter.next()
267
    except StopIteration:
268
        ch = None
269
    if ch == '?':
270
        ch = None
271
    return int(values[0]), ch
272

    
273
def contains(source, inst):
274
    """
275
    Returns True if the "source" contains an instance of "inst". False,
276
    otherwise.
277
    """
278
    if isinstance(source, inst):
279
        return True
280
    if isinstance(source, NonCapture):
281
        for elt in source:
282
            if contains(elt, inst):
283
                return True
284
    return False
285

    
286
def flatten_result(source):
287
    """
288
    Turns the given source sequence into a list of reg-exp possibilities and
289
    their arguments. Returns a list of strings and a list of argument lists.
290
    Each of the two lists will be of the same length.
291
    """
292
    if source is None:
293
        return [u''], [[]]
294
    if isinstance(source, Group):
295
        if source[1] is None:
296
            params = []
297
        else:
298
            params = [source[1]]
299
        return [source[0]], [params]
300
    result = [u'']
301
    result_args = [[]]
302
    pos = last = 0
303
    for pos, elt in enumerate(source):
304
        if isinstance(elt, basestring):
305
            continue
306
        piece = u''.join(source[last:pos])
307
        if isinstance(elt, Group):
308
            piece += elt[0]
309
            param = elt[1]
310
        else:
311
            param = None
312
        last = pos + 1
313
        for i in range(len(result)):
314
            result[i] += piece
315
            if param:
316
                result_args[i].append(param)
317
        if isinstance(elt, (Choice, NonCapture)):
318
            if isinstance(elt, NonCapture):
319
                elt = [elt]
320
            inner_result, inner_args = [], []
321
            for item in elt:
322
                res, args = flatten_result(item)
323
                inner_result.extend(res)
324
                inner_args.extend(args)
325
            new_result = []
326
            new_args = []
327
            for item, args in zip(result, result_args):
328
                for i_item, i_args in zip(inner_result, inner_args):
329
                    new_result.append(item + i_item)
330
                    new_args.append(args[:] + i_args)
331
            result = new_result
332
            result_args = new_args
333
    if pos >= last:
334
        piece = u''.join(source[last:])
335
        for i in range(len(result)):
336
            result[i] += piece
337
    return result, result_args
338