forked from attardi/wikiextractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdropNested.py
74 lines (71 loc) · 2.23 KB
/
dropNested.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
# Extract Template definition
templates = {}
redirects = {}
def dropNested(text, openDelim, closeDelim):
"""
A matching function for nested expressions, e.g. namespaces and tables.
"""
openRE = re.compile(openDelim)
closeRE = re.compile(closeDelim)
# partition text in separate blocks { } { }
spans = [] # pairs (s, e) for each partition
nest = 0 # nesting level
start = openRE.search(text, 0)
if not start:
return text
end = closeRE.search(text, start.end())
next = start
while end:
next = openRE.search(text, next.end())
if not next: # termination
while nest: # close all pending
nest -=1
end0 = closeRE.search(text, end.end())
if end0:
end = end0
else:
break
spans.append((start.start(), end.end()))
break
while end.end() < next.start():
# { } {
if nest:
nest -= 1
# try closing more
last = end.end()
end = closeRE.search(text, end.end())
if not end: # unbalanced
if spans:
span = (spans[0][0], last)
else:
span = (start.start(), last)
spans = [span]
break
else:
spans.append((start.start(), end.end()))
# advance start, find next close
start = next
end = closeRE.search(text, next.end())
break # { }
if next != start:
# { { }
nest += 1
# collect text outside partitions
return dropSpans(spans, text)
def dropSpans(spans, text):
"""
Drop from text the blocks identified in :param spans:, possibly nested.
"""
spans.sort()
res = ''
offset = 0
for s, e in spans:
if offset <= s: # handle nesting
if offset < s:
res += text[offset:s]
offset = e
res += text[offset:]
return res