forked from attardi/wikiextractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompact.py
59 lines (52 loc) · 1.62 KB
/
compact.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
# Section header pattern: a run of two or more '=' (group 1), then the title
# with surrounding whitespace trimmed (group 2), then the same run of '='
# again (backreference \1).
# A single '=' (level 1) is deliberately not matched: it is the page name level.
section = re.compile(r'(==+)\s*(.*?)\s*\1')
def appendSpace(title):
    """Return *title* terminated by a single trailing space.

    A space is appended only when the title is non-empty and does not
    already end with one, so titles can be concatenated directly with the
    text that follows them. Empty (or otherwise falsy) values are returned
    unchanged.

    :param title: the title text.
    :return: the title, ending in a space unless it was empty.
    """
    if title and not title.endswith(' '):
        title += ' '
    return title
def compact(text):
    """Flatten cleaned wiki text into a single string.

    The text is processed line by line:

    * empty lines are skipped;
    * section headers (lines matched by the module-level ``section``
      pattern) contribute their title, terminated by a space;
    * page titles (lines starting with ``++``) contribute the text between
      the leading ``++`` and an optional trailing ``++``, terminated by a
      space;
    * lines starting with ``:`` are kept with any leading ``:``, ``*``,
      ``#`` and ``;`` characters removed;
    * lines starting with ``{`` or ``|``, lines ending with ``}``, lines
      fully wrapped in parentheses and lines consisting only of ``.`` and
      ``-`` are dropped;
    * every other line is kept verbatim.

    The kept pieces are concatenated without any separator. Section titles
    are emitted as soon as they are seen; sections without a body are not
    filtered out.

    :param text: newline-separated text to compact.
    :return: the compacted text as one string.
    """
    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = []
    for line in text.split('\n'):
        if not line:
            continue

        # Handle section titles
        header = section.match(line)
        if header:
            pieces.append(appendSpace(header.group(2)))
            continue

        if line.startswith('++'):
            # Handle page title: strip the leading marker, then the trailing
            # one from what is left (slicing the original line again would
            # bring the leading '++' back).
            title = line[2:]
            if title.endswith('++'):
                title = title[:-2]
            pieces.append(appendSpace(title))
        elif line[0] == ':':
            # Handle indents
            pieces.append(line.lstrip(':*#;'))
        elif line[0] in '{|' or line[-1] == '}':
            # Drop residuals of lists/tables
            continue
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            # Drop irrelevant lines
            continue
        else:
            pieces.append(line)
    return ''.join(pieces)