-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathimport_wikipedia_test.py
More file actions
123 lines (100 loc) · 4.34 KB
/
import_wikipedia_test.py
File metadata and controls
123 lines (100 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
import unittest
import xml
import mwparserfromhell
import re
from import_wikipedia import WikiXmlHandler, extact_general, parse_coordinate
# Test fixture: a small MediaWiki XML export (export-0.10 schema) holding a
# <siteinfo> header and two <page> elements:
#   1. "AccessibleComputing" - a redirect page whose text carries a {{Redr}}
#      template.
#   2. "Anarchism" - an article page whose text carries several templates
#      (including a {{coord}} template), an HTML-like comment marker and four
#      [[Category:...]] links.
# test_parse_wikipedia feeds this string to a SAX parser one line at a time.
DUMP = """<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
<base>https://en.wikipedia.org/wiki/Main_Page</base>
<generator>MediaWiki 1.28.0-wmf.15</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2" case="first-letter">Media</namespace>
</namespaces>
</siteinfo>
<page>
<title>AccessibleComputing</title>
<ns>0</ns>
<id>10</id>
<redirect title="Computer accessibility" />
<revision>
<id>631144794</id>
<parentid>381202555</parentid>
<timestamp>2014-10-26T04:50:23Z</timestamp>
<contributor>
<username>Paine Ellsworth</username>
<id>9092818</id>
</contributor>
<comment>add [[WP:RCAT|rcat]]s</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text xml:space="preserve">#REDIRECT [[Computer accessibility]]
{{Redr|move|from CamelCase|up}}</text>
<sha1>4ro7vvppa5kmm0o1egfjztzcwd0vabw</sha1>
</revision>
</page>
<page>
<title>Anarchism</title>
<ns>0</ns>
<id>12</id>
<revision>
<id>734566960</id>
<timestamp>2016-08-15T06:01:51Z</timestamp>
<model>wikitext</model>
<format>text/x-wiki</format>
<text xml:space="preserve">{{Redirect2|Anarchist|Anarchists|the fictional character|Anarchist (comics)|other uses|Anarchists (disambiguation)}}
{{Basic forms of government}}
'''Anarchism''' is a [[political philosophy]] that advocates [[self-governance|self-governed]] societies based on voluntary institutions. These are often described
<--This is a *citation* from a book, DON'T CHANGE-->
===First International and the Paris Commune===
{{coord|52|22|N|4|54|E|region:NL|display=inline,title}}
{{Main article|International Workingmen's Association|Paris Commune}}
[[File:Bakunin.png|thumb|upright|Collectivist anarchist [[Mikhail Bakunin]] opposed the
[[Category:Anti-fascism]]
[[Category:Ideas of idealists]]
[[Category:Anti-capitalism]]
[[Category:Far-left politics]]</text>
<sha1>az60vahaazg403faw6x2gzpbmiws0o3</sha1>
</revision>
</page>
</mediawiki>"""
# Matches the first parenthesised, non-empty group in a string and captures
# its contents (group 1). FakeCursor uses it to pull the column list out of
# the SQL text it is handed.
# Written as a raw string so the backslashes reach the regex engine untouched;
# in a plain string "\(" is an invalid escape sequence and triggers a warning
# on modern Python versions.
RE_PAR = re.compile(r'\(([^\)]+)\)')
class FakeCursor:
    """Minimal stand-in for a database cursor that records executed statements.

    Every call to execute() is turned into a dict mapping the names found in
    the first parenthesised group of the SQL text to the supplied parameters,
    and that dict is appended to ``results``.
    """

    def __init__(self):
        # One dict per execute() call, in call order.
        self.results = []

    def execute(self, sql, params):
        """Record one statement as a {name: parameter} dict.

        The names come from the first "(...)" group in ``sql``, split on
        commas and stripped of surrounding whitespace; they are paired
        positionally with ``params``.
        """
        column_list = RE_PAR.search(sql).group(1)
        names = [name.strip() for name in column_list.split(',')]
        row = dict(zip(names, params))
        self.results.append(row)
class TestImportWikipedia(unittest.TestCase):
    """Tests for the helpers imported from import_wikipedia.

    Covers the SAX content handler (WikiXmlHandler), extact_general and
    parse_coordinate.
    """

    def test_parse_wikipedia(self):
        """Feed DUMP through WikiXmlHandler and check what reaches the cursor."""
        # A bare `import xml` does not load the `xml.sax` submodule; import it
        # explicitly so this test does not depend on some other module having
        # imported it first.
        import xml.sax

        parser = xml.sax.make_parser()
        fc = FakeCursor()
        parser.setContentHandler(WikiXmlHandler(fc))
        # Feed the document incrementally, one line per feed() call.
        for line in DUMP.split('\n'):
            parser.feed(line + '\n')

        # DUMP holds two <page> elements; one recorded statement is expected
        # for each of them.
        self.assertEqual(len(fc.results), 2)
        self.assertEqual(fc.results[0]['title'], 'AccessibleComputing')
        self.assertIn('redr', fc.results[0]['templates'])
        self.assertIn(
            "<--This is a *citation* from a book, DON'T CHANGE-->",
            fc.results[1]['wikitext'],
        )
        self.assertIn('main article', fc.results[1]['templates'])
        self.assertIn('ideas', fc.results[1]['general'])

    def test_extract_general(self):
        """Check extact_general on a few category-like phrases."""
        self.assertIsNone(extact_general('something something dark'))
        self.assertEqual(extact_general('the streets of philadelpha'), 'the streets')
        self.assertEqual(extact_general('paintings by dutch potato eaters'), 'paintings')
        self.assertEqual(extact_general('Cities in trouble'), 'Cities')

    def assertCoos(self, fragment, lat, lng):
        """Assert that the first template in ``fragment`` parses to (lat, lng).

        ``fragment`` is wikitext; it is parsed with mwparserfromhell and its
        first template is handed to parse_coordinate. Both coordinates are
        compared to three decimal places.
        """
        wikicode = mwparserfromhell.parse(fragment)
        template = wikicode.filter_templates()[0]
        lat1, lng1 = parse_coordinate(template)
        self.assertAlmostEqual(lat, lat1, 3)
        self.assertAlmostEqual(lng, lng1, 3)

    def test_extract_coordinates(self):
        """Check parse_coordinate on degree/minute and degree-only templates."""
        self.assertCoos('{{coord|38|42|N|9|11|W|source:eswiki_type:country|display=title}}', 38.7, -9.183333)
        self.assertCoos('{{Coord|31|35|type:country|display=title}}', 31, 35)
# Run the test suite when this file is executed as a script (not on import).
if __name__ == "__main__":
    unittest.main()