 import re
 from collections import defaultdict

-import mwparserfromhell as mwp
 from more_itertools import peekable

 from ..identifier import Identifier

-DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')
-
-def extract_regex(text):
-    for match in DOI_RE.finditer(text):
-        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
-        yield Identifier("doi", id)

 DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')

 HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-             'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
-             'pre']
+             'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
+             'pre']

 TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + ')(\s[^>\n\r]+)?>', re.I)

+'''
+DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')

+def extract_regex(text):
+    for match in DOI_RE.finditer(text):
+        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
+        yield Identifier("doi", id)
+
+import mwparserfromhell as mwp
 def extract_mwp(text):
     no_tags = mwp.parse(text).strip_code()
     for match in DOI_RE.finditer(no_tags):
         id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
         yield Identifier("doi", id)
+'''

 LEXICON = [
     (DOI_START_RE.pattern, 'doi_start'),
@@ -53,21 +55,21 @@ def extract_mwp(text):
 def extract_island(text):
     tokens = tokenize_finditer(text, LEXICON)
     tokens = peekable(tokens)
-
+
     while tokens.peek(None) is not None:
-
+
         if tokens.peek()[0] == 'doi_start':
             yield ('doi', read_doi(tokens))
-
+
         next(tokens)


 def tokenize_finditer(text, lexicon=LEXICON):
     pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                        for pattern, name in lexicon)
-
+
     group_regex = re.compile(pattern, re.I | re.U | re.M)
-
+
     for match in group_regex.finditer(text):
         yield match.lastgroup, match.group(0)

@@ -84,14 +86,14 @@ def tokenize_scanner(text, lexicon=LEXICON):

 def read_doi(tokens):
     assert tokens.peek()[0] == 'doi_start'
-
+
     depth = defaultdict(lambda: 0)
-
+
     doi_buffer = [next(tokens)[1]]
-
+
     while tokens.peek(None) is not None:
         name, match = tokens.peek()
-
+
         if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe',
                     'comment_start', 'comment_end'):
             break
@@ -115,8 +117,8 @@ def read_doi(tokens):
                 break
         else:
             doi_buffer.append(next(tokens)[1])
-
-
+
+
     # Do not return a doi with punctuation at the end
     return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer))

@@ -125,16 +127,16 @@ def read_doi(tokens):
 def tokenize_search(text, start, lexicon=LEXICON):
     pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                        for pattern, name in lexicon)
-
+
     group_regex = re.compile(pattern, re.I | re.U)
-
+
     match = group_regex.search(text, start)
     while match is not None:
         yield match.lastgroup, match.group(0)
         match = group_regex.search(text, match.span()[1])

 def extract_search(text, lexicon=LEXICON):
-
+
     last_end = 0
     for match in DOI_START_RE.finditer(text):
         if match.span()[0] > last_end:
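Below is a minimal usage sketch of the island parser kept by this commit. It is not part of the diff: the import path, the sample wikitext, and the variable names are assumptions for illustration only.

    # Hypothetical import path; adjust to wherever this module lives in its package.
    from extractors import doi

    sample = "See [http://dx.doi.org/10.1000/182 doi:10.1000/182] for details."

    # extract_island() tokenizes the wikitext against LEXICON and, whenever it
    # peeks a 'doi_start' token, hands the token stream to read_doi(), yielding
    # ('doi', '<matched DOI>') tuples.
    for name, value in doi.extract_island(sample):
        print(name, value)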