77convert_heading_re = re .compile (r'convert_h(\d+)' )
88line_beginning_re = re .compile (r'^' , re .MULTILINE )
99whitespace_re = re .compile (r'[\t ]+' )
10- all_whitespace_re = re .compile (r'[\s]+' )
10+ all_whitespace_re = re .compile (r'[\t \r\n]+' )
11+ newline_whitespace_re = re .compile (r'[\t \r\n]*[\r\n][\t \r\n]*' )
1112html_heading_re = re .compile (r'h[1-6]' )
1213
1314
@@ -66,6 +67,23 @@ def _todict(obj):
6667 return dict ((k , getattr (obj , k )) for k in dir (obj ) if not k .startswith ('_' ))
6768
6869
70+ def should_remove_whitespace_inside (el ):
71+ """Return whether to remove whitespace immediately inside a block-level element."""
72+ if not el or not el .name :
73+ return False
74+ if html_heading_re .match (el .name ) is not None :
75+ return True
76+ return el .name in ('p' , 'blockquote' ,
77+ 'ol' , 'ul' , 'li' ,
78+ 'table' , 'thead' , 'tbody' , 'tfoot' ,
79+ 'tr' , 'td' , 'th' )
80+
81+
82+ def should_remove_whitespace_outside (el ):
83+ """Return whether to remove whitespace immediately outside a block-level element."""
84+ return should_remove_whitespace_inside (el ) or (el and el .name == 'pre' )
85+
86+
6987class MarkdownConverter (object ):
7088 class DefaultOptions :
7189 autolinks = True
@@ -76,7 +94,7 @@ class DefaultOptions:
7694 default_title = False
7795 escape_asterisks = True
7896 escape_underscores = True
79- escape_misc = True
97+ escape_misc = False
8098 heading_style = UNDERLINED
8199 keep_inline_images_in = []
82100 newline_style = SPACES
@@ -119,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
119137 if not children_only and (isHeading or isCell ):
120138 convert_children_as_inline = True
121139
122- # Remove whitespace-only textnodes in purely nested nodes
123- def is_nested_node (el ):
124- return el and el .name in ['ol' , 'ul' , 'li' ,
125- 'table' , 'thead' , 'tbody' , 'tfoot' ,
126- 'tr' , 'td' , 'th' ]
127-
128- if is_nested_node (node ):
129- for el in node .children :
130- # Only extract (remove) whitespace-only text node if any of the
131- # conditions is true:
132- # - el is the first element in its parent
133- # - el is the last element in its parent
134- # - el is adjacent to an nested node
135- can_extract = (not el .previous_sibling
136- or not el .next_sibling
137- or is_nested_node (el .previous_sibling )
138- or is_nested_node (el .next_sibling ))
139- if (isinstance (el , NavigableString )
140- and six .text_type (el ).strip () == ''
141- and can_extract ):
142- el .extract ()
140+ # Remove whitespace-only textnodes just before, after or
141+ # inside block-level elements.
142+ should_remove_inside = should_remove_whitespace_inside (node )
143+ for el in node .children :
144+ # Only extract (remove) whitespace-only text node if any of the
145+ # conditions is true:
146+ # - el is the first element in its parent (block-level)
147+ # - el is the last element in its parent (block-level)
148+ # - el is adjacent to a block-level node
149+ can_extract = (should_remove_inside and (not el .previous_sibling
150+ or not el .next_sibling )
151+ or should_remove_whitespace_outside (el .previous_sibling )
152+ or should_remove_whitespace_outside (el .next_sibling ))
153+ if (isinstance (el , NavigableString )
154+ and six .text_type (el ).strip () == ''
155+ and can_extract ):
156+ el .extract ()
143157
144158 # Convert the children first
145159 for el in node .children :
@@ -148,7 +162,13 @@ def is_nested_node(el):
148162 elif isinstance (el , NavigableString ):
149163 text += self .process_text (el )
150164 else :
151- text += self .process_tag (el , convert_children_as_inline )
165+ text_strip = text .rstrip ('\n ' )
166+ newlines_left = len (text ) - len (text_strip )
167+ next_text = self .process_tag (el , convert_children_as_inline )
168+ next_text_strip = next_text .lstrip ('\n ' )
169+ newlines_right = len (next_text ) - len (next_text_strip )
170+ newlines = '\n ' * max (newlines_left , newlines_right )
171+ text = text_strip + newlines + next_text_strip
152172
153173 if not children_only :
154174 convert_fn = getattr (self , 'convert_%s' % node .name , None )
@@ -162,18 +182,26 @@ def process_text(self, el):
162182
163183 # normalize whitespace if we're not inside a preformatted element
164184 if not el .find_parent ('pre' ):
165- text = whitespace_re .sub (' ' , text )
185+ if self .options ['wrap' ]:
186+ text = all_whitespace_re .sub (' ' , text )
187+ else :
188+ text = newline_whitespace_re .sub ('\n ' , text )
189+ text = whitespace_re .sub (' ' , text )
166190
167191 # escape special characters if we're not inside a preformatted or code element
168192 if not el .find_parent (['pre' , 'code' , 'kbd' , 'samp' ]):
169193 text = self .escape (text )
170194
171- # remove trailing whitespaces if any of the following condition is true:
172- # - current text node is the last node in li
173- # - current text node is followed by an embedded list
174- if (el .parent .name == 'li'
175- and (not el .next_sibling
176- or el .next_sibling .name in ['ul' , 'ol' ])):
195+ # remove leading whitespace at the start or just after a
196+ # block-level element; remove trailing whitespace at the end
197+ # or just before a block-level element.
198+ if (should_remove_whitespace_outside (el .previous_sibling )
199+ or (should_remove_whitespace_inside (el .parent )
200+ and not el .previous_sibling )):
201+ text = text .lstrip ()
202+ if (should_remove_whitespace_outside (el .next_sibling )
203+ or (should_remove_whitespace_inside (el .parent )
204+ and not el .next_sibling )):
177205 text = text .rstrip ()
178206
179207 return text
@@ -208,20 +236,32 @@ def escape(self, text):
208236 if not text :
209237 return ''
210238 if self .options ['escape_misc' ]:
211- text = re .sub (r'([\\&<`[>~#=+|-])' , r'\\\1' , text )
212- text = re .sub (r'([0-9])([.)])' , r'\1\\\2' , text )
239+ text = re .sub (r'([\\&<`[>~=+|])' , r'\\\1' , text )
240+ # A sequence of one or more consecutive '-', preceded and
241+ # followed by whitespace or start/end of fragment, might
242+ # be confused with an underline of a header, or with a
243+ # list marker.
244+ text = re .sub (r'(\s|^)(-+(?:\s|$))' , r'\1\\\2' , text )
245+ # A sequence of up to six consecutive '#', preceded and
246+ # followed by whitespace or start/end of fragment, might
247+ # be confused with an ATX heading.
248+ text = re .sub (r'(\s|^)(#{1,6}(?:\s|$))' , r'\1\\\2' , text )
249+ # '.' or ')' preceded by up to nine digits might be
250+ # confused with a list item.
251+ text = re .sub (r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))' , r'\1\\\2' ,
252+ text )
213253 if self .options ['escape_asterisks' ]:
214254 text = text .replace ('*' , r'\*' )
215255 if self .options ['escape_underscores' ]:
216256 text = text .replace ('_' , r'\_' )
217257 return text
218258
219- def indent (self , text , level ):
220- return line_beginning_re .sub ('\t ' * level , text ) if text else ''
259+ def indent (self , text , columns ):
260+ return line_beginning_re .sub (' ' * columns , text ) if text else ''
221261
222262 def underline (self , text , pad_char ):
223263 text = (text or '' ).rstrip ()
224- return '%s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
264+ return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
225265
226266 def convert_a (self , el , text , convert_as_inline ):
227267 prefix , suffix , text = chomp (text )
@@ -246,7 +286,7 @@ def convert_a(self, el, text, convert_as_inline):
246286 def convert_blockquote (self , el , text , convert_as_inline ):
247287
248288 if convert_as_inline :
249- return text
289+ return ' ' + text . strip () + ' '
250290
251291 return '\n ' + (line_beginning_re .sub ('> ' , text .strip ()) + '\n \n ' ) if text else ''
252292
@@ -280,10 +320,11 @@ def convert_hn(self, n, el, text, convert_as_inline):
280320 if style == UNDERLINED and n <= 2 :
281321 line = '=' if n == 1 else '-'
282322 return self .underline (text , line )
323+ text = all_whitespace_re .sub (' ' , text )
283324 hashes = '#' * n
284325 if style == ATX_CLOSED :
285- return '%s %s %s\n \n ' % (hashes , text , hashes )
286- return '%s %s\n \n ' % (hashes , text )
326+ return '\n %s %s %s\n \n ' % (hashes , text , hashes )
327+ return '\n %s %s\n \n ' % (hashes , text )
287328
288329 def convert_hr (self , el , text , convert_as_inline ):
289330 return '\n \n ---\n \n '
@@ -317,8 +358,8 @@ def convert_list(self, el, text, convert_as_inline):
317358 el = el .parent
318359 if nested :
319360 # remove trailing newline if nested
320- return '\n ' + self . indent ( text , 1 ) .rstrip ()
321- return text + ('\n ' if before_paragraph else '' )
361+ return '\n ' + text .rstrip ()
362+ return ' \n \n ' + text + ('\n ' if before_paragraph else '' )
322363
323364 convert_ul = convert_list
324365 convert_ol = convert_list
@@ -339,17 +380,33 @@ def convert_li(self, el, text, convert_as_inline):
339380 el = el .parent
340381 bullets = self .options ['bullets' ]
341382 bullet = bullets [depth % len (bullets )]
342- return '%s %s\n ' % (bullet , (text or '' ).strip ())
383+ bullet = bullet + ' '
384+ text = (text or '' ).strip ()
385+ text = self .indent (text , len (bullet ))
386+ if text :
387+ text = bullet + text [len (bullet ):]
388+ return '%s\n ' % text
343389
344390 def convert_p (self , el , text , convert_as_inline ):
345391 if convert_as_inline :
346- return text
392+ return ' ' + text . strip () + ' '
347393 if self .options ['wrap' ]:
348- text = fill (text ,
349- width = self .options ['wrap_width' ],
350- break_long_words = False ,
351- break_on_hyphens = False )
352- return '%s\n \n ' % text if text else ''
394+ # Preserve newlines (and preceding whitespace) resulting
395+ # from <br> tags. Newlines in the input have already been
396+ # replaced by spaces.
397+ lines = text .split ('\n ' )
398+ new_lines = []
399+ for line in lines :
400+ line = line .lstrip ()
401+ line_no_trailing = line .rstrip ()
402+ trailing = line [len (line_no_trailing ):]
403+ line = fill (line ,
404+ width = self .options ['wrap_width' ],
405+ break_long_words = False ,
406+ break_on_hyphens = False )
407+ new_lines .append (line + trailing )
408+ text = '\n ' .join (new_lines )
409+ return '\n \n %s\n \n ' % text if text else ''
353410
354411 def convert_pre (self , el , text , convert_as_inline ):
355412 if not text :
0 commit comments