1111re_all_whitespace = re .compile (r'[\t \r\n]+' )
1212re_newline_whitespace = re .compile (r'[\t \r\n]*[\r\n][\t \r\n]*' )
1313re_html_heading = re .compile (r'h(\d+)' )
# Patterns for trimming padding around <pre> content.
# The "1" variants remove a single leading/trailing line break (plus any
# spaces on that otherwise-empty line); the other two remove the whole run of
# leading blank lines and all trailing spaces/newlines.
re_pre_lstrip1 = re.compile(r'^ *\n')
# Anchor with \Z rather than $: `$` also matches just before a final newline,
# which lets sub() match twice on input such as 'code\n\n' and remove two
# trailing newlines instead of one.
re_pre_rstrip1 = re.compile(r'\n *\Z')
re_pre_lstrip = re.compile(r'^[ \n]*\n')
re_pre_rstrip = re.compile(r'[ \n]*$')
1418
1519# Pattern for creating convert_<tag> function names from tag names
1620re_make_convert_fn_name = re .compile (r'[\[\]:-]' )
3741# confused with a list item
3842re_escape_misc_list_items = re .compile (r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))' )
3943
44+ # Find consecutive backtick sequences in a string
45+ re_backtick_runs = re .compile (r'`+' )
46+
4047# Heading styles
4148ATX = 'atx'
4249ATX_CLOSED = 'atx_closed'
5158ASTERISK = '*'
5259UNDERSCORE = '_'
5360
54- # Document strip styles
61+ # Document/pre strip styles
5562LSTRIP = 'lstrip'
5663RSTRIP = 'rstrip'
5764STRIP = 'strip'
65+ STRIP_ONE = 'strip_one'
66+
67+
def strip1_pre(text):
    """Trim a <pre> string using the single-newline strip patterns.

    Applies ``re_pre_lstrip1`` to the start of ``text`` and then
    ``re_pre_rstrip1`` to the end, returning the result.
    """
    without_leading = re_pre_lstrip1.sub('', text)
    return re_pre_rstrip1.sub('', without_leading)
73+
74+
def strip_pre(text):
    """Trim a <pre> string using the strip-all patterns.

    Applies ``re_pre_lstrip`` to the start of ``text`` and then
    ``re_pre_rstrip`` to the end, returning the result.
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)
5880
5981
6082def chomp (text ):
@@ -154,6 +176,7 @@ def _next_block_content_sibling(el):
154176class MarkdownConverter (object ):
155177 class DefaultOptions :
156178 autolinks = True
179+ bs4_options = 'html.parser'
157180 bullets = '*+-' # An iterable of bullet types.
158181 code_language = ''
159182 code_language_callback = None
@@ -167,6 +190,7 @@ class DefaultOptions:
167190 newline_style = SPACES
168191 strip = None
169192 strip_document = STRIP
193+ strip_pre = STRIP
170194 strong_em_symbol = ASTERISK
171195 sub_symbol = ''
172196 sup_symbol = ''
@@ -187,11 +211,15 @@ def __init__(self, **options):
187211 raise ValueError ('You may specify either tags to strip or tags to'
188212 ' convert, but not both.' )
189213
214+ # If a string or list is passed to bs4_options, assume it is a 'features' specification
215+ if not isinstance (self .options ['bs4_options' ], dict ):
216+ self .options ['bs4_options' ] = {'features' : self .options ['bs4_options' ]}
217+
190218 # Initialize the conversion function cache
191219 self .convert_fn_cache = {}
192220
def convert(self, html):
    """Parse ``html`` with BeautifulSoup and return the converted soup.

    The ``bs4_options`` option is unpacked as keyword arguments to the
    ``BeautifulSoup`` constructor.
    """
    parser_kwargs = self.options['bs4_options']
    parsed = BeautifulSoup(html, **parser_kwargs)
    return self.convert_soup(parsed)
196224
197225 def convert_soup (self , soup ):
@@ -362,16 +390,20 @@ def get_conv_fn(self, tag_name):
362390 if not self .should_convert_tag (tag_name ):
363391 return None
364392
365- # Handle headings with _convert_hn() function
393+ # Look for an explicitly defined conversion function by tag name first
394+ convert_fn_name = "convert_%s" % re_make_convert_fn_name .sub ("_" , tag_name )
395+ convert_fn = getattr (self , convert_fn_name , None )
396+ if convert_fn :
397+ return convert_fn
398+
399+ # If tag is any heading, handle with convert_hN() function
366400 match = re_html_heading .match (tag_name )
367401 if match :
368- n = int (match .group (1 ))
369- return lambda el , text , parent_tags : self ._convert_hn (n , el , text , parent_tags )
402+ n = int (match .group (1 )) # get value of N from <hN>
403+ return lambda el , text , parent_tags : self .convert_hN (n , el , text , parent_tags )
370404
371- # For other tags, look up their conversion function by tag name
372- convert_fn_name = "convert_%s" % re_make_convert_fn_name .sub ('_' , tag_name )
373- convert_fn = getattr (self , convert_fn_name , None )
374- return convert_fn
405+ # No conversion function was found
406+ return None
375407
376408 def should_convert_tag (self , tag ):
377409 """Given a tag name, return whether to convert based on strip/convert options."""
@@ -451,10 +483,24 @@ def convert_br(self, el, text, parent_tags):
451483 return ' \n '
452484
def convert_code(self, el, text, parent_tags):
    """Render inline code as a backtick-delimited Markdown code span.

    Text inside a ``_noformat`` context is returned unchanged. Otherwise the
    surrounding whitespace is split off with ``chomp``, and the remaining text
    is wrapped in a delimiter one backtick longer than the longest backtick
    run it contains. An empty result after chomping yields ``''``.
    """
    # NOTE(review): the diff rendering shows this key as '_noformat ' (with a
    # trailing space); treated as a rendering artifact -- confirm.
    if '_noformat' in parent_tags:
        return text

    prefix, suffix, text = chomp(text)
    if not text:
        return ''

    # Length of the longest run of consecutive backticks (0 when there are none).
    longest_run = max(map(len, re_backtick_runs.findall(text)), default=0)
    fence = '`' * (longest_run + 1)

    # When the text itself contains backticks, pad it with spaces so that
    # backticks at its edges do not merge with the delimiter.
    body = ' ' + text + ' ' if longest_run > 0 else text

    return '%s%s%s%s%s' % (prefix, fence, body, fence, suffix)
458504
459505 convert_del = abstract_inline_conversion (lambda self : '~~' )
460506
@@ -509,12 +555,12 @@ def convert_dt(self, el, text, parent_tags):
509555
510556 return '\n \n %s\n ' % text
511557
512- def _convert_hn (self , n , el , text , parent_tags ):
513- """ Method name prefixed with _ to prevent <hn> to call this """
558+ def convert_hN (self , n , el , text , parent_tags ):
559+ # convert_hN() converts <hN> tags, where N is any integer
514560 if '_inline' in parent_tags :
515561 return text
516562
517- # prevent MemoryErrors in case of very large n
563+ # Markdown does not support heading depths of n > 6
518564 n = max (1 , min (6 , n ))
519565
520566 style = self .options ['heading_style' ].lower ()
@@ -647,8 +693,20 @@ def convert_pre(self, el, text, parent_tags):
647693 if self .options ['code_language_callback' ]:
648694 code_language = self .options ['code_language_callback' ](el ) or code_language
649695
696+ if self .options ['strip_pre' ] == STRIP :
697+ text = strip_pre (text ) # remove all leading/trailing newlines
698+ elif self .options ['strip_pre' ] == STRIP_ONE :
699+ text = strip1_pre (text ) # remove one leading/trailing newline
700+ elif self .options ['strip_pre' ] is None :
701+ pass # leave leading and trailing newlines as-is
702+ else :
703+ raise ValueError ('Invalid value for strip_pre: %s' % self .options ['strip_pre' ])
704+
650705 return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
651706
def convert_q(self, el, text, parent_tags):
    """Wrap the already-converted content of a <q> element in double quotes."""
    quote = '"'
    return quote + text + quote
709+
652710 def convert_script (self , el , text , parent_tags ):
653711 return ''
654712
@@ -677,13 +735,13 @@ def convert_figcaption(self, el, text, parent_tags):
def convert_td(self, el, text, parent_tags):
    """Render a table data cell followed by one ``' |'`` per spanned column.

    A purely numeric ``colspan`` attribute is clamped to the range 1..1000;
    a missing or non-numeric value counts as 1. Newlines inside the cell text
    are replaced with spaces.
    """
    span = 1
    if 'colspan' in el.attrs:
        raw_span = el['colspan']
        if raw_span.isdigit():
            span = min(max(int(raw_span), 1), 1000)
    # NOTE(review): the diff rendering shows the search string as "\n "
    # (newline + space); treated as a rendering artifact -- confirm.
    flattened = text.strip().replace("\n", " ")
    return ' ' + flattened + ' |' * span
682740
def convert_th(self, el, text, parent_tags):
    """Render a table header cell followed by one ``' |'`` per spanned column.

    A purely numeric ``colspan`` attribute is clamped to the range 1..1000;
    a missing or non-numeric value counts as 1. Newlines inside the cell text
    are replaced with spaces.
    """
    span = 1
    if 'colspan' in el.attrs:
        raw_span = el['colspan']
        if raw_span.isdigit():
            span = min(max(int(raw_span), 1), 1000)
    # NOTE(review): the diff rendering shows the search string as "\n "
    # (newline + space); treated as a rendering artifact -- confirm.
    flattened = text.strip().replace("\n", " ")
    return ' ' + flattened + ' |' * span
688746
689747 def convert_tr (self , el , text , parent_tags ):
@@ -704,7 +762,7 @@ def convert_tr(self, el, text, parent_tags):
704762 full_colspan = 0
705763 for cell in cells :
706764 if 'colspan' in cell .attrs and cell ['colspan' ].isdigit ():
707- full_colspan += int (cell [" colspan" ] )
765+ full_colspan += max ( 1 , min ( 1000 , int (cell [' colspan' ])) )
708766 else :
709767 full_colspan += 1
710768 if ((is_headrow
0 commit comments