@@ -7,6 +7,7 @@ use std::sync::LazyLock;
77
88use quick_xml:: {
99 Reader ,
10+ errors:: Error as QuickXmlError ,
1011 events:: { BytesEnd , BytesStart , BytesText } ,
1112} ;
1213
@@ -132,6 +133,7 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
132133 reader. config_mut ( ) . check_end_names = false ;
133134
134135 let mut buf = Vec :: new ( ) ;
136+ let mut char_buf = String :: with_capacity ( 4 ) ;
135137
136138 loop {
137139 match reader. read_event_into ( & mut buf) {
@@ -140,16 +142,9 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
140142 }
141143 Ok ( quick_xml:: events:: Event :: End ( ref e) ) => dehtml_endtag_cb ( e, & mut dehtml) ,
142144 Ok ( quick_xml:: events:: Event :: Text ( ref e) ) => dehtml_text_cb ( e, & mut dehtml) ,
143- Ok ( quick_xml:: events:: Event :: CData ( e) ) => match e. escape ( ) {
144- Ok ( e) => dehtml_text_cb ( & e, & mut dehtml) ,
145- Err ( e) => {
146- eprintln ! (
147- "CDATA escape error at position {}: {:?}" ,
148- reader. buffer_position( ) ,
149- e,
150- ) ;
151- }
152- } ,
145+ Ok ( quick_xml:: events:: Event :: CData ( e) ) => {
146+ str_cb ( & String :: from_utf8_lossy ( & e as & [ _ ] ) , & mut dehtml)
147+ }
153148 Ok ( quick_xml:: events:: Event :: Empty ( ref e) ) => {
154149 // Handle empty tags as a start tag immediately followed by end tag.
155150 // For example, `<p/>` is treated as `<p></p>`.
@@ -159,6 +154,32 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
159154 & mut dehtml,
160155 ) ;
161156 }
157+ Ok ( quick_xml:: events:: Event :: GeneralRef ( ref e) ) => {
158+ let res = e. resolve_char_ref ( ) ;
159+ if let Err ( e) = res {
160+ eprintln ! (
161+ "resolve_char_ref() error at position {}: {:?}" ,
162+ reader. buffer_position( ) ,
163+ e,
164+ ) ;
165+ } else if let Some ( ch) = res. ok ( ) . flatten ( ) {
166+ char_buf. clear ( ) ;
167+ char_buf. push ( ch) ;
168+ str_cb ( & char_buf, & mut dehtml) ;
169+ } else {
170+ let event_str = String :: from_utf8_lossy ( e) ;
171+ if let Some ( s) = quick_xml:: escape:: resolve_html5_entity ( & event_str) {
172+ str_cb ( s, & mut dehtml) ;
173+ } else {
174+ // Nonstandard entity. Add escaped.
175+ str_cb ( & format ! ( "&{event_str};" ) , & mut dehtml) ;
176+ }
177+ }
178+ }
179+ Err ( QuickXmlError :: IllFormed ( _) ) => {
180+ // This is probably not HTML at all and should be left as is.
181+ str_cb ( & String :: from_utf8_lossy ( & buf) , & mut dehtml) ;
182+ }
162183 Err ( e) => {
163184 eprintln ! (
164185 "Parse html error: Error at position {}: {:?}" ,
@@ -176,36 +197,36 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
176197}
177198
178199fn dehtml_text_cb ( event : & BytesText , dehtml : & mut Dehtml ) {
179- static LINE_RE : LazyLock < regex:: Regex > =
180- LazyLock :: new ( || regex:: Regex :: new ( r"(\r?\n)+" ) . unwrap ( ) ) ;
181-
182200 if dehtml. get_add_text ( ) == AddText :: YesPreserveLineEnds
183201 || dehtml. get_add_text ( ) == AddText :: YesRemoveLineEnds
184202 {
185203 let event = event as & [ _ ] ;
186204 let event_str = std:: str:: from_utf8 ( event) . unwrap_or_default ( ) ;
187- let mut last_added = escaper:: decode_html_buf_sloppy ( event) . unwrap_or_default ( ) ;
188- if event_str. starts_with ( & last_added) {
189- last_added = event_str. to_string ( ) ;
190- }
205+ str_cb ( event_str, dehtml) ;
206+ }
207+ }
191208
192- if dehtml. get_add_text ( ) == AddText :: YesRemoveLineEnds {
193- // Replace all line ends with spaces.
194- // E.g. `\r\n\r\n` is replaced with one space.
195- let last_added = LINE_RE . replace_all ( & last_added, " " ) ;
196-
197- // Add a space if `last_added` starts with a space
198- // and there is no whitespace at the end of the buffer yet.
199- // Trim the rest of leading whitespace from `last_added`.
200- let buf = dehtml. get_buf ( ) ;
201- if !buf. ends_with ( ' ' ) && !buf. ends_with ( '\n' ) && last_added. starts_with ( ' ' ) {
202- * buf += " " ;
203- }
209+ fn str_cb ( event_str : & str , dehtml : & mut Dehtml ) {
210+ static LINE_RE : LazyLock < regex:: Regex > =
211+ LazyLock :: new ( || regex:: Regex :: new ( r"(\r?\n)+" ) . unwrap ( ) ) ;
204212
205- * buf += last_added. trim_start ( ) ;
206- } else {
207- * dehtml. get_buf ( ) += LINE_RE . replace_all ( & last_added, "\n " ) . as_ref ( ) ;
213+ let add_text = dehtml. get_add_text ( ) ;
214+ if add_text == AddText :: YesRemoveLineEnds {
215+ // Replace all line ends with spaces.
216+ // E.g. `\r\n\r\n` is replaced with one space.
217+ let event_str = LINE_RE . replace_all ( event_str, " " ) ;
218+
219+ // Add a space if `event_str` starts with a space
220+ // and there is no whitespace at the end of the buffer yet.
221+ // Trim the rest of leading whitespace from `event_str`.
222+ let buf = dehtml. get_buf ( ) ;
223+ if !buf. ends_with ( ' ' ) && !buf. ends_with ( '\n' ) && event_str. starts_with ( ' ' ) {
224+ * buf += " " ;
208225 }
226+
227+ * buf += event_str. trim_start ( ) ;
228+ } else if add_text == AddText :: YesPreserveLineEnds {
229+ * dehtml. get_buf ( ) += LINE_RE . replace_all ( event_str, "\n " ) . as_ref ( ) ;
209230 }
210231}
211232
0 commit comments