@@ -7,6 +7,7 @@ use std::sync::LazyLock;
77
88use quick_xml:: {
99 Reader ,
10+ errors:: Error as QuickXmlError ,
1011 events:: { BytesEnd , BytesStart , BytesText } ,
1112} ;
1213
@@ -132,6 +133,7 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
132133 reader. config_mut ( ) . check_end_names = false ;
133134
134135 let mut buf = Vec :: new ( ) ;
136+ let mut char_buf = String :: with_capacity ( 4 ) ;
135137
136138 loop {
137139 match reader. read_event_into ( & mut buf) {
@@ -140,16 +142,9 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
140142 }
141143 Ok ( quick_xml:: events:: Event :: End ( ref e) ) => dehtml_endtag_cb ( e, & mut dehtml) ,
142144 Ok ( quick_xml:: events:: Event :: Text ( ref e) ) => dehtml_text_cb ( e, & mut dehtml) ,
143- Ok ( quick_xml:: events:: Event :: CData ( e) ) => match e. escape ( ) {
144- Ok ( e) => dehtml_text_cb ( & e, & mut dehtml) ,
145- Err ( e) => {
146- eprintln ! (
147- "CDATA escape error at position {}: {:?}" ,
148- reader. buffer_position( ) ,
149- e,
150- ) ;
151- }
152- } ,
145+ Ok ( quick_xml:: events:: Event :: CData ( e) ) => {
146+ str_cb ( & String :: from_utf8_lossy ( & e as & [ _ ] ) , & mut dehtml)
147+ }
153148 Ok ( quick_xml:: events:: Event :: Empty ( ref e) ) => {
154149 // Handle empty tags as a start tag immediately followed by end tag.
155150 // For example, `<p/>` is treated as `<p></p>`.
@@ -159,6 +154,33 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
159154 & mut dehtml,
160155 ) ;
161156 }
157+ Ok ( quick_xml:: events:: Event :: GeneralRef ( ref e) ) => {
158+ match e. resolve_char_ref ( ) {
159+ Err ( err) => eprintln ! (
160+ "resolve_char_ref() error at position {}: {:?}" ,
161+ reader. buffer_position( ) ,
162+ err,
163+ ) ,
164+ Ok ( Some ( ch) ) => {
165+ char_buf. clear ( ) ;
166+ char_buf. push ( ch) ;
167+ str_cb ( & char_buf, & mut dehtml) ;
168+ }
169+ Ok ( None ) => {
170+ let event_str = String :: from_utf8_lossy ( e) ;
171+ if let Some ( s) = quick_xml:: escape:: resolve_html5_entity ( & event_str) {
172+ str_cb ( s, & mut dehtml) ;
173+ } else {
174+ // Nonstandard entity. Add escaped.
175+ str_cb ( & format ! ( "&{event_str};" ) , & mut dehtml) ;
176+ }
177+ }
178+ }
179+ }
180+ Err ( QuickXmlError :: IllFormed ( _) ) => {
181+ // This is probably not HTML at all and should be left as is.
182+ str_cb ( & String :: from_utf8_lossy ( & buf) , & mut dehtml) ;
183+ }
162184 Err ( e) => {
163185 eprintln ! (
164186 "Parse html error: Error at position {}: {:?}" ,
@@ -176,36 +198,36 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) {
176198}
177199
178200fn dehtml_text_cb ( event : & BytesText , dehtml : & mut Dehtml ) {
179- static LINE_RE : LazyLock < regex:: Regex > =
180- LazyLock :: new ( || regex:: Regex :: new ( r"(\r?\n)+" ) . unwrap ( ) ) ;
181-
182201 if dehtml. get_add_text ( ) == AddText :: YesPreserveLineEnds
183202 || dehtml. get_add_text ( ) == AddText :: YesRemoveLineEnds
184203 {
185204 let event = event as & [ _ ] ;
186205 let event_str = std:: str:: from_utf8 ( event) . unwrap_or_default ( ) ;
187- let mut last_added = escaper:: decode_html_buf_sloppy ( event) . unwrap_or_default ( ) ;
188- if event_str. starts_with ( & last_added) {
189- last_added = event_str. to_string ( ) ;
190- }
206+ str_cb ( event_str, dehtml) ;
207+ }
208+ }
191209
192- if dehtml. get_add_text ( ) == AddText :: YesRemoveLineEnds {
193- // Replace all line ends with spaces.
194- // E.g. `\r\n\r\n` is replaced with one space.
195- let last_added = LINE_RE . replace_all ( & last_added, " " ) ;
196-
197- // Add a space if `last_added` starts with a space
198- // and there is no whitespace at the end of the buffer yet.
199- // Trim the rest of leading whitespace from `last_added`.
200- let buf = dehtml. get_buf ( ) ;
201- if !buf. ends_with ( ' ' ) && !buf. ends_with ( '\n' ) && last_added. starts_with ( ' ' ) {
202- * buf += " " ;
203- }
210+ fn str_cb ( event_str : & str , dehtml : & mut Dehtml ) {
211+ static LINE_RE : LazyLock < regex:: Regex > =
212+ LazyLock :: new ( || regex:: Regex :: new ( r"(\r?\n)+" ) . unwrap ( ) ) ;
204213
205- * buf += last_added. trim_start ( ) ;
206- } else {
207- * dehtml. get_buf ( ) += LINE_RE . replace_all ( & last_added, "\n " ) . as_ref ( ) ;
214+ let add_text = dehtml. get_add_text ( ) ;
215+ if add_text == AddText :: YesRemoveLineEnds {
216+ // Replace all line ends with spaces.
217+ // E.g. `\r\n\r\n` is replaced with one space.
218+ let event_str = LINE_RE . replace_all ( event_str, " " ) ;
219+
220+ // Add a space if `event_str` starts with a space
221+ // and there is no whitespace at the end of the buffer yet.
222+ // Trim the rest of leading whitespace from `event_str`.
223+ let buf = dehtml. get_buf ( ) ;
224+ if !buf. ends_with ( ' ' ) && !buf. ends_with ( '\n' ) && event_str. starts_with ( ' ' ) {
225+ * buf += " " ;
208226 }
227+
228+ * buf += event_str. trim_start ( ) ;
229+ } else if add_text == AddText :: YesPreserveLineEnds {
230+ * dehtml. get_buf ( ) += LINE_RE . replace_all ( event_str, "\n " ) . as_ref ( ) ;
209231 }
210232}
211233
0 commit comments