Plain Text Extractor is a Golang library that helps you extract plain text from HTML and Markdown.
It provides a flexible and extensible interface for extracting the plain text content using both the predefined extraction methods and your own custom extraction requirements.
- Parse HTML and Markdown documents into plain text.
- Support for custom extraction functions.
- Easy-to-use API to convert complex documents to simple plain text.
go get github.com/huantt/plaintext-extractor

markdownContent := "# H1 \n*italic* **bold** `code` `not code [link](https://example.com)  ~~strikethrough~~"
extractor := NewMarkdownExtractor()
output, err := extractor.PlainText(markdownContent)
if err != nil {
panic(err)
}
fmt.Println(output)
// Output: H1 \nitalic bold code `not code link image strikethrough

goos: windows
goarch: amd64
pkg: github.com/huantt/plaintext-extractor/markdown
cpu: 11th Gen Intel(R) Core(TM) i5-1155G7 @ 2.50GHz
BenchmarkMarkdownExtractorMediumSize
BenchmarkMarkdownExtractorMediumSize-8 12194006 89.09 ns/op 16 B/op 1 allocs/op
BenchmarkMarkdownExtractorLargeSize
BenchmarkMarkdownExtractorLargeSize-8 12645927 88.25 ns/op 16 B/op 1 allocs/op
PASS

markdownContent := "This is {color:#0A84FF}red{color}"
customTag := markdown.Tag{
Name: "color-custom-tag",
FullRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}(.*?){color}"),
StartRegex: regexp.MustCompile("{color:[a-zA-Z0-9#]+}"),
EndRegex: regexp.MustCompile("{color}"),
}
markdownExtractor := NewMarkdownExtractor(customTag)
plaintextExtractor := plaintext.NewExtractor(markdownExtractor.PlainText)
plaintext, err := plaintextExtractor.PlainText(markdownContent)
if err != nil {
panic(err)
}
fmt.Println(plaintext)
// Output: This is red

html := `<div>This is a <a href="https://example.com">link</a></div>`
extractor := NewHtmlExtractor()
output, err := extractor.PlainText(html)
if err != nil {
panic(err)
}
fmt.Println(output)
// Output: This is a link

input := `<div> html </div> *markdown*`
markdownExtractor := markdown.NewExtractor()
htmlExtractor := html.NewExtractor()
extractor := NewExtractor(markdownExtractor.PlainText, htmlExtractor.PlainText)
output, err := extractor.PlainText(input)
if err != nil {
panic(err)
}
fmt.Println(output)
// Output: html markdown

Contributions to the Plain Text Extractor project are welcome! If you find any issues or want to add new features, please feel free to open an issue or submit a pull request. Please see CONTRIBUTING.md for more information.
This project is released under the MIT License; refer to the LICENSE file for details.