mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 21:19:19 +00:00
64 lines
1.3 KiB
Go
64 lines
1.3 KiB
Go
package htmlutil
|
|
|
|
import (
|
|
"bytes"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// whitespaceRegex matches a run of one or more characters from the regexp
// `\s` whitespace class. ExtractText replaces each match with a single space.
var whitespaceRegex = regexp.MustCompile(`[\s]+`)
|
|
|
|
func HTML(node *html.Node) string {
|
|
writer := strings.Builder{}
|
|
html.Render(&writer, node)
|
|
return writer.String()
|
|
}
|
|
|
|
func InnerHTML(node *html.Node) string {
|
|
writer := strings.Builder{}
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
html.Render(&writer, c)
|
|
}
|
|
return writer.String()
|
|
}
|
|
|
|
func Attr(node *html.Node, key string) string {
|
|
for _, a := range node.Attr {
|
|
if strings.EqualFold(a.Key, key) {
|
|
return a.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func Text(node *html.Node) string {
|
|
text := make([]string, 0)
|
|
isTextNode := func(n *html.Node) bool {
|
|
return n.Type == html.TextNode
|
|
}
|
|
for _, n := range FindNodes(node, isTextNode) {
|
|
text = append(text, strings.TrimSpace(n.Data))
|
|
}
|
|
return strings.Join(text, " ")
|
|
}
|
|
|
|
func ExtractText(content string) string {
|
|
tokenizer := html.NewTokenizer(strings.NewReader(content))
|
|
buffer := bytes.Buffer{}
|
|
for {
|
|
token := tokenizer.Next()
|
|
if token == html.ErrorToken {
|
|
break
|
|
}
|
|
if token == html.TextToken {
|
|
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
|
}
|
|
}
|
|
text := buffer.String()
|
|
text = strings.TrimSpace(text)
|
|
text = whitespaceRegex.ReplaceAllLiteralString(text, " ")
|
|
return text
|
|
}
|