mirror of
https://github.com/nkanaev/yarr.git
synced 2025-09-13 09:55:36 +00:00
handle html type atom text
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
@@ -39,3 +40,18 @@ func Text(node *html.Node) string {
|
||||
}
|
||||
return strings.Join(text, " ")
|
||||
}
|
||||
|
||||
func ExtractText(content string) string {
|
||||
tokenizer := html.NewTokenizer(strings.NewReader(content))
|
||||
buffer := bytes.Buffer{}
|
||||
for {
|
||||
token := tokenizer.Next()
|
||||
if token == html.ErrorToken {
|
||||
break
|
||||
}
|
||||
if token == html.TextToken {
|
||||
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
||||
}
|
||||
}
|
||||
return buffer.String()
|
||||
}
|
||||
|
Reference in New Issue
Block a user