mirror of
https://github.com/nkanaev/yarr.git
synced 2025-09-16 11:20:14 +00:00
remove whitespace in extracttext
This commit is contained in:
@@ -2,11 +2,14 @@ package htmlutil
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// whitespaceRegex matches a run of one or more whitespace characters; ExtractText uses it to replace each run with a single space.
var whitespaceRegex = regexp.MustCompile(`[\s]+`)
|
||||
|
||||
func HTML(node *html.Node) string {
|
||||
writer := strings.Builder{}
|
||||
html.Render(&writer, node)
|
||||
@@ -53,5 +56,8 @@ func ExtractText(content string) string {
|
||||
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
||||
}
|
||||
}
|
||||
return buffer.String()
|
||||
text := buffer.String()
|
||||
text = strings.TrimSpace(text)
|
||||
text = whitespaceRegex.ReplaceAllLiteralString(text, " ")
|
||||
return text
|
||||
}
|
||||
|
Reference in New Issue
Block a user