Trim and collapse whitespace in ExtractText

This commit is contained in:
Nazar Kanaev
2021-04-02 22:02:21 +01:00
parent e50c7e1a51
commit 9edd865bf4
2 changed files with 33 additions and 1 deletions

View File

@@ -2,11 +2,14 @@ package htmlutil
import (
"bytes"
"regexp"
"strings"
"golang.org/x/net/html"
)
// whitespaceRegex matches one or more consecutive whitespace characters
// (`\s` in Go's RE2 syntax: tab, newline, form feed, carriage return, space).
// It is compiled once at package scope; ExtractText uses it to replace each
// run of whitespace with a single space.
var whitespaceRegex = regexp.MustCompile(`[\s]+`)
func HTML(node *html.Node) string {
writer := strings.Builder{}
html.Render(&writer, node)
@@ -53,5 +56,8 @@ func ExtractText(content string) string {
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
}
}
return buffer.String()
text := buffer.String()
text = strings.TrimSpace(text)
text = whitespaceRegex.ReplaceAllLiteralString(text, " ")
return text
}