diff --git a/src/content/htmlutil/utils.go b/src/content/htmlutil/utils.go index 3e475f8..5f66ae8 100644 --- a/src/content/htmlutil/utils.go +++ b/src/content/htmlutil/utils.go @@ -2,11 +2,14 @@ package htmlutil import ( "bytes" + "regexp" "strings" "golang.org/x/net/html" ) +var whitespaceRegex = regexp.MustCompile(`[\s]+`) + func HTML(node *html.Node) string { writer := strings.Builder{} html.Render(&writer, node) @@ -53,5 +56,8 @@ func ExtractText(content string) string { buffer.WriteString(html.UnescapeString(string(tokenizer.Text()))) } } - return buffer.String() + text := buffer.String() + text = strings.TrimSpace(text) + text = whitespaceRegex.ReplaceAllLiteralString(text, " ") + return text } diff --git a/src/content/htmlutil/utils_test.go b/src/content/htmlutil/utils_test.go new file mode 100644 index 0000000..4ab6251 --- /dev/null +++ b/src/content/htmlutil/utils_test.go @@ -0,0 +1,26 @@ +package htmlutil + +import "testing" + +func TestExtractText(t *testing.T) { + testcases := [][2]string { + {"hello", "