remove whitespace in extracttext

This commit is contained in:
Nazar Kanaev 2021-04-02 22:02:21 +01:00
parent e50c7e1a51
commit 9edd865bf4
2 changed files with 33 additions and 1 deletions

View File

@ -2,11 +2,14 @@ package htmlutil
import (
"bytes"
"regexp"
"strings"
"golang.org/x/net/html"
)
// whitespaceRegex matches a run of one or more whitespace characters.
// ExtractText uses it to replace each such run with a single space.
var whitespaceRegex = regexp.MustCompile(`[\s]+`)
func HTML(node *html.Node) string {
writer := strings.Builder{}
html.Render(&writer, node)
@ -53,5 +56,8 @@ func ExtractText(content string) string {
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
}
}
return buffer.String()
text := buffer.String()
text = strings.TrimSpace(text)
text = whitespaceRegex.ReplaceAllLiteralString(text, " ")
return text
}

View File

@ -0,0 +1,26 @@
package htmlutil
import "testing"
// TestExtractText checks that ExtractText returns the text content of an
// HTML fragment with the markup removed, leading/trailing whitespace
// trimmed, and each inner run of whitespace collapsed to a single space.
func TestExtractText(t *testing.T) {
	// Named fields make the direction of each case explicit; a positional
	// [2]string pair makes it easy to swap input and expectation by mistake.
	testcases := []struct {
		input string
		want  string
	}{
		{input: "<div>hello</div>", want: "hello"},
		{input: "<div>hello</div> world", want: "hello world"},
		{input: "<div>hello</div>world", want: "helloworld"},
		{input: "hello <div>world</div>", want: "hello world"},
		{input: "hello<div>world</div>", want: "helloworld"},
		{input: "hello <div>world</div>!", want: "hello world!"},
		{input: "hello <div> world\r\n </div>!", want: "hello world !"},
	}
	for _, tc := range testcases {
		// t.Errorf records the failure and keeps going, so every failing
		// case is reported in a single run.
		if have := ExtractText(tc.input); have != tc.want {
			t.Errorf("ExtractText(%q) = %q, want %q", tc.input, have, tc.want)
		}
	}
}