From 9edd865bf4911c344845673b849d3aba7824a3e6 Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Fri, 2 Apr 2021 22:02:21 +0100 Subject: [PATCH] remove whitespace in extracttext --- src/content/htmlutil/utils.go | 8 +++++++- src/content/htmlutil/utils_test.go | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 src/content/htmlutil/utils_test.go diff --git a/src/content/htmlutil/utils.go b/src/content/htmlutil/utils.go index 3e475f8..5f66ae8 100644 --- a/src/content/htmlutil/utils.go +++ b/src/content/htmlutil/utils.go @@ -2,11 +2,14 @@ package htmlutil import ( "bytes" + "regexp" "strings" "golang.org/x/net/html" ) +var whitespaceRegex = regexp.MustCompile(`[\s]+`) + func HTML(node *html.Node) string { writer := strings.Builder{} html.Render(&writer, node) @@ -53,5 +56,8 @@ func ExtractText(content string) string { buffer.WriteString(html.UnescapeString(string(tokenizer.Text()))) } } - return buffer.String() + text := buffer.String() + text = strings.TrimSpace(text) + text = whitespaceRegex.ReplaceAllLiteralString(text, " ") + return text } diff --git a/src/content/htmlutil/utils_test.go b/src/content/htmlutil/utils_test.go new file mode 100644 index 0000000..4ab6251 --- /dev/null +++ b/src/content/htmlutil/utils_test.go @@ -0,0 +1,26 @@ +package htmlutil + +import "testing" + +func TestExtractText(t *testing.T) { + testcases := [][2]string { + {"hello", "
hello
"}, + {"hello world", "
hello
world"}, + {"helloworld", "
hello
world"}, + {"hello world", "hello
world
"}, + {"helloworld", "hello
world
"}, + {"hello world!", "hello
world
!"}, + {"hello world !", "hello
world\r\n
!"}, + } + for _, testcase := range testcases { + want := testcase[0] + base := testcase[1] + have := ExtractText(base) + if want != have { + t.Logf("base: %#v\n", base) + t.Logf("want: %#v\n", want) + t.Logf("have: %#v\n", have) + t.Fail() + } + } +}