mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
remove whitespace in extracttext
This commit is contained in:
parent
e50c7e1a51
commit
9edd865bf4
@ -2,11 +2,14 @@ package htmlutil
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// whitespaceRegex matches a run of one or more whitespace characters.
// ExtractText uses it to replace every such run with a single space.
var whitespaceRegex = regexp.MustCompile(`[\s]+`)
|
||||
|
||||
func HTML(node *html.Node) string {
|
||||
writer := strings.Builder{}
|
||||
html.Render(&writer, node)
|
||||
@ -53,5 +56,8 @@ func ExtractText(content string) string {
|
||||
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
||||
}
|
||||
}
|
||||
return buffer.String()
|
||||
text := buffer.String()
|
||||
text = strings.TrimSpace(text)
|
||||
text = whitespaceRegex.ReplaceAllLiteralString(text, " ")
|
||||
return text
|
||||
}
|
||||
|
26
src/content/htmlutil/utils_test.go
Normal file
26
src/content/htmlutil/utils_test.go
Normal file
@ -0,0 +1,26 @@
|
||||
package htmlutil
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestExtractText(t *testing.T) {
|
||||
testcases := [][2]string {
|
||||
{"hello", "<div>hello</div>"},
|
||||
{"hello world", "<div>hello</div> world"},
|
||||
{"helloworld", "<div>hello</div>world"},
|
||||
{"hello world", "hello <div>world</div>"},
|
||||
{"helloworld", "hello<div>world</div>"},
|
||||
{"hello world!", "hello <div>world</div>!"},
|
||||
{"hello world !", "hello <div> world\r\n </div>!"},
|
||||
}
|
||||
for _, testcase := range testcases {
|
||||
want := testcase[0]
|
||||
base := testcase[1]
|
||||
have := ExtractText(base)
|
||||
if want != have {
|
||||
t.Logf("base: %#v\n", base)
|
||||
t.Logf("want: %#v\n", want)
|
||||
t.Logf("have: %#v\n", have)
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user