mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-25 05:29:20 +00:00
remove whitespace in extracttext
This commit is contained in:
parent
e50c7e1a51
commit
9edd865bf4
@ -2,11 +2,14 @@ package htmlutil
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// whitespaceRegex matches a run of one or more whitespace characters.
// ExtractText uses it to collapse every such run into a single space.
// NOTE(review): in Go's RE2 syntax `\s` covers ASCII whitespace only, so
// characters such as U+00A0 are presumably left untouched — confirm that
// this is the intended behavior.
var whitespaceRegex = regexp.MustCompile(`[\s]+`)
|
||||||
|
|
||||||
func HTML(node *html.Node) string {
|
func HTML(node *html.Node) string {
|
||||||
writer := strings.Builder{}
|
writer := strings.Builder{}
|
||||||
html.Render(&writer, node)
|
html.Render(&writer, node)
|
||||||
@ -53,5 +56,8 @@ func ExtractText(content string) string {
|
|||||||
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return buffer.String()
|
text := buffer.String()
|
||||||
|
text = strings.TrimSpace(text)
|
||||||
|
text = whitespaceRegex.ReplaceAllLiteralString(text, " ")
|
||||||
|
return text
|
||||||
}
|
}
|
||||||
|
26
src/content/htmlutil/utils_test.go
Normal file
26
src/content/htmlutil/utils_test.go
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
package htmlutil
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestExtractText(t *testing.T) {
|
||||||
|
testcases := [][2]string {
|
||||||
|
{"hello", "<div>hello</div>"},
|
||||||
|
{"hello world", "<div>hello</div> world"},
|
||||||
|
{"helloworld", "<div>hello</div>world"},
|
||||||
|
{"hello world", "hello <div>world</div>"},
|
||||||
|
{"helloworld", "hello<div>world</div>"},
|
||||||
|
{"hello world!", "hello <div>world</div>!"},
|
||||||
|
{"hello world !", "hello <div> world\r\n </div>!"},
|
||||||
|
}
|
||||||
|
for _, testcase := range testcases {
|
||||||
|
want := testcase[0]
|
||||||
|
base := testcase[1]
|
||||||
|
have := ExtractText(base)
|
||||||
|
if want != have {
|
||||||
|
t.Logf("base: %#v\n", base)
|
||||||
|
t.Logf("want: %#v\n", want)
|
||||||
|
t.Logf("have: %#v\n", have)
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user