reorganizing content-related packages

This commit is contained in:
Nazar Kanaev
2021-03-31 23:40:59 +01:00
parent 0b8bf50204
commit b04e8c1e93
12 changed files with 19 additions and 18 deletions

View File

@@ -0,0 +1,76 @@
package htmlutil
import (
"regexp"
"strings"
"golang.org/x/net/html"
)
// nodeNameRegex validates a single selector part: either a bare tag name
// (word characters and hyphens, so custom elements such as "my-element" are
// accepted) or the universal selector "*".
//
// The pattern is anchored on both ends so that the whole part must be a tag
// name. Without anchors, MatchString accepts any string that merely contains
// a word character (e.g. "div.class" or "a[href]"), which would let
// unsupported selectors through silently.
var nodeNameRegex = regexp.MustCompile(`^(?:[\w-]+|\*)$`)
// FindNodes walks the tree rooted at node in breadth-first order and returns
// every node (the root included) for which match reports true. Results are
// listed in the order they were visited.
//
// A nil root yields an empty, non-nil slice instead of panicking.
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
	found := make([]*html.Node, 0)
	if node == nil {
		return found
	}

	// queue doubles as the visit list: head marks the next node to process,
	// and children are appended behind it, giving level-by-level order.
	queue := []*html.Node{node}
	for head := 0; head < len(queue); head++ {
		n := queue[head]
		if match(n) {
			found = append(found, n)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			queue = append(queue, c)
		}
	}
	return found
}
// Query returns the nodes under node (node itself included) that satisfy the
// selector sel. The selector is compiled with NewMatcher and the tree is
// searched with FindNodes.
func Query(node *html.Node, sel string) []*html.Node {
	return FindNodes(node, NewMatcher(sel).Match)
}
// NewMatcher builds a Matcher from a comma-separated selector list. Each part
// is trimmed of surrounding whitespace and checked against nodeNameRegex;
// accepted parts become ElementMatch entries combined in a MultiMatch.
//
// It panics with "unsupported selector: <part>" for a part that fails the
// check.
func NewMatcher(sel string) Matcher {
	var multi MultiMatch
	for _, raw := range strings.Split(sel, ",") {
		name := strings.TrimSpace(raw)
		if !nodeNameRegex.MatchString(name) {
			panic("unsupported selector: " + name)
		}
		multi.Add(ElementMatch{Name: name})
	}
	return multi
}
// Matcher decides whether a single parsed HTML node satisfies some criterion.
type Matcher interface {
	// Match reports whether node satisfies the matcher.
	Match(node *html.Node) bool
}
// ElementMatch matches element nodes by tag name.
type ElementMatch struct {
	// Name is the tag name compared against the node's Data; "*" matches
	// every element.
	Name string
}

// Match reports whether n is an element node whose tag equals m.Name, or any
// element node at all when m.Name is "*".
func (m ElementMatch) Match(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return m.Name == "*" || n.Data == m.Name
}
// MultiMatch combines several matchers; a node matches when at least one of
// them does.
type MultiMatch struct {
	matchers []Matcher
}

// Add appends matcher to the set of alternatives.
func (m *MultiMatch) Add(matcher Matcher) {
	m.matchers = append(m.matchers, matcher)
}

// Match reports whether any registered matcher accepts n. The matchers are
// tried in the order they were added, and the first success ends the search.
// With no matchers registered the result is false.
func (m MultiMatch) Match(n *html.Node) bool {
	for i := 0; i < len(m.matchers); i++ {
		if m.matchers[i].Match(n) {
			return true
		}
	}
	return false
}

View File

@@ -0,0 +1,64 @@
package htmlutil
import (
"strings"
"testing"
"golang.org/x/net/html"
)
// TestQuery checks that a single tag-name selector returns exactly the one
// matching element of the parsed fixture.
func TestQuery(t *testing.T) {
	node, err := html.Parse(strings.NewReader(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div>
<p>test</p>
</div>
</body>
</html>
`))
	if err != nil {
		t.Fatalf("parsing fixture: %v", err)
	}

	nodes := Query(node, "p")
	if len(nodes) != 1 {
		t.Fatalf("incorrect match: got %d nodes, want 1", len(nodes))
	}
	if n := nodes[0]; n.Type != html.ElementNode || n.Data != "p" {
		t.Fatalf("incorrect match: type=%v data=%q", n.Type, n.Data)
	}
}
// TestQueryMulti checks that a comma-separated selector list (with stray
// whitespace around the comma) returns the elements matching any of its parts,
// in the expected order.
func TestQueryMulti(t *testing.T) {
	node, err := html.Parse(strings.NewReader(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<p>foo</p>
<div>
<p>bar</p>
<span>baz</span>
</div>
</body>
</html>
`))
	if err != nil {
		t.Fatalf("parsing fixture: %v", err)
	}

	nodes := Query(node, "p , span")

	// Expected tag names, in result order.
	want := []string{"p", "p", "span"}
	ok := len(nodes) == len(want)
	for i := 0; ok && i < len(want); i++ {
		ok = nodes[i].Type == html.ElementNode && nodes[i].Data == want[i]
	}
	if !ok {
		for i, n := range nodes {
			t.Logf("%d: %s", i, HTML(n))
		}
		t.Fatal("incorrect match")
	}
}

View File

@@ -0,0 +1,33 @@
package htmlutil
import (
"net/url"
)
// Any reports whether match(x, el) holds for at least one x in els. Elements
// are tested in order and the scan stops at the first success; an empty or
// nil els yields false.
func Any(els []string, el string, match func(string, string) bool) bool {
	for i := 0; i < len(els); i++ {
		if match(els[i], el) {
			return true
		}
	}
	return false
}
// AbsoluteUrl resolves href relative to base and returns the resulting URL as
// a string. It returns "" when either base or href cannot be parsed.
func AbsoluteUrl(href, base string) string {
	parsedBase, baseErr := url.Parse(base)
	if baseErr != nil {
		return ""
	}

	ref, refErr := url.Parse(href)
	if refErr != nil {
		return ""
	}

	resolved := parsedBase.ResolveReference(ref)
	return resolved.String()
}
// URLDomain returns the Host component of val when val parses as a URL. If
// parsing fails, val is returned unchanged. A value that parses but carries no
// host (for example a bare "example.com", which parses as a path) yields "".
func URLDomain(val string) string {
	parsed, err := url.Parse(val)
	if err != nil {
		return val
	}
	return parsed.Host
}

View File

@@ -0,0 +1,41 @@
package htmlutil
import (
"strings"
"golang.org/x/net/html"
)
// HTML renders node, including its own tag and all descendants, to a string.
// The error returned by html.Render is discarded; whatever was written before
// a failure is returned as-is.
func HTML(node *html.Node) string {
	var out strings.Builder
	_ = html.Render(&out, node)
	return out.String()
}
// InnerHTML renders the children of node, in order, to a single string; the
// tag of node itself is not included. Errors from html.Render are discarded.
func InnerHTML(node *html.Node) string {
	var out strings.Builder
	child := node.FirstChild
	for child != nil {
		_ = html.Render(&out, child)
		child = child.NextSibling
	}
	return out.String()
}
// Attr returns the value of the first attribute of node whose key equals key
// under case-insensitive comparison, or "" when no attribute matches.
func Attr(node *html.Node, key string) string {
	for i := range node.Attr {
		if attr := node.Attr[i]; strings.EqualFold(attr.Key, key) {
			return attr.Val
		}
	}
	return ""
}
// Text returns the text content found under node (node itself included), with
// each text node trimmed of surrounding whitespace and the pieces joined by a
// single space.
//
// Text nodes are visited depth-first, i.e. in document order, so nested inline
// markup such as <p>a<b>b</b>c</p> yields "a b c". Text nodes that are empty
// after trimming are skipped so that formatting whitespace between tags does
// not produce runs of spaces. A nil node yields "".
func Text(node *html.Node) string {
	if node == nil {
		return ""
	}

	var parts []string
	stack := []*html.Node{node}
	for len(stack) > 0 {
		n := stack[len(stack)-1]
		stack = stack[:len(stack)-1]

		if n.Type == html.TextNode {
			if s := strings.TrimSpace(n.Data); s != "" {
				parts = append(parts, s)
			}
		}

		// Push children last-to-first so the first child is popped next,
		// which keeps the traversal in document order.
		for c := n.LastChild; c != nil; c = c.PrevSibling {
			stack = append(stack, c)
		}
	}
	return strings.Join(parts, " ")
}