mirror of
https://github.com/nkanaev/yarr.git
synced 2025-09-13 18:00:05 +00:00
reorganizing content-related packages
This commit is contained in:
76
src/content/htmlutil/query.go
Normal file
76
src/content/htmlutil/query.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// nodeNameRegex validates one comma-separated part of a selector: either a
// bare tag name (word characters, optionally with hyphens as in custom
// elements) or the wildcard "*".
//
// The pattern is anchored on both ends. Without anchors, any string that
// merely contains a word character (for example "div > p" or "a[href]") would
// be accepted, and the unsupported-selector check that relies on this regex
// would never fire for it.
var nodeNameRegex = regexp.MustCompile(`^(?:\w[\w-]*|\*)$`)
|
||||
|
||||
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
|
||||
nodes := make([]*html.Node, 0)
|
||||
|
||||
queue := make([]*html.Node, 0)
|
||||
queue = append(queue, node)
|
||||
for len(queue) > 0 {
|
||||
var n *html.Node
|
||||
n, queue = queue[0], queue[1:]
|
||||
if match(n) {
|
||||
nodes = append(nodes, n)
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
queue = append(queue, c)
|
||||
}
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
func Query(node *html.Node, sel string) []*html.Node {
|
||||
matcher := NewMatcher(sel)
|
||||
return FindNodes(node, matcher.Match)
|
||||
}
|
||||
|
||||
func NewMatcher(sel string) Matcher {
|
||||
multi := MultiMatch{}
|
||||
parts := strings.Split(sel, ",")
|
||||
for _, part := range parts {
|
||||
part := strings.TrimSpace(part)
|
||||
if nodeNameRegex.MatchString(part) {
|
||||
multi.Add(ElementMatch{Name: part})
|
||||
} else {
|
||||
panic("unsupported selector: " + part)
|
||||
}
|
||||
}
|
||||
return multi
|
||||
}
|
||||
|
||||
type Matcher interface {
|
||||
Match(*html.Node) bool
|
||||
}
|
||||
|
||||
type ElementMatch struct {
|
||||
Name string
|
||||
}
|
||||
|
||||
func (m ElementMatch) Match(n *html.Node) bool {
|
||||
return n.Type == html.ElementNode && (n.Data == m.Name || m.Name == "*")
|
||||
}
|
||||
|
||||
type MultiMatch struct {
|
||||
matchers []Matcher
|
||||
}
|
||||
|
||||
func (m *MultiMatch) Add(matcher Matcher) {
|
||||
m.matchers = append(m.matchers, matcher)
|
||||
}
|
||||
|
||||
func (m MultiMatch) Match(n *html.Node) bool {
|
||||
for _, matcher := range m.matchers {
|
||||
if matcher.Match(n) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
64
src/content/htmlutil/query_test.go
Normal file
64
src/content/htmlutil/query_test.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
func TestQuery(t *testing.T) {
|
||||
node, _ := html.Parse(strings.NewReader(`
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title></title>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<p>test</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
`))
|
||||
nodes := Query(node, "p")
|
||||
match := (
|
||||
len(nodes) == 1 &&
|
||||
nodes[0].Type == html.ElementNode &&
|
||||
nodes[0].Data == "p")
|
||||
if !match {
|
||||
t.Fatalf("incorrect match: %#v", nodes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryMulti(t *testing.T) {
|
||||
node, _ := html.Parse(strings.NewReader(`
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title></title>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo</p>
|
||||
<div>
|
||||
<p>bar</p>
|
||||
<span>baz</span>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
`))
|
||||
nodes := Query(node, "p , span")
|
||||
match := (
|
||||
len(nodes) == 3 &&
|
||||
nodes[0].Type == html.ElementNode && nodes[0].Data == "p" &&
|
||||
nodes[1].Type == html.ElementNode && nodes[1].Data == "p" &&
|
||||
nodes[2].Type == html.ElementNode && nodes[2].Data == "span")
|
||||
if !match {
|
||||
for i, n := range nodes {
|
||||
t.Logf("%d: %s", i, HTML(n))
|
||||
}
|
||||
t.Fatal("incorrect match")
|
||||
}
|
||||
}
|
33
src/content/htmlutil/urlutils.go
Normal file
33
src/content/htmlutil/urlutils.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
)
|
||||
|
||||
// Any reports whether match(x, el) is true for at least one x in els. The
// slice element is always passed as the first argument and el as the second.
// Elements are tested in order and the scan stops at the first hit; an empty
// or nil els yields false.
func Any(els []string, el string, match func(string, string) bool) bool {
	for i := 0; i < len(els); i++ {
		if match(els[i], el) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// AbsoluteUrl resolves href against base and returns the resulting URL as a
// string. If either base or href cannot be parsed as a URL, it returns "".
func AbsoluteUrl(href, base string) string {
	baseURL, err := url.Parse(base)
	if err != nil {
		return ""
	}
	// (*url.URL).Parse parses href and resolves it relative to baseURL.
	resolved, err := baseURL.Parse(href)
	if err != nil {
		return ""
	}
	return resolved.String()
}
|
||||
|
||||
// URLDomain returns the Host component of val when val parses as a URL. If
// parsing fails, val is returned unchanged.
func URLDomain(val string) string {
	parsed, err := url.Parse(val)
	if err != nil {
		return val
	}
	return parsed.Host
}
|
41
src/content/htmlutil/utils.go
Normal file
41
src/content/htmlutil/utils.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
func HTML(node *html.Node) string {
|
||||
writer := strings.Builder{}
|
||||
html.Render(&writer, node)
|
||||
return writer.String()
|
||||
}
|
||||
|
||||
func InnerHTML(node *html.Node) string {
|
||||
writer := strings.Builder{}
|
||||
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
||||
html.Render(&writer, c)
|
||||
}
|
||||
return writer.String()
|
||||
}
|
||||
|
||||
func Attr(node *html.Node, key string) string {
|
||||
for _, a := range node.Attr {
|
||||
if strings.EqualFold(a.Key, key) {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func Text(node *html.Node) string {
|
||||
text := make([]string, 0)
|
||||
isTextNode := func(n *html.Node) bool {
|
||||
return n.Type == html.TextNode
|
||||
}
|
||||
for _, n := range FindNodes(node, isTextNode) {
|
||||
text = append(text, strings.TrimSpace(n.Data))
|
||||
}
|
||||
return strings.Join(text, " ")
|
||||
}
|
Reference in New Issue
Block a user