separate package for crawler

Nazar Kanaev 2021-03-18 11:05:47 +00:00
parent fc3383946d
commit 1f042a8434
4 changed files with 205 additions and 109 deletions

src/crawler/finder.go Normal file

@@ -0,0 +1,73 @@
package crawler

import (
	"strings"

	"golang.org/x/net/html"
)

// FindFeeds scans an HTML document for feed candidates, returning a map
// of absolute feed URL to feed title (which may be empty).
func FindFeeds(body string, base string) map[string]string {
candidates := make(map[string]string)
doc, err := html.Parse(strings.NewReader(body))
if err != nil {
return candidates
}
linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
isFeedLink := func(n *html.Node) bool {
if n.Type == html.ElementNode && n.Data == "link" {
t := getAttr(n, "type")
for _, tt := range linkTypes {
if tt == t {
return true
}
}
}
return false
}
for _, node := range getNodes(doc, isFeedLink) {
href := getAttr(node, "href")
name := getAttr(node, "title")
link := absoluteUrl(href, base)
if link != "" {
candidates[link] = name
}
}
if len(candidates) == 0 {
// guess by hyperlink properties:
// - a[href="feed"]
// - a:contains("rss")
// ...etc
feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
feedTexts := []string{"rss", "feed"}
isFeedHyperLink := func(n *html.Node) bool {
if n.Type == html.ElementNode && n.Data == "a" {
href := strings.Trim(getAttr(n, "href"), "/")
text := getText(n)
for _, feedHref := range feedHrefs {
if strings.HasSuffix(href, feedHref) {
return true
}
}
for _, feedText := range feedTexts {
if strings.EqualFold(text, feedText) {
return true
}
}
}
return false
}
for _, node := range getNodes(doc, isFeedHyperLink) {
href := getAttr(node, "href")
link := absoluteUrl(href, base)
if link != "" {
candidates[link] = ""
}
}
}
return candidates
}
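
For reference, a minimal sketch of how the new package might be called from outside; the HTTP fetch, the target URL, and the import path are illustrative assumptions, not part of this commit:

package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/nkanaev/yarr/src/crawler" // assumed module path
)

func main() {
	// Fetch a page, then scan its markup for feed candidates.
	res, err := http.Get("https://example.com")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	body, err := io.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	// Keys are absolute feed URLs; values are titles, possibly empty.
	for link, title := range crawler.FindFeeds(string(body), "https://example.com") {
		fmt.Println(link, title)
	}
}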

src/crawler/finder_test.go Normal file

@@ -0,0 +1,74 @@
package crawler

import (
	"reflect"
	"testing"
)

const base = "http://example.com"

func TestFindFeedsInvalidHTML(t *testing.T) {
x := `some nonsense`
r := FindFeeds(x, base)
if len(r) != 0 {
t.Fatal("not expecting results")
}
}

func TestFindFeedsLinks(t *testing.T) {
x := `
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
<link rel="alternate" href="/feed.xml" type="application/rss+xml" title="rss with title">
<link rel="alternate" href="/atom.xml" type="application/atom+xml">
<link rel="alternate" href="/feed.json" type="application/json">
</head>
<body>
<a href="/feed.xml">rss</a>
</body>
</html>
`
have := FindFeeds(x, base)
want := map[string]string{
base + "/feed.xml": "rss with title",
base + "/atom.xml": "",
base + "/feed.json": "",
}
if !reflect.DeepEqual(have, want) {
t.Logf("want: %#v", want)
t.Logf("have: %#v", have)
t.Fatal("invalid result")
}
}

func TestFindFeedsGuess(t *testing.T) {
x := `
<!DOCTYPE html>
<html lang="en">
<body>
<!-- negative -->
<a href="/about">what is rss?</a>
<a href="/feed/cows">moo</a>
<!-- positive -->
<a href="/feed.xml">subscribe</a>
<a href="/news">rss</a>
</body>
</html>
`
r := FindFeeds(x, base)
e := map[string]string{
base + "/feed.xml": "",
base + "/news": "",
}
if !reflect.DeepEqual(e, r) {
t.Logf("want: %#v", e)
t.Logf("have: %#v", r)
t.Fatal("invalid result")
}
}
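
Assuming the repository's usual src/ layout and Go modules, the new tests run with the standard tooling:

go test ./src/crawler/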

src/crawler/utils.go Normal file

@@ -0,0 +1,58 @@
package crawler

import (
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

// getAttr returns the value of the named attribute, or "" if absent.
func getAttr(node *html.Node, key string) string {
for _, a := range node.Attr {
if a.Key == key {
return a.Val
}
}
return ""
}

// getText returns the concatenated, space-separated text content beneath node.
func getText(node *html.Node) string {
text := make([]string, 0)
isTextNode := func(n *html.Node) bool {
return n.Type == html.TextNode
}
for _, n := range getNodes(node, isTextNode) {
text = append(text, strings.TrimSpace(n.Data))
}
return strings.Join(text, " ")
}

// getNodes traverses the tree breadth-first, collecting the nodes
// for which match returns true.
func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
nodes := make([]*html.Node, 0)
queue := make([]*html.Node, 0)
queue = append(queue, node)
for len(queue) > 0 {
var n *html.Node
n, queue = queue[0], queue[1:]
if match(n) {
nodes = append(nodes, n)
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
queue = append(queue, c)
}
}
return nodes
}

// absoluteUrl resolves href against base, returning "" if either fails to parse.
func absoluteUrl(href, base string) string {
baseUrl, err := url.Parse(base)
if err != nil {
return ""
}
hrefUrl, err := url.Parse(href)
if err != nil {
return ""
}
return baseUrl.ResolveReference(hrefUrl).String()
}
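
The resolution in absoluteUrl follows net/url's ResolveReference semantics: relative hrefs are joined against the base, while absolute hrefs pass through untouched. A hypothetical package-level test (not part of this commit) illustrating that behavior:

package crawler

import "testing"

// TestAbsoluteUrl is illustrative only: it documents the resolution
// behavior absoluteUrl inherits from url.ResolveReference.
func TestAbsoluteUrl(t *testing.T) {
	cases := map[string]string{
		"/feed.xml":            "http://example.com/feed.xml", // rooted path joins the host
		"atom.xml":             "http://example.com/atom.xml", // relative path resolves against base
		"http://other.org/rss": "http://other.org/rss",        // absolute URLs win outright
	}
	for href, want := range cases {
		if have := absoluteUrl(href, "http://example.com"); have != want {
			t.Fatalf("absoluteUrl(%q): have %q, want %q", href, have, want)
		}
	}
}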


@@ -1,109 +0,0 @@
package worker

import (
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

func getAttr(node *html.Node, key string) string {
for _, a := range node.Attr {
if a.Key == key {
return a.Val
}
}
return ""
}

func getText(node *html.Node) string {
text := make([]string, 0)
isTextNode := func(n *html.Node) bool {
return n.Type == html.TextNode
}
for _, n := range getNodes(node, isTextNode) {
text = append(text, strings.TrimSpace(n.Data))
}
return strings.Join(text, " ")
}

func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
nodes := make([]*html.Node, 0)
queue := make([]*html.Node, 0)
queue = append(queue, node)
for len(queue) > 0 {
queue, n := queue[1:], queue[0]
if match(n) {
nodes = append(nodes, n)
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
queue = append(queue, c)
}
}
return nodes
}

func FindFeeds(doc *html.Node, baseUrl *url.URL) []*FeedSource {
candidates := make(map[string]string)
linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
isFeedLink := func(n *html.Node) bool {
if n.Type == html.ElementNode && n.Data == "link" {
t := getAttr(n, "type")
for _, tt := range linkTypes {
if tt == t {
return true
}
}
}
return false
}
for _, node := range getNodes(doc, isFeedLink) {
		href := getAttr(node, "href")
		name := getAttr(node, "title")
		// href is a string and must be parsed before it can be
		// resolved against the base URL
		hrefUrl, err := url.Parse(href)
		if err != nil {
			continue
		}
		link := baseUrl.ResolveReference(hrefUrl).String()
		if href != "" {
			candidates[link] = name
		}
}
if len(candidates) == 0 {
// guess by hyperlink properties:
// - a[href="feed"]
// - a:contains("rss")
// ...etc
feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
feedTexts := []string{"rss", "feed"}
isFeedHyperLink := func(n *html.Node) bool {
if n.Type == html.ElementNode && n.Data == "a" {
href := strings.Trim(getAttr(n, "href"), "/")
			text := strings.ToLower(getText(n))
for _, feedHref := range feedHrefs {
if strings.HasSuffix(href, feedHref) {
return true
}
}
for _, feedText := range feedTexts {
if strings.Contains(text, feedText) {
return true
}
}
}
return false
}
for _, node := range getNodes(doc, isFeedHyperLink) {
			href := getAttr(node, "href")
			hrefUrl, err := url.Parse(href)
			if err != nil {
				continue
			}
			link := baseUrl.ResolveReference(hrefUrl).String()
			candidates[link] = ""
}
}
sources := make([]*FeedSource, 0, len(candidates))
for url, title := range candidates {
sources = append(sources, &FeedSource{Title: title, Url: url})
}
return sources
}