From 1f042a843413c8353301bd10d332fdee6f0d5779 Mon Sep 17 00:00:00 2001
From: Nazar Kanaev
Date: Thu, 18 Mar 2021 11:05:47 +0000
Subject: [PATCH] separate package for crawler

---
 src/crawler/finder.go      |  73 +++++++++++++++++++++++++
 src/crawler/finder_test.go |  74 +++++++++++++++++++++++++
 src/crawler/utils.go       |  58 ++++++++++++++++++++
 src/worker/crawl.go        | 109 -------------------------------------
 4 files changed, 205 insertions(+), 109 deletions(-)
 create mode 100644 src/crawler/finder.go
 create mode 100644 src/crawler/finder_test.go
 create mode 100644 src/crawler/utils.go
 delete mode 100644 src/worker/crawl.go

diff --git a/src/crawler/finder.go b/src/crawler/finder.go
new file mode 100644
index 0000000..3ffb48c
--- /dev/null
+++ b/src/crawler/finder.go
@@ -0,0 +1,73 @@
+package crawler
+
+import (
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+func FindFeeds(body string, base string) map[string]string {
+	candidates := make(map[string]string)
+
+	doc, err := html.Parse(strings.NewReader(body))
+	if err != nil {
+		return candidates
+	}
+
+	linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
+	isFeedLink := func(n *html.Node) bool {
+		if n.Type == html.ElementNode && n.Data == "link" {
+			t := getAttr(n, "type")
+			for _, tt := range linkTypes {
+				if tt == t {
+					return true
+				}
+			}
+		}
+		return false
+	}
+	for _, node := range getNodes(doc, isFeedLink) {
+		href := getAttr(node, "href")
+		name := getAttr(node, "title")
+		link := absoluteUrl(href, base)
+		if link != "" {
+			candidates[link] = name
+		}
+	}
+
+	if len(candidates) == 0 {
+		// guess by hyperlink properties:
+		// - a[href="feed"]
+		// - a:contains("rss")
+		// ...etc
+		feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
+		feedTexts := []string{"rss", "feed"}
+		isFeedHyperLink := func(n *html.Node) bool {
+			if n.Type == html.ElementNode && n.Data == "a" {
+				href := strings.Trim(getAttr(n, "href"), "/")
+				text := getText(n)
+
+				for _, feedHref := range feedHrefs {
+					if strings.HasSuffix(href, feedHref) {
+						return true
+					}
+				}
+				for _, feedText := range feedTexts {
+					if strings.EqualFold(text, feedText) {
+						return true
+					}
+				}
+			}
+			return false
+		}
+		for _, node := range getNodes(doc, isFeedHyperLink) {
+			href := getAttr(node, "href")
+			link := absoluteUrl(href, base)
+			if link != "" {
+				candidates[link] = ""
+			}
+		}
+	}
+
+	return candidates
+}
diff --git a/src/crawler/finder_test.go b/src/crawler/finder_test.go
new file mode 100644
index 0000000..90a5377
--- /dev/null
+++ b/src/crawler/finder_test.go
@@ -0,0 +1,74 @@
+package crawler
+
+import (
+	"reflect"
+	"testing"
+)
+
+const base = "http://example.com"
+
+func TestFindFeedsInvalidHTML(t *testing.T) {
+	x := `some nonsense`
+	r := FindFeeds(x, base)
+	if len(r) != 0 {
+		t.Fatal("not expecting results")
+	}
+}
+
+func TestFindFeedsLinks(t *testing.T) {
+	x := `
+		<!DOCTYPE html>
+		<html>
+		<head>
+			<title>test</title>
+			<link rel="alternate" type="application/rss+xml" href="/feed.xml" title="rss with title">
+			<link rel="alternate" type="application/atom+xml" href="/atom.xml">
+			<link rel="alternate" type="application/json" href="/feed.json">
+		</head>
+		<body>
+			<p>
+				<a href="/feed.xml">rss</a>
+			</p>
+		</body>
+		</html>
+	`
+	have := FindFeeds(x, base)
+
+	want := map[string]string{
+		base + "/feed.xml":  "rss with title",
+		base + "/atom.xml":  "",
+		base + "/feed.json": "",
+	}
+	if !reflect.DeepEqual(have, want) {
+		t.Logf("want: %#v", want)
+		t.Logf("have: %#v", have)
+		t.Fatal("invalid result")
+	}
+}
+
+func TestFindFeedsGuess(t *testing.T) {
+	x := `
+		<!DOCTYPE html>
+		<html>
+		<body>
+			<nav>
+				<a href="/about">what is rss?</a>
+				<a href="/moo">moo</a>
+			</nav>
+			<footer>
+				<a href="/feed.xml">subscribe</a>
+				<a href="/news">rss</a>
+			</footer>
+		</body>
+		</html>
+	`
+	r := FindFeeds(x, base)
+
+	e := map[string]string{
+		base + "/feed.xml": "",
+		base + "/news":     "",
+	}
+	if !reflect.DeepEqual(e, r) {
+		t.Logf("want: %#v", e)
+		t.Logf("have: %#v", r)
+		t.Fatal("invalid result")
+	}
+}
diff --git a/src/crawler/utils.go b/src/crawler/utils.go
new file mode 100644
index 0000000..7291c4d
--- /dev/null
+++ b/src/crawler/utils.go
@@ -0,0 +1,58 @@
+package crawler
+
+import (
+	"net/url"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+func getAttr(node *html.Node, key string) string {
+	for _, a := range node.Attr {
+		if a.Key == key {
+			return a.Val
+		}
+	}
+	return ""
+}
+
+func getText(node *html.Node) string {
+	text := make([]string, 0)
+	isTextNode := func(n *html.Node) bool {
+		return n.Type == html.TextNode
+	}
+	for _, n := range getNodes(node, isTextNode) {
+		text = append(text, strings.TrimSpace(n.Data))
+	}
+	return strings.Join(text, " ")
+}
+
+func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
+	nodes := make([]*html.Node, 0)
+
+	queue := make([]*html.Node, 0)
+	queue = append(queue, node)
+	for len(queue) > 0 {
+		var n *html.Node
+		n, queue = queue[0], queue[1:]
+		if match(n) {
+			nodes = append(nodes, n)
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			queue = append(queue, c)
+		}
+	}
+	return nodes
+}
+
+func absoluteUrl(href, base string) string {
+	baseUrl, err := url.Parse(base)
+	if err != nil {
+		return ""
+	}
+	hrefUrl, err := url.Parse(href)
+	if err != nil {
+		return ""
+	}
+	return baseUrl.ResolveReference(hrefUrl).String()
+}
diff --git a/src/worker/crawl.go b/src/worker/crawl.go
deleted file mode 100644
index df29d14..0000000
--- a/src/worker/crawl.go
+++ /dev/null
@@ -1,109 +0,0 @@
-package worker
-
-import (
-	"net/url"
-	"strings"
-
-	"golang.org/x/net/html"
-)
-
-func getAttr(node *html.Node, key string) string {
-	for _, a := range node.Attr {
-		if a.Key == key {
-			return a.Val
-		}
-	}
-	return ""
-}
-
-func getText(node *html.Node) string {
-	text := make([]string, 0)
-	isTextNode := func(n *html.Node) bool {
-		return n.Type == html.TextNode
-	}
-	for _, n := range getNodes(node, isTextNode) {
-		text = append(text, strings.TrimSpace(n.Data))
-	}
-	return strings.Join(text, " ")
-}
-
-func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
-	nodes := make([]*html.Node, 0)
-
-	queue := make([]*html.Node, 0)
-	queue = append(queue, node)
-	for len(queue) > 0 {
-		queue, n := queue[1:], queue[0]
-		if match(n) {
-			nodes = append(nodes, n)
-		}
-		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			queue = append(queue, c)
-		}
-	}
-	return nodes
-}
-
-func FindFeeds(doc *html.Node, baseUrl *url.URL) []*FeedSource {
-	candidates := make(map[string]string)
-
-	linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
-	isFeedLink := func(n *html.Node) bool {
-		if n.Type == html.ElementNode && n.Data == "link" {
-			t := getAttr(n, "type")
-			for _, tt := range linkTypes {
-				if tt == t {
-					return true
-				}
-			}
-		}
-		return false
-	}
-	for _, node := range getNodes(doc, isFeedLink) {
-		href := getAttr(node, "href")
-		name := getAttr(node, "title")
-		link := baseUrl.ResolveReference(href).String()
-
-		if href != "" {
-			candidates[link] = name
-		}
-	}
-
-	if len(candidates) == 0 {
-		// guess by hyperlink properties:
-		// - a[href="feed"]
-		// - a:contains("rss")
-		// ...etc
-		feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
-		feedTexts := []string{"rss", "feed"}
-		isFeedHyperLink := func(n *html.Node) bool {
-			if n.Type == html.ElementNode && n.Data == "a" {
-				href := strings.Trim(getAttr(n, "href"), "/")
-				text := strings.ToLower(getText(n))
-
-				for _, feedHref := range feedHrefs {
-					if strings.HasSuffix(href, feedHref) {
-						return true
-					}
-				}
-				for _, feedText := range feedTexts {
-					if strings.Contains(text, feedText) {
-						return true
-					}
-				}
-			}
-			return false
-		}
-		for _, node := range getNodes(doc, isFeedHyperLink) {
-			href := getAttr(node, "href")
-			link := baseUrl.ResolveReference(href).String()
-			candidates[link] = ""
-		}
-	}
-
-	sources := make([]*FeedSource, 0, len(candidates))
-	for url, title := range candidates {
-		sources = append(sources, &FeedSource{Title: title, Url: url})
-	}
-	return sources
-}
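
Note (not part of the patch): a minimal sketch of how the relocated API might be driven from calling code. The module path github.com/nkanaev/yarr, the example URL, and the HTTP fetching around the call are illustrative assumptions; the patch itself only provides crawler.FindFeeds(body, base string) map[string]string, which maps absolute feed URLs to their titles (empty string for guessed hyperlinks).

package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/nkanaev/yarr/src/crawler"
)

func main() {
	// Fetch a page that may advertise feeds via <link> tags or
	// plain "rss"/"feed" hyperlinks. (hypothetical example URL)
	base := "http://example.com"
	res, err := http.Get(base)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}

	// Feed candidates, keyed by absolute URL; relative hrefs have
	// already been resolved against the base URL.
	for url, title := range crawler.FindFeeds(string(body), base) {
		fmt.Printf("%s %q\n", url, title)
	}
}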