From c89644052528aed74a4571811c549b6ecb74155e Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Thu, 18 Mar 2021 11:30:01 +0000 Subject: [PATCH] find favicons --- src/crawler/finder.go | 44 +++++++++++++++++++++++++------------- src/crawler/finder_test.go | 37 ++++++++++++++++++++++++++------ src/crawler/utils.go | 9 ++++++++ 3 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/crawler/finder.go b/src/crawler/finder.go index 3ffb48c..8fe17d2 100644 --- a/src/crawler/finder.go +++ b/src/crawler/finder.go @@ -14,6 +14,8 @@ func FindFeeds(body string, base string) map[string]string { return candidates } + // find direct links + // css: link[type=application/atom+xml] linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"} isFeedLink := func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "link" { @@ -35,27 +37,19 @@ func FindFeeds(body string, base string) map[string]string { } } + // guess by hyperlink properties if len(candidates) == 0 { - // guess by hyperlink properties: - // - a[href="feed"] - // - a:contains("rss") - // ...etc + // css: a[href="feed"] + // css: a:contains("rss") feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"} feedTexts := []string{"rss", "feed"} isFeedHyperLink := func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "a" { - href := strings.Trim(getAttr(n, "href"), "/") - text := getText(n) - - for _, feedHref := range feedHrefs { - if strings.HasSuffix(href, feedHref) { - return true - } + if any(feedHrefs, strings.Trim(getAttr(n, "href"), "/"), strings.HasSuffix) { + return true } - for _, feedText := range feedTexts { - if strings.EqualFold(text, feedText) { - return true - } + if any(feedTexts, getText(n), strings.EqualFold) { + return true } } return false @@ -71,3 +65,23 @@ func FindFeeds(body string, base string) map[string]string { return candidates } + +func FindIcons(body string, base string) []string { + icons := make([]string, 0) + + doc, err := html.Parse(strings.NewReader(body)) + if err != nil { + return icons + } + + // css: link[rel=icon] + isLink := func(n *html.Node) bool { + return n.Type == html.ElementNode && n.Data == "link" + } + for _, node := range getNodes(doc, isLink) { + if any(strings.Split(getAttr(node, "rel"), " "), "icon", strings.EqualFold) { + icons = append(icons, absoluteUrl(getAttr(node, "href"), base)) + } + } + return icons +} diff --git a/src/crawler/finder_test.go b/src/crawler/finder_test.go index 90a5377..a0f596f 100644 --- a/src/crawler/finder_test.go +++ b/src/crawler/finder_test.go @@ -46,7 +46,7 @@ func TestFindFeedsLinks(t *testing.T) { } func TestFindFeedsGuess(t *testing.T) { - x := ` + body := ` @@ -60,15 +60,38 @@ func TestFindFeedsGuess(t *testing.T) { ` - r := FindFeeds(x, base) - - e := map[string]string{ + have := FindFeeds(body, base) + want := map[string]string{ base + "/feed.xml": "", base + "/news": "", } - if !reflect.DeepEqual(e, r) { - t.Logf("want: %#v", e) - t.Logf("have: %#v", r) + if !reflect.DeepEqual(want, have) { + t.Logf("want: %#v", want) + t.Logf("have: %#v", have) + t.Fatal("invalid result") + } +} + +func TestFindIcons(t *testing.T) { + body := ` + + + + + + + + + + + + + ` + have := FindIcons(body, base) + want := []string{base + "/favicon.ico", base + "/path/to/favicon.png"} + if !reflect.DeepEqual(have, want) { + t.Logf("want: %#v", want) + t.Logf("have: %#v", have) t.Fatal("invalid result") } } diff --git a/src/crawler/utils.go b/src/crawler/utils.go index 7291c4d..0381fe0 100644 --- a/src/crawler/utils.go +++ b/src/crawler/utils.go @@ -7,6 +7,15 @@ import ( "golang.org/x/net/html" ) +func any(els []string, el string, match func(string, string) bool) bool { + for _, x := range els { + if match(x, el) { + return true + } + } + return false +} + func getAttr(node *html.Node, key string) string { for _, a := range node.Attr { if a.Key == key {