diff --git a/src/crawler/finder.go b/src/crawler/finder.go index 3ffb48c..8fe17d2 100644 --- a/src/crawler/finder.go +++ b/src/crawler/finder.go @@ -14,6 +14,8 @@ func FindFeeds(body string, base string) map[string]string { return candidates } + // find direct links + // css: link[type=application/atom+xml] linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"} isFeedLink := func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "link" { @@ -35,27 +37,19 @@ func FindFeeds(body string, base string) map[string]string { } } + // guess by hyperlink properties if len(candidates) == 0 { - // guess by hyperlink properties: - // - a[href="feed"] - // - a:contains("rss") - // ...etc + // css: a[href="feed"] + // css: a:contains("rss") feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"} feedTexts := []string{"rss", "feed"} isFeedHyperLink := func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "a" { - href := strings.Trim(getAttr(n, "href"), "/") - text := getText(n) - - for _, feedHref := range feedHrefs { - if strings.HasSuffix(href, feedHref) { - return true - } + if any(feedHrefs, strings.Trim(getAttr(n, "href"), "/"), strings.HasSuffix) { + return true } - for _, feedText := range feedTexts { - if strings.EqualFold(text, feedText) { - return true - } + if any(feedTexts, getText(n), strings.EqualFold) { + return true } } return false @@ -71,3 +65,23 @@ func FindFeeds(body string, base string) map[string]string { return candidates } + +func FindIcons(body string, base string) []string { + icons := make([]string, 0) + + doc, err := html.Parse(strings.NewReader(body)) + if err != nil { + return icons + } + + // css: link[rel=icon] + isLink := func(n *html.Node) bool { + return n.Type == html.ElementNode && n.Data == "link" + } + for _, node := range getNodes(doc, isLink) { + if any(strings.Split(getAttr(node, "rel"), " "), "icon", strings.EqualFold) { + icons = append(icons, absoluteUrl(getAttr(node, "href"), base)) + } + } + return icons +} diff --git a/src/crawler/finder_test.go b/src/crawler/finder_test.go index 90a5377..a0f596f 100644 --- a/src/crawler/finder_test.go +++ b/src/crawler/finder_test.go @@ -46,7 +46,7 @@ func TestFindFeedsLinks(t *testing.T) { } func TestFindFeedsGuess(t *testing.T) { - x := ` + body := `
@@ -60,15 +60,38 @@ func TestFindFeedsGuess(t *testing.T) { ` - r := FindFeeds(x, base) - - e := map[string]string{ + have := FindFeeds(body, base) + want := map[string]string{ base + "/feed.xml": "", base + "/news": "", } - if !reflect.DeepEqual(e, r) { - t.Logf("want: %#v", e) - t.Logf("have: %#v", r) + if !reflect.DeepEqual(want, have) { + t.Logf("want: %#v", want) + t.Logf("have: %#v", have) + t.Fatal("invalid result") + } +} + +func TestFindIcons(t *testing.T) { + body := ` + + + + +