From 796832025c4202817e3f1c65af9725ead0a90889 Mon Sep 17 00:00:00 2001
From: Nazar Kanaev
Date: Mon, 14 Sep 2020 12:17:39 +0100
Subject: [PATCH] eliminate duplicate links

---
 server/crawler.go | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/server/crawler.go b/server/crawler.go
index 100c067..2e7d1cf 100644
--- a/server/crawler.go
+++ b/server/crawler.go
@@ -60,17 +60,36 @@ func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
 		return sources, err
 	}
 
+	// feed {url: title} map
+	feeds := make(map[string]string)
+
 	doc.Find(feedLinks).Each(func(i int, s *goquery.Selection) {
+		// Unlikely to happen, but don't get more than N links
+		if len(feeds) > 10 {
+			return
+		}
 		if href, ok := s.Attr("href"); ok {
 			feedUrl, err := url.Parse(href)
 			if err != nil {
 				return
 			}
+
			title := s.AttrOr("title", "")
 			url := base.ResolveReference(feedUrl).String()
-			sources = append(sources, FeedSource{Title: title, Url: url})
+
+			if _, alreadyExists := feeds[url]; alreadyExists {
+				if feeds[url] == "" {
+					feeds[url] = title
+				}
+			} else {
+				feeds[url] = title
+			}
 		}
 	})
+
+	for url, title := range feeds {
+		sources = append(sources, FeedSource{Title: title, Url: url})
+	}
 	return sources, nil
 }
 