diff --git a/go.mod b/go.mod
index f4ef7ff..1192447 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,6 @@ module github.com/nkanaev/yarr
 go 1.16
 
 require (
-	github.com/PuerkitoBio/goquery v1.5.1
 	github.com/mattn/go-sqlite3 v1.14.0
 	github.com/mmcdole/gofeed v1.0.0
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
diff --git a/src/worker/crawler.go b/src/worker/crawler.go
index a5fb2a2..5ab4ddc 100644
--- a/src/worker/crawler.go
+++ b/src/worker/crawler.go
@@ -4,9 +4,9 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
-	"github.com/PuerkitoBio/goquery"
 	"github.com/mmcdole/gofeed"
 	"github.com/nkanaev/yarr/src/storage"
+	"github.com/nkanaev/yarr/src/crawler"
 	"io/ioutil"
 	"net"
 	"net/http"
@@ -63,44 +63,7 @@ var defaultClient *Client
 
 func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
 	sources := make([]FeedSource, 0, 0)
-
-	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
-	if err != nil {
-		return sources, err
-	}
-	base, err := url.Parse(siteurl)
-	if err != nil {
-		return sources, err
-	}
-
-	// feed {url: title} map
-	feeds := make(map[string]string)
-
-	doc.Find(feedLinks).Each(func(i int, s *goquery.Selection) {
-		// Unlikely to happen, but don't get more than N links
-		if len(feeds) > 10 {
-			return
-		}
-		if href, ok := s.Attr("href"); ok {
-			feedUrl, err := url.Parse(href)
-			if err != nil {
-				return
-			}
-
-			title := s.AttrOr("title", "")
-			url := base.ResolveReference(feedUrl).String()
-
-			if _, alreadyExists := feeds[url]; alreadyExists {
-				if feeds[url] == "" {
-					feeds[url] = title
-				}
-			} else {
-				feeds[url] = title
-			}
-		}
-	})
-
-	for url, title := range feeds {
+	for url, title := range crawler.FindFeeds(string(html), siteurl) {
 		sources = append(sources, FeedSource{Title: title, Url: url})
 	}
 	return sources, nil
@@ -170,29 +133,16 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
 	}
 
 	if len(websiteUrl) != 0 {
-		base, err := url.Parse(websiteUrl)
-		if err != nil {
-			return nil, err
-		}
 		res, err := defaultClient.get(websiteUrl)
 		if err != nil {
 			return nil, err
 		}
+		body, err := ioutil.ReadAll(res.Body)
 		defer res.Body.Close()
-		doc, err := goquery.NewDocumentFromReader(res.Body)
 		if err != nil {
 			return nil, err
 		}
-		doc.Find(`link[rel=icon]`).EachWithBreak(func(i int, s *goquery.Selection) bool {
-			if href, ok := s.Attr("href"); ok {
-				if hrefUrl, err := url.Parse(href); err == nil {
-					faviconUrl := base.ResolveReference(hrefUrl).String()
-					candidateUrls = append(candidateUrls, faviconUrl)
-				}
-			}
-			return true
-		})
-
+		candidateUrls = append(candidateUrls, crawler.FindIcons(string(body), websiteUrl)...)
 		if c := favicon(websiteUrl); len(c) != 0 {
 			candidateUrls = append(candidateUrls, c)
 		}
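
Note: the new src/crawler package itself is not part of this diff. From the two call sites one can infer the signatures FindFeeds(body, base string) map[string]string (ranged as url -> title) and FindIcons(body, base string) []string. Below is a minimal sketch of what such a package could look like, assuming it walks the DOM with golang.org/x/net/html (which go.mod keeps); the helper names (attr, absolute, walk), the feed MIME-type matching (the old feedLinks selector is defined outside this diff), and the exact rel=icon match are all assumptions, not the actual implementation.

// Sketch of src/crawler; only FindFeeds/FindIcons and their signatures are
// inferred from the call sites above, the rest is an assumption written
// against golang.org/x/net/html.
package crawler

import (
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

// attr returns the value of the named attribute on node, or "" if absent.
func attr(node *html.Node, key string) string {
	for _, a := range node.Attr {
		if strings.EqualFold(a.Key, key) {
			return a.Val
		}
	}
	return ""
}

// absolute resolves href against base, returning "" if either fails to parse.
func absolute(href, base string) string {
	baseUrl, err := url.Parse(base)
	if err != nil {
		return ""
	}
	hrefUrl, err := url.Parse(href)
	if err != nil {
		return ""
	}
	return baseUrl.ResolveReference(hrefUrl).String()
}

// walk calls fn on every element node under root.
func walk(root *html.Node, fn func(*html.Node)) {
	if root.Type == html.ElementNode {
		fn(root)
	}
	for child := root.FirstChild; child != nil; child = child.NextSibling {
		walk(child, fn)
	}
}

// FindFeeds mirrors the removed goquery logic: collect <link> feed
// declarations into a {url: title} map, resolved against base, capped at 10,
// keeping the first non-empty title per url. Matching on these MIME types
// approximates the unseen feedLinks selector.
func FindFeeds(body string, base string) map[string]string {
	feeds := make(map[string]string)
	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return feeds
	}
	types := map[string]bool{
		"application/rss+xml":  true,
		"application/atom+xml": true,
		"application/json":     true,
	}
	walk(doc, func(n *html.Node) {
		if len(feeds) > 10 || n.Data != "link" || !types[strings.ToLower(attr(n, "type"))] {
			return
		}
		if href := absolute(attr(n, "href"), base); href != "" && feeds[href] == "" {
			feeds[href] = attr(n, "title")
		}
	})
	return feeds
}

// FindIcons mirrors the removed `link[rel=icon]` lookup, returning candidate
// favicon urls resolved against base.
func FindIcons(body string, base string) []string {
	icons := make([]string, 0)
	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return icons
	}
	walk(doc, func(n *html.Node) {
		if n.Data == "link" && strings.EqualFold(attr(n, "rel"), "icon") {
			if href := absolute(attr(n, "href"), base); href != "" {
				icons = append(icons, href)
			}
		}
	})
	return icons
}

Returning a map from FindFeeds preserves the old dedup-by-url behavior, which is why the call site in searchFeedLinks can range over it directly.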