diff --git a/server/crawler.go b/server/crawler.go
index 50f0092..832fd5b 100644
--- a/server/crawler.go
+++ b/server/crawler.go
@@ -1,6 +1,8 @@
 package server
 
 import (
+	"bytes"
+	"errors"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
 	"github.com/mmcdole/gofeed"
@@ -29,12 +31,18 @@ const feedLinks = `
 	a:contains("FEED")
 `
 
-func FindFeeds(r *http.Response) ([]FeedSource, error) {
+func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
 	sources := make([]FeedSource, 0, 0)
-	doc, err := goquery.NewDocumentFromResponse(r)
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
 	if err != nil {
 		return sources, err
 	}
+	base, err := url.Parse(siteurl)
+	if err != nil {
+		return sources, err
+	}
+
 	doc.Find(feedLinks).Each(func(i int, s *goquery.Selection) {
 		if href, ok := s.Attr("href"); ok {
 			feedUrl, err := url.Parse(href)
@@ -42,13 +50,56 @@ func FindFeeds(r *http.Response) ([]FeedSource, error) {
 				return
 			}
 			title := s.AttrOr("title", "")
-			url := doc.Url.ResolveReference(feedUrl).String()
+			url := base.ResolveReference(feedUrl).String()
 			sources = append(sources, FeedSource{Title: title, Url: url})
 		}
 	})
 	return sources, nil
 }
 
+func discoverFeed(url, userAgent string) (*gofeed.Feed, *[]FeedSource, error) {
+	// Query URL
+	feedreq, _ := http.NewRequest("GET", url, nil)
+	feedreq.Header.Set("user-agent", userAgent)
+	feedclient := &http.Client{}
+	res, err := feedclient.Do(feedreq)
+	if err != nil {
+		return nil, nil, err
+	} else if res.StatusCode != 200 {
+		errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", url, res.StatusCode)
+		return nil, nil, errors.New(errmsg)
+	}
+	content, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Try to feed into parser
+	feedparser := gofeed.NewParser()
+	feed, err := feedparser.Parse(bytes.NewReader(content))
+	if err == nil {
+		// WILD: some feeds do not have link to itself
+		if len(feed.FeedLink) == 0 {
+			feed.FeedLink = url
+		}
+		return feed, nil, nil
+	}
+
+	// Possibly an html link. Search for feed links
+	sources, err := searchFeedLinks(content, url)
+	if err != nil {
+		return nil, nil, err
+	} else if len(sources) == 0 {
+		return nil, nil, errors.New("No feeds found at the given url")
+	} else if len(sources) == 1 {
+		if sources[0].Url == url {
+			return nil, nil, errors.New("Recursion!")
+		}
+		return discoverFeed(sources[0].Url, userAgent)
+	}
+	return nil, &sources, nil
+}
+
 func findFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
 	candidateUrls := make([]string, 0)
 
@@ -142,24 +193,3 @@ func listItems(f storage.Feed) ([]storage.Item, error) {
 	}
 	return convertItems(feed.Items, f), nil
 }
-
-func createFeed(s *storage.Storage, url string, folderId *int64) error {
-	fp := gofeed.NewParser()
-	feed, err := fp.ParseURL(url)
-	if err != nil {
-		return err
-	}
-	feedLink := feed.FeedLink
-	if len(feedLink) == 0 {
-		feedLink = url
-	}
-	storedFeed := s.CreateFeed(
-		feed.Title,
-		feed.Description,
-		feed.Link,
-		feedLink,
-		folderId,
-	)
-	s.CreateItems(convertItems(feed.Items, *storedFeed))
-	return nil
-}
diff --git a/server/handlers.go b/server/handlers.go
index ada4626..b3d47d2 100644
--- a/server/handlers.go
+++ b/server/handlers.go
@@ -220,64 +220,34 @@ func FeedListHandler(rw http.ResponseWriter, req *http.Request) {
 		list := db(req).ListFeeds()
 		writeJSON(rw, list)
 	} else if req.Method == "POST" {
-		var feed FeedCreateForm
-		if err := json.NewDecoder(req.Body).Decode(&feed); err != nil {
+		var form FeedCreateForm
+		if err := json.NewDecoder(req.Body).Decode(&form); err != nil {
 			handler(req).log.Print(err)
 			rw.WriteHeader(http.StatusBadRequest)
 			return
 		}
 
-		feedUrl := feed.Url
-		feedreq, _ := http.NewRequest("GET", feedUrl, nil)
-		feedreq.Header.Set("user-agent", req.Header.Get("user-agent"))
-		feedclient := &http.Client{}
-		res, err := feedclient.Do(feedreq)
+		feed, sources, err := discoverFeed(form.Url, req.Header.Get("user-agent"))
 		if err != nil {
 			handler(req).log.Print(err)
 			writeJSON(rw, map[string]string{"status": "notfound"})
 			return
-		} else if res.StatusCode != 200 {
-			handler(req).log.Printf("Failed to fetch %s (status: %d)", feedUrl, res.StatusCode)
-			body, err := ioutil.ReadAll(res.Body)
-			handler(req).log.Print(string(body), err)
-			writeJSON(rw, map[string]string{"status": "notfound"})
-			return
 		}
 
-		contentType := res.Header.Get("Content-Type")
-		if strings.HasPrefix(contentType, "text/html") || contentType == "" {
-			sources, err := FindFeeds(res)
-			if err != nil {
-				handler(req).log.Print(err)
-				writeJSON(rw, map[string]string{"status": "notfound"})
-				return
-			}
-			if len(sources) == 0 {
-				writeJSON(rw, map[string]string{"status": "notfound"})
-			} else if len(sources) > 1 {
-				writeJSON(rw, map[string]interface{}{
-					"status": "multiple",
-					"choice": sources,
-				})
-			} else if len(sources) == 1 {
-				feedUrl = sources[0].Url
-				err = createFeed(db(req), feedUrl, feed.FolderID)
-				if err != nil {
-					handler(req).log.Print(err)
-					rw.WriteHeader(http.StatusBadRequest)
-					return
-				}
-				writeJSON(rw, map[string]string{"status": "success"})
-			}
-		} else if strings.Contains(contentType, "xml") || strings.Contains(contentType, "json") {
-			// text/xml, application/xml, application/rss+xml, application/atom+xml
-			err = createFeed(db(req), feedUrl, feed.FolderID)
-			if err == nil {
-				writeJSON(rw, map[string]string{"status": "success"})
-			}
+		if feed != nil {
+			storedFeed := db(req).CreateFeed(
+				feed.Title,
+				feed.Description,
+				feed.Link,
+				feed.FeedLink,
+				form.FolderID,
+			)
+			db(req).CreateItems(convertItems(feed.Items, *storedFeed))
+			writeJSON(rw, map[string]string{"status": "success"})
+		} else if sources != nil {
+			writeJSON(rw, map[string]interface{}{"status": "multiple", "choice": sources})
 		} else {
 			writeJSON(rw, map[string]string{"status": "notfound"})
-			return
 		}
 	}
 }
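For illustration only (not part of the patch): a minimal, standalone sketch of the URL resolution that searchFeedLinks now performs explicitly. A goquery document built with NewDocumentFromReader carries no URL of its own, so relative hrefs found in the page are resolved against the siteurl argument with url.Parse and ResolveReference, which is why the hunk above swaps doc.Url for base. The sketch is std-lib only and uses made-up URLs.

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// Stand-in for the siteurl argument that searchFeedLinks receives.
	base, err := url.Parse("https://example.com/blog/")
	if err != nil {
		panic(err)
	}

	// Relative and absolute hrefs, as they might appear in <link> or <a> tags.
	hrefs := []string{"feed.xml", "/atom.xml", "https://feeds.example.org/rss"}
	for _, href := range hrefs {
		ref, err := url.Parse(href)
		if err != nil {
			continue // skip malformed hrefs, as the Each callback does
		}
		fmt.Println(base.ResolveReference(ref).String())
	}
	// Output:
	// https://example.com/blog/feed.xml
	// https://example.com/atom.xml
	// https://feeds.example.org/rss
}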