diff --git a/src/server/routes.go b/src/server/routes.go index 453033c..78375f3 100644 --- a/src/server/routes.go +++ b/src/server/routes.go @@ -155,35 +155,32 @@ func (s *Server) handleFeedList(c *router.Context) { return } - feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url) - if err != nil { - log.Print(err) + result, err := worker.DiscoverFeed(form.Url) + switch { + case err != nil: + log.Printf("Faild to discover feed for %s: %s", form.Url, err) c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) - return - } - - if feed != nil { - storedFeed := s.db.CreateFeed( - feed.Title, + case len(result.Sources) > 0: + c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": result.Sources}) + case result.Feed != nil: + feed := s.db.CreateFeed( + result.Feed.Title, "", - feed.SiteURL, - feedUrl, + result.Feed.SiteURL, + result.FeedLink, form.FolderID, ) - s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed)) + s.db.CreateItems(worker.ConvertItems(result.Feed.Items, *feed)) - icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink) + icon, err := worker.FindFavicon(feed.Link, feed.FeedLink) if icon != nil { - s.db.UpdateFeedIcon(storedFeed.Id, icon) + s.db.UpdateFeedIcon(feed.Id, icon) } if err != nil { - log.Printf("Failed to find favicon for %s (%d): %s", storedFeed.FeedLink, storedFeed.Id, err) + log.Printf("Failed to find favicon for %s (%d): %s", feed.FeedLink, feed.Id, err) } - c.JSON(http.StatusOK, map[string]string{"status": "success"}) - } else if sources != nil { - c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": sources}) - } else { + default: c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) } } diff --git a/src/worker/client.go b/src/worker/client.go new file mode 100644 index 0000000..d7cac5d --- /dev/null +++ b/src/worker/client.go @@ -0,0 +1,52 @@ +package worker + +import ( + "net" + "net/http" + "time" +) + +type Client struct { + httpClient *http.Client + userAgent string +} + +func (c *Client) get(url string) (*http.Response, error) { + return c.getConditional(url, "", "") +} + +func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) { + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", c.userAgent) + if lastModified != "" { + req.Header.Set("If-Modified-Since", lastModified) + } + if etag != "" { + req.Header.Set("If-None-Match", etag) + } + return c.httpClient.Do(req) +} + +var client *Client + +func init() { + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + }).DialContext, + DisableKeepAlives: true, + TLSHandshakeTimeout: time.Second * 10, + } + httpClient := &http.Client{ + Timeout: time.Second * 30, + Transport: transport, + } + client = &Client{ + httpClient: httpClient, + userAgent: "Yarr/1.0", + } +} diff --git a/src/worker/crawler.go b/src/worker/crawler.go index f922e63..c5d63d4 100644 --- a/src/worker/crawler.go +++ b/src/worker/crawler.go @@ -5,11 +5,8 @@ import ( "errors" "fmt" "io/ioutil" - "log" - "net" "net/http" "net/url" - "time" "github.com/nkanaev/yarr/src/scraper" "github.com/nkanaev/yarr/src/parser" @@ -22,102 +19,54 @@ type FeedSource struct { Url string `json:"url"` } -type Client struct { - httpClient *http.Client - userAgent string +type DiscoverResult struct { + Feed *parser.Feed + FeedLink string + Sources []FeedSource } -func (c *Client) get(url string) (*http.Response, error) { - req, err := http.NewRequest("GET", url, nil) - if err != nil { - return nil, err - } - req.Header.Set("User-Agent", c.userAgent) - return c.httpClient.Do(req) -} - -func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) { - req, err := http.NewRequest("GET", url, nil) - if err != nil { - return nil, err - } - req.Header.Set("User-Agent", c.userAgent) - if lastModified != "" { - req.Header.Set("If-Modified-Since", lastModified) - } - if etag != "" { - req.Header.Set("If-None-Match", etag) - } - return c.httpClient.Do(req) -} - -var defaultClient *Client - -func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) { - sources := make([]FeedSource, 0, 0) - for url, title := range scraper.FindFeeds(string(html), siteurl) { - sources = append(sources, FeedSource{Title: title, Url: url}) - } - return sources, nil -} - -func DiscoverFeed(candidateUrl string) (*parser.Feed, string, *[]FeedSource, error) { +func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) { + result := &DiscoverResult{} // Query URL - res, err := defaultClient.get(candidateUrl) + res, err := client.get(candidateUrl) if err != nil { - return nil, "", nil, err + return nil, err } defer res.Body.Close() if res.StatusCode != 200 { - errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode) - return nil, "", nil, errors.New(errmsg) + return nil, fmt.Errorf("status code %d", res.StatusCode) } content, err := ioutil.ReadAll(res.Body) if err != nil { - return nil, "", nil, err + return nil, err } // Try to feed into parser feed, err := parser.Parse(bytes.NewReader(content)) if err == nil { - /* - // WILD: feeds may not always have link to themselves - if len(feed.FeedLink) == 0 { - feed.FeedLink = candidateUrl - } - */ - - // WILD: resolve relative links (path, without host) - /* - base, _ := url.Parse(candidateUrl) - if link, err := url.Parse(feed.Link); err == nil && link.Host == "" { - feed.Link = base.ResolveReference(link).String() - } - if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" { - feed.FeedLink = base.ResolveReference(link).String() - } - */ - err := feed.TranslateURLs(candidateUrl) - if err != nil { - log.Printf("Failed to translate feed urls: %s", err) - } - - return feed, candidateUrl, nil, nil + feed.TranslateURLs(candidateUrl) + result.Feed = feed + result.FeedLink = candidateUrl + return result, nil } // Possibly an html link. Search for feed links - sources, err := searchFeedLinks(content, candidateUrl) - if err != nil { - return nil, "", nil, err - } else if len(sources) == 0 { - return nil, "", nil, errors.New("No feeds found at the given url") - } else if len(sources) == 1 { + sources := make([]FeedSource, 0) + for url, title := range scraper.FindFeeds(string(content), candidateUrl) { + sources = append(sources, FeedSource{Title: title, Url: url}) + } + switch { + case len(sources) == 0: + return nil, errors.New("No feeds found at the given url") + case len(sources) == 1: if sources[0].Url == candidateUrl { - return nil, "", nil, errors.New("Recursion!") + return nil, errors.New("Recursion!") } return DiscoverFeed(sources[0].Url) } - return nil, "", &sources, nil + + result.Sources = sources + return result, nil } func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { @@ -132,7 +81,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { } if len(websiteUrl) != 0 { - res, err := defaultClient.get(websiteUrl) + res, err := client.get(websiteUrl) if err != nil { return nil, err } @@ -157,7 +106,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { "image/gif", } for _, url := range candidateUrls { - res, err := defaultClient.get(url) + res, err := client.get(url) if err != nil { continue } @@ -180,18 +129,6 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item { result := make([]storage.Item, len(items)) for i, item := range items { item := item - podcastUrl := item.PodcastURL - - /* - var podcastUrl *string - if item.Enclosures != nil { - for _, enclosure := range item.Enclosures { - if strings.ToLower(enclosure.Type) == "audio/mpeg" { - podcastUrl = &enclosure.URL - } - } - } - */ result[i] = storage.Item{ GUID: item.GUID, FeedId: feed.Id, @@ -203,33 +140,30 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item { Date: &item.Date, Status: storage.UNREAD, Image: item.ImageURL, - PodcastURL: &podcastUrl, + PodcastURL: nil, } } return result } func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { - var res *http.Response - var err error - - httpState := db.GetHTTPState(f.Id) - if httpState != nil { - res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag) - } else { - res, err = defaultClient.get(f.FeedLink) + lmod := "" + etag := "" + if state := db.GetHTTPState(f.Id); state != nil { + lmod = state.LastModified + etag = state.Etag } + res, err := client.getConditional(f.FeedLink, lmod, etag) if err != nil { return nil, fmt.Errorf("unable to get: %s", err) } defer res.Body.Close() - if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 { + switch { + case res.StatusCode < 200 || res.StatusCode > 399: return nil, fmt.Errorf("status code %d", res.StatusCode) - } - - if res.StatusCode == 304 { + case res.StatusCode == http.StatusNotModified: return nil, nil } @@ -237,34 +171,17 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { if err != nil { return nil, fmt.Errorf("failed to init response body: %s", err) } + feed, err := parser.Parse(body) if err != nil { return nil, fmt.Errorf("failed to parse: %s", err) } - lastModified := res.Header.Get("Last-Modified") - etag := res.Header.Get("Etag") - if lastModified != "" || etag != "" { - db.SetHTTPState(f.Id, lastModified, etag) + lmod = res.Header.Get("Last-Modified") + etag = res.Header.Get("Etag") + if lmod != "" || etag != "" { + db.SetHTTPState(f.Id, lmod, etag) } + return ConvertItems(feed.Items, f), nil } - -func init() { - transport := &http.Transport{ - Proxy: http.ProxyFromEnvironment, - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - }).DialContext, - DisableKeepAlives: true, - TLSHandshakeTimeout: time.Second * 10, - } - httpClient := &http.Client{ - Timeout: time.Second * 30, - Transport: transport, - } - defaultClient = &Client{ - httpClient: httpClient, - userAgent: "Yarr/1.0", - } -}