diff --git a/server/crawler.go b/server/crawler.go index 2ca73e8..4d9f758 100644 --- a/server/crawler.go +++ b/server/crawler.go @@ -47,6 +47,17 @@ func (c *Client) get(url string) (*http.Response, error) { return c.httpClient.Do(req) } +func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) { + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", c.userAgent) + req.Header.Set("If-Modified-Since", lastModified) + req.Header.Set("If-None-Match", etag) + return c.httpClient.Do(req) +} + var defaultClient *Client func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) { @@ -243,16 +254,37 @@ func convertItems(items []*gofeed.Item, feed storage.Feed) []storage.Item { return result } -func listItems(f storage.Feed) ([]storage.Item, error) { - res, err := defaultClient.get(f.FeedLink) +func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { + var res *http.Response + var err error + + httpState := db.GetHTTPState(f.FeedLink) + if httpState != nil { + res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag) + } else { + res, err = defaultClient.get(f.FeedLink) + } + if err != nil { return nil, err } defer res.Body.Close() + if res.StatusCode == 404 { errmsg := fmt.Sprintf("Failed to list feed items for %s (status: 404)", f.FeedLink) return nil, errors.New(errmsg) } + + if res.StatusCode == 304 { + return nil, nil + } + + lastModified := res.Header.Get("Last-Modified") + etag := res.Header.Get("Etag") + if lastModified != "" || etag != "" { + db.SetHTTPState(f.FeedLink, storage.HTTPState{LastModified: lastModified, Etag: etag}) + } + feedparser := gofeed.NewParser() feed, err := feedparser.Parse(res.Body) if err != nil { diff --git a/server/server.go b/server/server.go index ffa9b13..bce365d 100644 --- a/server/server.go +++ b/server/server.go @@ -71,7 +71,7 @@ func (h *Handler) startJobs() { for { select { case feed := <-h.feedQueue: - items, err := listItems(feed) + items, err := listItems(feed, h.db) atomic.AddInt32(h.queueSize, -1) if err != nil { h.log.Printf("Failed to fetch %s (%d): %s", feed.FeedLink, feed.Id, err)