From 9ede816078974e8cef9f2b645b3d3861191627e5 Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Tue, 23 Mar 2021 15:14:58 +0000 Subject: [PATCH] rewrite crawler --- src/worker/crawler.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/worker/crawler.go b/src/worker/crawler.go index 01d59c9..6e21b20 100644 --- a/src/worker/crawler.go +++ b/src/worker/crawler.go @@ -14,6 +14,7 @@ import ( "github.com/nkanaev/yarr/src/crawler" feedparser "github.com/nkanaev/yarr/src/feed" "github.com/nkanaev/yarr/src/storage" + "golang.org/x/net/html/charset" ) type FeedSource struct { @@ -41,8 +42,12 @@ func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, return nil, err } req.Header.Set("User-Agent", c.userAgent) - req.Header.Set("If-Modified-Since", lastModified) - req.Header.Set("If-None-Match", etag) + if lastModified != "" { + req.Header.Set("If-Modified-Since", lastModified) + } + if etag != "" { + req.Header.Set("If-None-Match", etag) + } return c.httpClient.Do(req) } @@ -215,29 +220,32 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { } if err != nil { - return nil, err + return nil, fmt.Errorf("unable to get: %s", err) } defer res.Body.Close() if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 { - errmsg := fmt.Sprintf("Failed to list feed items for %s (status: %d)", f.FeedLink, res.StatusCode) - return nil, errors.New(errmsg) + return nil, fmt.Errorf("status code %d", res.StatusCode) } if res.StatusCode == 304 { return nil, nil } + body, err := charset.NewReader(res.Body, res.Header.Get("Content-Type")) + if err != nil { + return nil, fmt.Errorf("failed to init response body: %s", err) + } + feed, err := feedparser.Parse(body) + if err != nil { + return nil, fmt.Errorf("failed to parse: %s", err) + } + lastModified := res.Header.Get("Last-Modified") etag := res.Header.Get("Etag") if lastModified != "" || etag != "" { db.SetHTTPState(f.Id, lastModified, etag) } - - feed, err := feedparser.Parse(res.Body) - if err != nil { - return nil, err - } return ConvertItems(feed.Items, f), nil }