mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
rewrite crawler
This commit is contained in:
parent
646519e074
commit
9ede816078
@@ -14,6 +14,7 @@ import (
|
|||||||
"github.com/nkanaev/yarr/src/crawler"
|
"github.com/nkanaev/yarr/src/crawler"
|
||||||
feedparser "github.com/nkanaev/yarr/src/feed"
|
feedparser "github.com/nkanaev/yarr/src/feed"
|
||||||
"github.com/nkanaev/yarr/src/storage"
|
"github.com/nkanaev/yarr/src/storage"
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
)
|
)
|
||||||
|
|
||||||
type FeedSource struct {
|
type FeedSource struct {
|
||||||
@@ -41,8 +42,12 @@ func (c *Client) getConditional(url, lastModified, etag string) (*http.Response,
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
req.Header.Set("User-Agent", c.userAgent)
|
req.Header.Set("User-Agent", c.userAgent)
|
||||||
req.Header.Set("If-Modified-Since", lastModified)
|
if lastModified != "" {
|
||||||
req.Header.Set("If-None-Match", etag)
|
req.Header.Set("If-Modified-Since", lastModified)
|
||||||
|
}
|
||||||
|
if etag != "" {
|
||||||
|
req.Header.Set("If-None-Match", etag)
|
||||||
|
}
|
||||||
return c.httpClient.Do(req)
|
return c.httpClient.Do(req)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,29 +220,32 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, fmt.Errorf("unable to get: %s", err)
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
defer res.Body.Close()
|
||||||
|
|
||||||
if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
|
if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
|
||||||
errmsg := fmt.Sprintf("Failed to list feed items for %s (status: %d)", f.FeedLink, res.StatusCode)
|
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
||||||
return nil, errors.New(errmsg)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if res.StatusCode == 304 {
|
if res.StatusCode == 304 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
body, err := charset.NewReader(res.Body, res.Header.Get("Content-Type"))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to init response body: %s", err)
|
||||||
|
}
|
||||||
|
feed, err := feedparser.Parse(body)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
lastModified := res.Header.Get("Last-Modified")
|
lastModified := res.Header.Get("Last-Modified")
|
||||||
etag := res.Header.Get("Etag")
|
etag := res.Header.Get("Etag")
|
||||||
if lastModified != "" || etag != "" {
|
if lastModified != "" || etag != "" {
|
||||||
db.SetHTTPState(f.Id, lastModified, etag)
|
db.SetHTTPState(f.Id, lastModified, etag)
|
||||||
}
|
}
|
||||||
|
|
||||||
feed, err := feedparser.Parse(res.Body)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return ConvertItems(feed.Items, f), nil
|
return ConvertItems(feed.Items, f), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user