refactor crawler

Nazar Kanaev 2021-03-24 13:33:30 +00:00
parent e0e6166cdf
commit b40fe94147
3 changed files with 112 additions and 146 deletions


@@ -155,35 +155,32 @@ func (s *Server) handleFeedList(c *router.Context) {
 			return
 		}
-		feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url)
-		if err != nil {
-			log.Print(err)
+		result, err := worker.DiscoverFeed(form.Url)
+		switch {
+		case err != nil:
+			log.Printf("Failed to discover feed for %s: %s", form.Url, err)
 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
-			return
-		}
-
-		if feed != nil {
-			storedFeed := s.db.CreateFeed(
-				feed.Title,
+		case len(result.Sources) > 0:
+			c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": result.Sources})
+		case result.Feed != nil:
+			feed := s.db.CreateFeed(
+				result.Feed.Title,
 				"",
-				feed.SiteURL,
-				feedUrl,
+				result.Feed.SiteURL,
+				result.FeedLink,
 				form.FolderID,
 			)
-			s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed))
-			icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink)
+			s.db.CreateItems(worker.ConvertItems(result.Feed.Items, *feed))
+			icon, err := worker.FindFavicon(feed.Link, feed.FeedLink)
 			if icon != nil {
-				s.db.UpdateFeedIcon(storedFeed.Id, icon)
+				s.db.UpdateFeedIcon(feed.Id, icon)
 			}
 			if err != nil {
-				log.Printf("Failed to find favicon for %s (%d): %s", storedFeed.FeedLink, storedFeed.Id, err)
+				log.Printf("Failed to find favicon for %s (%d): %s", feed.FeedLink, feed.Id, err)
 			}
 			c.JSON(http.StatusOK, map[string]string{"status": "success"})
-		} else if sources != nil {
-			c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": sources})
-		} else {
+		default:
 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
 		}
 	}
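
Note: after this change the handler collapses to three response shapes. A hedged sketch of exercising it from the outside; the endpoint path /api/feeds, port 7070, and request body fields are assumptions for illustration, not taken from this diff:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical request body; the key mirrors the form field the handler reads.
	payload, _ := json.Marshal(map[string]interface{}{
		"url": "https://example.com/blog",
	})
	res, err := http.Post("http://localhost:7070/api/feeds", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	var body map[string]interface{}
	if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
		panic(err)
	}

	// One of the three branches of the new switch:
	//   "success"  - result.Feed was parsed and stored
	//   "multiple" - result.Sources came back; candidates are under "choice"
	//   "notfound" - discovery failed or found nothing
	fmt.Println(body["status"])
}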

src/worker/client.go (new file, 52 lines)

@@ -0,0 +1,52 @@
+package worker
+
+import (
+	"net"
+	"net/http"
+	"time"
+)
+
+type Client struct {
+	httpClient *http.Client
+	userAgent  string
+}
+
+func (c *Client) get(url string) (*http.Response, error) {
+	return c.getConditional(url, "", "")
+}
+
+func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", c.userAgent)
+	if lastModified != "" {
+		req.Header.Set("If-Modified-Since", lastModified)
+	}
+	if etag != "" {
+		req.Header.Set("If-None-Match", etag)
+	}
+	return c.httpClient.Do(req)
+}
+
+var client *Client
+
+func init() {
+	transport := &http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout: 10 * time.Second,
+		}).DialContext,
+		DisableKeepAlives:   true,
+		TLSHandshakeTimeout: time.Second * 10,
+	}
+	httpClient := &http.Client{
+		Timeout:   time.Second * 30,
+		Transport: transport,
+	}
+	client = &Client{
+		httpClient: httpClient,
+		userAgent:  "Yarr/1.0",
+	}
+}
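
Note: the client is deliberately minimal; get is just getConditional with empty validators. A usage sketch, assuming it sits next to this file in package worker; the URL, Last-Modified value, and ETag below are hypothetical:

// In package worker:
func exampleConditionalFetch() error {
	// Plain GET: sends only the Yarr/1.0 User-Agent.
	res, err := client.get("https://example.com/feed.xml")
	if err != nil {
		return err
	}
	res.Body.Close()

	// Conditional GET: the validators become If-Modified-Since and
	// If-None-Match headers, so the server may reply 304 Not Modified.
	res, err = client.getConditional(
		"https://example.com/feed.xml",
		"Wed, 24 Mar 2021 13:33:30 GMT", // previously stored Last-Modified
		`"deadbeef"`,                    // previously stored ETag
	)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	if res.StatusCode == http.StatusNotModified {
		return nil // cached copy is still fresh; skip parsing
	}
	// ... read and parse res.Body as usual
	return nil
}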


@@ -5,11 +5,8 @@ import (
 	"errors"
 	"fmt"
 	"io/ioutil"
-	"log"
-	"net"
 	"net/http"
 	"net/url"
-	"time"
 
 	"github.com/nkanaev/yarr/src/scraper"
 	"github.com/nkanaev/yarr/src/parser"
@@ -22,102 +19,54 @@ type FeedSource struct {
 	Url   string `json:"url"`
 }
 
-type Client struct {
-	httpClient *http.Client
-	userAgent  string
+type DiscoverResult struct {
+	Feed     *parser.Feed
+	FeedLink string
+	Sources  []FeedSource
 }
 
-func (c *Client) get(url string) (*http.Response, error) {
-	req, err := http.NewRequest("GET", url, nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("User-Agent", c.userAgent)
-	return c.httpClient.Do(req)
-}
-
-func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
-	req, err := http.NewRequest("GET", url, nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("User-Agent", c.userAgent)
-	if lastModified != "" {
-		req.Header.Set("If-Modified-Since", lastModified)
-	}
-	if etag != "" {
-		req.Header.Set("If-None-Match", etag)
-	}
-	return c.httpClient.Do(req)
-}
-
-var defaultClient *Client
-
-func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
-	sources := make([]FeedSource, 0, 0)
-	for url, title := range scraper.FindFeeds(string(html), siteurl) {
-		sources = append(sources, FeedSource{Title: title, Url: url})
-	}
-	return sources, nil
-}
-
-func DiscoverFeed(candidateUrl string) (*parser.Feed, string, *[]FeedSource, error) {
+func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
+	result := &DiscoverResult{}
 	// Query URL
-	res, err := defaultClient.get(candidateUrl)
+	res, err := client.get(candidateUrl)
 	if err != nil {
-		return nil, "", nil, err
+		return nil, err
 	}
 	defer res.Body.Close()
 	if res.StatusCode != 200 {
-		errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
-		return nil, "", nil, errors.New(errmsg)
+		return nil, fmt.Errorf("status code %d", res.StatusCode)
 	}
 	content, err := ioutil.ReadAll(res.Body)
 	if err != nil {
-		return nil, "", nil, err
+		return nil, err
 	}
 
 	// Try to feed into parser
 	feed, err := parser.Parse(bytes.NewReader(content))
 	if err == nil {
-		/*
-			// WILD: feeds may not always have link to themselves
-			if len(feed.FeedLink) == 0 {
-				feed.FeedLink = candidateUrl
-			}
-		*/
-		// WILD: resolve relative links (path, without host)
-		/*
-			base, _ := url.Parse(candidateUrl)
-			if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
-				feed.Link = base.ResolveReference(link).String()
-			}
-			if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
-				feed.FeedLink = base.ResolveReference(link).String()
-			}
-		*/
-		err := feed.TranslateURLs(candidateUrl)
-		if err != nil {
-			log.Printf("Failed to translate feed urls: %s", err)
-		}
-		return feed, candidateUrl, nil, nil
+		feed.TranslateURLs(candidateUrl)
+		result.Feed = feed
+		result.FeedLink = candidateUrl
+		return result, nil
 	}
 
 	// Possibly an html link. Search for feed links
-	sources, err := searchFeedLinks(content, candidateUrl)
-	if err != nil {
-		return nil, "", nil, err
-	} else if len(sources) == 0 {
-		return nil, "", nil, errors.New("No feeds found at the given url")
-	} else if len(sources) == 1 {
+	sources := make([]FeedSource, 0)
+	for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
+		sources = append(sources, FeedSource{Title: title, Url: url})
+	}
+	switch {
+	case len(sources) == 0:
+		return nil, errors.New("No feeds found at the given url")
+	case len(sources) == 1:
 		if sources[0].Url == candidateUrl {
-			return nil, "", nil, errors.New("Recursion!")
+			return nil, errors.New("Recursion!")
 		}
 		return DiscoverFeed(sources[0].Url)
 	}
-	return nil, "", &sources, nil
+
+	result.Sources = sources
+	return result, nil
 }
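
Note: the four-value return is gone; callers now branch on which DiscoverResult fields are populated, and on success exactly one of Feed or Sources is set. A hedged caller sketch (the function and its log lines are illustrative, not part of the codebase; it assumes "log" is imported):

// In package worker:
func exampleDiscover(url string) {
	result, err := DiscoverFeed(url)
	switch {
	case err != nil:
		log.Printf("discovery failed for %s: %s", url, err)
	case result.Feed != nil:
		// Parsed directly; FeedLink records the URL that served the feed.
		log.Printf("found feed %q at %s", result.Feed.Title, result.FeedLink)
	case len(result.Sources) > 0:
		// An HTML page that links to several candidate feeds.
		for _, src := range result.Sources {
			log.Printf("candidate: %s (%s)", src.Title, src.Url)
		}
	}
}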
@@ -132,7 +81,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
 	}
 
 	if len(websiteUrl) != 0 {
-		res, err := defaultClient.get(websiteUrl)
+		res, err := client.get(websiteUrl)
 		if err != nil {
 			return nil, err
 		}
@@ -157,7 +106,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
 		"image/gif",
 	}
 	for _, url := range candidateUrls {
-		res, err := defaultClient.get(url)
+		res, err := client.get(url)
 		if err != nil {
 			continue
 		}
@@ -180,18 +129,6 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
 	result := make([]storage.Item, len(items))
 	for i, item := range items {
 		item := item
-		podcastUrl := item.PodcastURL
-		/*
-			var podcastUrl *string
-			if item.Enclosures != nil {
-				for _, enclosure := range item.Enclosures {
-					if strings.ToLower(enclosure.Type) == "audio/mpeg" {
-						podcastUrl = &enclosure.URL
-					}
-				}
-			}
-		*/
 		result[i] = storage.Item{
 			GUID:   item.GUID,
 			FeedId: feed.Id,
@@ -203,33 +140,30 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
 			Date:   &item.Date,
 			Status: storage.UNREAD,
 			Image:  item.ImageURL,
-			PodcastURL: &podcastUrl,
+			PodcastURL: nil,
 		}
 	}
 	return result
 }
 
 func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
-	var res *http.Response
-	var err error
-
-	httpState := db.GetHTTPState(f.Id)
-	if httpState != nil {
-		res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag)
-	} else {
-		res, err = defaultClient.get(f.FeedLink)
+	lmod := ""
+	etag := ""
+	if state := db.GetHTTPState(f.Id); state != nil {
+		lmod = state.LastModified
+		etag = state.Etag
 	}
-
+	res, err := client.getConditional(f.FeedLink, lmod, etag)
 	if err != nil {
 		return nil, fmt.Errorf("unable to get: %s", err)
 	}
 	defer res.Body.Close()
 
-	if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
+	switch {
+	case res.StatusCode < 200 || res.StatusCode > 399:
 		return nil, fmt.Errorf("status code %d", res.StatusCode)
-	}
-	if res.StatusCode == 304 {
+	case res.StatusCode == http.StatusNotModified:
 		return nil, nil
 	}
@@ -237,34 +171,17 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to init response body: %s", err)
 	}
 	feed, err := parser.Parse(body)
 	if err != nil {
 		return nil, fmt.Errorf("failed to parse: %s", err)
 	}
-	lastModified := res.Header.Get("Last-Modified")
-	etag := res.Header.Get("Etag")
-	if lastModified != "" || etag != "" {
-		db.SetHTTPState(f.Id, lastModified, etag)
+	lmod = res.Header.Get("Last-Modified")
+	etag = res.Header.Get("Etag")
+	if lmod != "" || etag != "" {
+		db.SetHTTPState(f.Id, lmod, etag)
 	}
 	return ConvertItems(feed.Items, f), nil
 }
-
-func init() {
-	transport := &http.Transport{
-		Proxy: http.ProxyFromEnvironment,
-		DialContext: (&net.Dialer{
-			Timeout: 10 * time.Second,
-		}).DialContext,
-		DisableKeepAlives:   true,
-		TLSHandshakeTimeout: time.Second * 10,
-	}
-	httpClient := &http.Client{
-		Timeout:   time.Second * 30,
-		Transport: transport,
-	}
-	defaultClient = &Client{
-		httpClient: httpClient,
-		userAgent:  "Yarr/1.0",
-	}
-}
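
Note: the net effect in listItems is a single conditional GET driven by stored HTTP state. A self-contained sketch of that round-trip, with a map standing in for db.GetHTTPState/db.SetHTTPState; the httpState type and cache here are stand-ins, not the real storage layer:

package main

import (
	"fmt"
	"net/http"
)

type httpState struct{ LastModified, Etag string }

var cache = map[int64]*httpState{} // stand-in for the storage layer

func fetch(feedId int64, feedLink string) error {
	lmod, etag := "", ""
	if state := cache[feedId]; state != nil {
		lmod, etag = state.LastModified, state.Etag
	}

	// Equivalent of client.getConditional: empty validators degrade to a plain GET.
	req, err := http.NewRequest("GET", feedLink, nil)
	if err != nil {
		return err
	}
	if lmod != "" {
		req.Header.Set("If-Modified-Since", lmod)
	}
	if etag != "" {
		req.Header.Set("If-None-Match", etag)
	}
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	if res.StatusCode == http.StatusNotModified {
		return nil // nothing changed since the last crawl
	}
	// Remember the validators so the next crawl can be conditional.
	if lm, et := res.Header.Get("Last-Modified"), res.Header.Get("Etag"); lm != "" || et != "" {
		cache[feedId] = &httpState{LastModified: lm, Etag: et}
	}
	fmt.Println("fetched", feedLink)
	return nil
}

func main() {
	fetch(1, "https://example.com/feed.xml") // hypothetical feed URL
	fetch(1, "https://example.com/feed.xml") // second call may now hit 304
}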