refactor crawler

This commit is contained in:
Nazar Kanaev 2021-03-24 13:33:30 +00:00
parent e0e6166cdf
commit b40fe94147
3 changed files with 112 additions and 146 deletions

View File

@ -155,35 +155,32 @@ func (s *Server) handleFeedList(c *router.Context) {
return
}
feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url)
if err != nil {
log.Print(err)
result, err := worker.DiscoverFeed(form.Url)
switch {
case err != nil:
log.Printf("Faild to discover feed for %s: %s", form.Url, err)
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
return
}
if feed != nil {
storedFeed := s.db.CreateFeed(
feed.Title,
case len(result.Sources) > 0:
c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": result.Sources})
case result.Feed != nil:
feed := s.db.CreateFeed(
result.Feed.Title,
"",
feed.SiteURL,
feedUrl,
result.Feed.SiteURL,
result.FeedLink,
form.FolderID,
)
s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed))
s.db.CreateItems(worker.ConvertItems(result.Feed.Items, *feed))
icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink)
icon, err := worker.FindFavicon(feed.Link, feed.FeedLink)
if icon != nil {
s.db.UpdateFeedIcon(storedFeed.Id, icon)
s.db.UpdateFeedIcon(feed.Id, icon)
}
if err != nil {
log.Printf("Failed to find favicon for %s (%d): %s", storedFeed.FeedLink, storedFeed.Id, err)
log.Printf("Failed to find favicon for %s (%d): %s", feed.FeedLink, feed.Id, err)
}
c.JSON(http.StatusOK, map[string]string{"status": "success"})
} else if sources != nil {
c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": sources})
} else {
default:
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
}
}

52
src/worker/client.go Normal file
View File

@ -0,0 +1,52 @@
package worker
import (
"net"
"net/http"
"time"
)
type Client struct {
httpClient *http.Client
userAgent string
}
func (c *Client) get(url string) (*http.Response, error) {
return c.getConditional(url, "", "")
}
func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", c.userAgent)
if lastModified != "" {
req.Header.Set("If-Modified-Since", lastModified)
}
if etag != "" {
req.Header.Set("If-None-Match", etag)
}
return c.httpClient.Do(req)
}
var client *Client
func init() {
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
DisableKeepAlives: true,
TLSHandshakeTimeout: time.Second * 10,
}
httpClient := &http.Client{
Timeout: time.Second * 30,
Transport: transport,
}
client = &Client{
httpClient: httpClient,
userAgent: "Yarr/1.0",
}
}

View File

@ -5,11 +5,8 @@ import (
"errors"
"fmt"
"io/ioutil"
"log"
"net"
"net/http"
"net/url"
"time"
"github.com/nkanaev/yarr/src/scraper"
"github.com/nkanaev/yarr/src/parser"
@ -22,102 +19,54 @@ type FeedSource struct {
Url string `json:"url"`
}
type Client struct {
httpClient *http.Client
userAgent string
type DiscoverResult struct {
Feed *parser.Feed
FeedLink string
Sources []FeedSource
}
func (c *Client) get(url string) (*http.Response, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", c.userAgent)
return c.httpClient.Do(req)
}
func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", c.userAgent)
if lastModified != "" {
req.Header.Set("If-Modified-Since", lastModified)
}
if etag != "" {
req.Header.Set("If-None-Match", etag)
}
return c.httpClient.Do(req)
}
var defaultClient *Client
func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
sources := make([]FeedSource, 0, 0)
for url, title := range scraper.FindFeeds(string(html), siteurl) {
sources = append(sources, FeedSource{Title: title, Url: url})
}
return sources, nil
}
func DiscoverFeed(candidateUrl string) (*parser.Feed, string, *[]FeedSource, error) {
func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
result := &DiscoverResult{}
// Query URL
res, err := defaultClient.get(candidateUrl)
res, err := client.get(candidateUrl)
if err != nil {
return nil, "", nil, err
return nil, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
return nil, "", nil, errors.New(errmsg)
return nil, fmt.Errorf("status code %d", res.StatusCode)
}
content, err := ioutil.ReadAll(res.Body)
if err != nil {
return nil, "", nil, err
return nil, err
}
// Try to feed into parser
feed, err := parser.Parse(bytes.NewReader(content))
if err == nil {
/*
// WILD: feeds may not always have link to themselves
if len(feed.FeedLink) == 0 {
feed.FeedLink = candidateUrl
}
*/
// WILD: resolve relative links (path, without host)
/*
base, _ := url.Parse(candidateUrl)
if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
feed.Link = base.ResolveReference(link).String()
}
if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
feed.FeedLink = base.ResolveReference(link).String()
}
*/
err := feed.TranslateURLs(candidateUrl)
if err != nil {
log.Printf("Failed to translate feed urls: %s", err)
}
return feed, candidateUrl, nil, nil
feed.TranslateURLs(candidateUrl)
result.Feed = feed
result.FeedLink = candidateUrl
return result, nil
}
// Possibly an html link. Search for feed links
sources, err := searchFeedLinks(content, candidateUrl)
if err != nil {
return nil, "", nil, err
} else if len(sources) == 0 {
return nil, "", nil, errors.New("No feeds found at the given url")
} else if len(sources) == 1 {
sources := make([]FeedSource, 0)
for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
sources = append(sources, FeedSource{Title: title, Url: url})
}
switch {
case len(sources) == 0:
return nil, errors.New("No feeds found at the given url")
case len(sources) == 1:
if sources[0].Url == candidateUrl {
return nil, "", nil, errors.New("Recursion!")
return nil, errors.New("Recursion!")
}
return DiscoverFeed(sources[0].Url)
}
return nil, "", &sources, nil
result.Sources = sources
return result, nil
}
func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
@ -132,7 +81,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
}
if len(websiteUrl) != 0 {
res, err := defaultClient.get(websiteUrl)
res, err := client.get(websiteUrl)
if err != nil {
return nil, err
}
@ -157,7 +106,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
"image/gif",
}
for _, url := range candidateUrls {
res, err := defaultClient.get(url)
res, err := client.get(url)
if err != nil {
continue
}
@ -180,18 +129,6 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
result := make([]storage.Item, len(items))
for i, item := range items {
item := item
podcastUrl := item.PodcastURL
/*
var podcastUrl *string
if item.Enclosures != nil {
for _, enclosure := range item.Enclosures {
if strings.ToLower(enclosure.Type) == "audio/mpeg" {
podcastUrl = &enclosure.URL
}
}
}
*/
result[i] = storage.Item{
GUID: item.GUID,
FeedId: feed.Id,
@ -203,33 +140,30 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
Date: &item.Date,
Status: storage.UNREAD,
Image: item.ImageURL,
PodcastURL: &podcastUrl,
PodcastURL: nil,
}
}
return result
}
func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
var res *http.Response
var err error
httpState := db.GetHTTPState(f.Id)
if httpState != nil {
res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag)
} else {
res, err = defaultClient.get(f.FeedLink)
lmod := ""
etag := ""
if state := db.GetHTTPState(f.Id); state != nil {
lmod = state.LastModified
etag = state.Etag
}
res, err := client.getConditional(f.FeedLink, lmod, etag)
if err != nil {
return nil, fmt.Errorf("unable to get: %s", err)
}
defer res.Body.Close()
if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
switch {
case res.StatusCode < 200 || res.StatusCode > 399:
return nil, fmt.Errorf("status code %d", res.StatusCode)
}
if res.StatusCode == 304 {
case res.StatusCode == http.StatusNotModified:
return nil, nil
}
@ -237,34 +171,17 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
if err != nil {
return nil, fmt.Errorf("failed to init response body: %s", err)
}
feed, err := parser.Parse(body)
if err != nil {
return nil, fmt.Errorf("failed to parse: %s", err)
}
lastModified := res.Header.Get("Last-Modified")
etag := res.Header.Get("Etag")
if lastModified != "" || etag != "" {
db.SetHTTPState(f.Id, lastModified, etag)
}
return ConvertItems(feed.Items, f), nil
lmod = res.Header.Get("Last-Modified")
etag = res.Header.Get("Etag")
if lmod != "" || etag != "" {
db.SetHTTPState(f.Id, lmod, etag)
}
func init() {
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
DisableKeepAlives: true,
TLSHandshakeTimeout: time.Second * 10,
}
httpClient := &http.Client{
Timeout: time.Second * 30,
Transport: transport,
}
defaultClient = &Client{
httpClient: httpClient,
userAgent: "Yarr/1.0",
}
return ConvertItems(feed.Items, f), nil
}