mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
refactor crawler
This commit is contained in:
parent
e0e6166cdf
commit
b40fe94147
@ -155,35 +155,32 @@ func (s *Server) handleFeedList(c *router.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url)
|
result, err := worker.DiscoverFeed(form.Url)
|
||||||
if err != nil {
|
switch {
|
||||||
log.Print(err)
|
case err != nil:
|
||||||
|
log.Printf("Faild to discover feed for %s: %s", form.Url, err)
|
||||||
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
|
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
|
||||||
return
|
case len(result.Sources) > 0:
|
||||||
}
|
c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": result.Sources})
|
||||||
|
case result.Feed != nil:
|
||||||
if feed != nil {
|
feed := s.db.CreateFeed(
|
||||||
storedFeed := s.db.CreateFeed(
|
result.Feed.Title,
|
||||||
feed.Title,
|
|
||||||
"",
|
"",
|
||||||
feed.SiteURL,
|
result.Feed.SiteURL,
|
||||||
feedUrl,
|
result.FeedLink,
|
||||||
form.FolderID,
|
form.FolderID,
|
||||||
)
|
)
|
||||||
s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed))
|
s.db.CreateItems(worker.ConvertItems(result.Feed.Items, *feed))
|
||||||
|
|
||||||
icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink)
|
icon, err := worker.FindFavicon(feed.Link, feed.FeedLink)
|
||||||
if icon != nil {
|
if icon != nil {
|
||||||
s.db.UpdateFeedIcon(storedFeed.Id, icon)
|
s.db.UpdateFeedIcon(feed.Id, icon)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Failed to find favicon for %s (%d): %s", storedFeed.FeedLink, storedFeed.Id, err)
|
log.Printf("Failed to find favicon for %s (%d): %s", feed.FeedLink, feed.Id, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
c.JSON(http.StatusOK, map[string]string{"status": "success"})
|
c.JSON(http.StatusOK, map[string]string{"status": "success"})
|
||||||
} else if sources != nil {
|
default:
|
||||||
c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": sources})
|
|
||||||
} else {
|
|
||||||
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
|
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
52
src/worker/client.go
Normal file
52
src/worker/client.go
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
package worker
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Client struct {
|
||||||
|
httpClient *http.Client
|
||||||
|
userAgent string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Client) get(url string) (*http.Response, error) {
|
||||||
|
return c.getConditional(url, "", "")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
|
||||||
|
req, err := http.NewRequest("GET", url, nil)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
req.Header.Set("User-Agent", c.userAgent)
|
||||||
|
if lastModified != "" {
|
||||||
|
req.Header.Set("If-Modified-Since", lastModified)
|
||||||
|
}
|
||||||
|
if etag != "" {
|
||||||
|
req.Header.Set("If-None-Match", etag)
|
||||||
|
}
|
||||||
|
return c.httpClient.Do(req)
|
||||||
|
}
|
||||||
|
|
||||||
|
var client *Client
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
transport := &http.Transport{
|
||||||
|
Proxy: http.ProxyFromEnvironment,
|
||||||
|
DialContext: (&net.Dialer{
|
||||||
|
Timeout: 10 * time.Second,
|
||||||
|
}).DialContext,
|
||||||
|
DisableKeepAlives: true,
|
||||||
|
TLSHandshakeTimeout: time.Second * 10,
|
||||||
|
}
|
||||||
|
httpClient := &http.Client{
|
||||||
|
Timeout: time.Second * 30,
|
||||||
|
Transport: transport,
|
||||||
|
}
|
||||||
|
client = &Client{
|
||||||
|
httpClient: httpClient,
|
||||||
|
userAgent: "Yarr/1.0",
|
||||||
|
}
|
||||||
|
}
|
@ -5,11 +5,8 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
|
||||||
"net"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/nkanaev/yarr/src/scraper"
|
"github.com/nkanaev/yarr/src/scraper"
|
||||||
"github.com/nkanaev/yarr/src/parser"
|
"github.com/nkanaev/yarr/src/parser"
|
||||||
@ -22,102 +19,54 @@ type FeedSource struct {
|
|||||||
Url string `json:"url"`
|
Url string `json:"url"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Client struct {
|
type DiscoverResult struct {
|
||||||
httpClient *http.Client
|
Feed *parser.Feed
|
||||||
userAgent string
|
FeedLink string
|
||||||
|
Sources []FeedSource
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Client) get(url string) (*http.Response, error) {
|
func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
|
||||||
req, err := http.NewRequest("GET", url, nil)
|
result := &DiscoverResult{}
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
req.Header.Set("User-Agent", c.userAgent)
|
|
||||||
return c.httpClient.Do(req)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
|
|
||||||
req, err := http.NewRequest("GET", url, nil)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
req.Header.Set("User-Agent", c.userAgent)
|
|
||||||
if lastModified != "" {
|
|
||||||
req.Header.Set("If-Modified-Since", lastModified)
|
|
||||||
}
|
|
||||||
if etag != "" {
|
|
||||||
req.Header.Set("If-None-Match", etag)
|
|
||||||
}
|
|
||||||
return c.httpClient.Do(req)
|
|
||||||
}
|
|
||||||
|
|
||||||
var defaultClient *Client
|
|
||||||
|
|
||||||
func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
|
|
||||||
sources := make([]FeedSource, 0, 0)
|
|
||||||
for url, title := range scraper.FindFeeds(string(html), siteurl) {
|
|
||||||
sources = append(sources, FeedSource{Title: title, Url: url})
|
|
||||||
}
|
|
||||||
return sources, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func DiscoverFeed(candidateUrl string) (*parser.Feed, string, *[]FeedSource, error) {
|
|
||||||
// Query URL
|
// Query URL
|
||||||
res, err := defaultClient.get(candidateUrl)
|
res, err := client.get(candidateUrl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
defer res.Body.Close()
|
||||||
if res.StatusCode != 200 {
|
if res.StatusCode != 200 {
|
||||||
errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
|
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
||||||
return nil, "", nil, errors.New(errmsg)
|
|
||||||
}
|
}
|
||||||
content, err := ioutil.ReadAll(res.Body)
|
content, err := ioutil.ReadAll(res.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to feed into parser
|
// Try to feed into parser
|
||||||
feed, err := parser.Parse(bytes.NewReader(content))
|
feed, err := parser.Parse(bytes.NewReader(content))
|
||||||
if err == nil {
|
if err == nil {
|
||||||
/*
|
feed.TranslateURLs(candidateUrl)
|
||||||
// WILD: feeds may not always have link to themselves
|
result.Feed = feed
|
||||||
if len(feed.FeedLink) == 0 {
|
result.FeedLink = candidateUrl
|
||||||
feed.FeedLink = candidateUrl
|
return result, nil
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// WILD: resolve relative links (path, without host)
|
|
||||||
/*
|
|
||||||
base, _ := url.Parse(candidateUrl)
|
|
||||||
if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
|
|
||||||
feed.Link = base.ResolveReference(link).String()
|
|
||||||
}
|
|
||||||
if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
|
|
||||||
feed.FeedLink = base.ResolveReference(link).String()
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
err := feed.TranslateURLs(candidateUrl)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("Failed to translate feed urls: %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return feed, candidateUrl, nil, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Possibly an html link. Search for feed links
|
// Possibly an html link. Search for feed links
|
||||||
sources, err := searchFeedLinks(content, candidateUrl)
|
sources := make([]FeedSource, 0)
|
||||||
if err != nil {
|
for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
|
||||||
return nil, "", nil, err
|
sources = append(sources, FeedSource{Title: title, Url: url})
|
||||||
} else if len(sources) == 0 {
|
}
|
||||||
return nil, "", nil, errors.New("No feeds found at the given url")
|
switch {
|
||||||
} else if len(sources) == 1 {
|
case len(sources) == 0:
|
||||||
|
return nil, errors.New("No feeds found at the given url")
|
||||||
|
case len(sources) == 1:
|
||||||
if sources[0].Url == candidateUrl {
|
if sources[0].Url == candidateUrl {
|
||||||
return nil, "", nil, errors.New("Recursion!")
|
return nil, errors.New("Recursion!")
|
||||||
}
|
}
|
||||||
return DiscoverFeed(sources[0].Url)
|
return DiscoverFeed(sources[0].Url)
|
||||||
}
|
}
|
||||||
return nil, "", &sources, nil
|
|
||||||
|
result.Sources = sources
|
||||||
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
||||||
@ -132,7 +81,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(websiteUrl) != 0 {
|
if len(websiteUrl) != 0 {
|
||||||
res, err := defaultClient.get(websiteUrl)
|
res, err := client.get(websiteUrl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -157,7 +106,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
|||||||
"image/gif",
|
"image/gif",
|
||||||
}
|
}
|
||||||
for _, url := range candidateUrls {
|
for _, url := range candidateUrls {
|
||||||
res, err := defaultClient.get(url)
|
res, err := client.get(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -180,18 +129,6 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
|
|||||||
result := make([]storage.Item, len(items))
|
result := make([]storage.Item, len(items))
|
||||||
for i, item := range items {
|
for i, item := range items {
|
||||||
item := item
|
item := item
|
||||||
podcastUrl := item.PodcastURL
|
|
||||||
|
|
||||||
/*
|
|
||||||
var podcastUrl *string
|
|
||||||
if item.Enclosures != nil {
|
|
||||||
for _, enclosure := range item.Enclosures {
|
|
||||||
if strings.ToLower(enclosure.Type) == "audio/mpeg" {
|
|
||||||
podcastUrl = &enclosure.URL
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
result[i] = storage.Item{
|
result[i] = storage.Item{
|
||||||
GUID: item.GUID,
|
GUID: item.GUID,
|
||||||
FeedId: feed.Id,
|
FeedId: feed.Id,
|
||||||
@ -203,33 +140,30 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
|
|||||||
Date: &item.Date,
|
Date: &item.Date,
|
||||||
Status: storage.UNREAD,
|
Status: storage.UNREAD,
|
||||||
Image: item.ImageURL,
|
Image: item.ImageURL,
|
||||||
PodcastURL: &podcastUrl,
|
PodcastURL: nil,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
||||||
var res *http.Response
|
lmod := ""
|
||||||
var err error
|
etag := ""
|
||||||
|
if state := db.GetHTTPState(f.Id); state != nil {
|
||||||
httpState := db.GetHTTPState(f.Id)
|
lmod = state.LastModified
|
||||||
if httpState != nil {
|
etag = state.Etag
|
||||||
res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag)
|
|
||||||
} else {
|
|
||||||
res, err = defaultClient.get(f.FeedLink)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
res, err := client.getConditional(f.FeedLink, lmod, etag)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("unable to get: %s", err)
|
return nil, fmt.Errorf("unable to get: %s", err)
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
defer res.Body.Close()
|
||||||
|
|
||||||
if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
|
switch {
|
||||||
|
case res.StatusCode < 200 || res.StatusCode > 399:
|
||||||
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
||||||
}
|
case res.StatusCode == http.StatusNotModified:
|
||||||
|
|
||||||
if res.StatusCode == 304 {
|
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -237,34 +171,17 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to init response body: %s", err)
|
return nil, fmt.Errorf("failed to init response body: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
feed, err := parser.Parse(body)
|
feed, err := parser.Parse(body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to parse: %s", err)
|
return nil, fmt.Errorf("failed to parse: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
lastModified := res.Header.Get("Last-Modified")
|
lmod = res.Header.Get("Last-Modified")
|
||||||
etag := res.Header.Get("Etag")
|
etag = res.Header.Get("Etag")
|
||||||
if lastModified != "" || etag != "" {
|
if lmod != "" || etag != "" {
|
||||||
db.SetHTTPState(f.Id, lastModified, etag)
|
db.SetHTTPState(f.Id, lmod, etag)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ConvertItems(feed.Items, f), nil
|
return ConvertItems(feed.Items, f), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
|
||||||
transport := &http.Transport{
|
|
||||||
Proxy: http.ProxyFromEnvironment,
|
|
||||||
DialContext: (&net.Dialer{
|
|
||||||
Timeout: 10 * time.Second,
|
|
||||||
}).DialContext,
|
|
||||||
DisableKeepAlives: true,
|
|
||||||
TLSHandshakeTimeout: time.Second * 10,
|
|
||||||
}
|
|
||||||
httpClient := &http.Client{
|
|
||||||
Timeout: time.Second * 30,
|
|
||||||
Transport: transport,
|
|
||||||
}
|
|
||||||
defaultClient = &Client{
|
|
||||||
httpClient: httpClient,
|
|
||||||
userAgent: "Yarr/1.0",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user