mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
refactor crawler
This commit is contained in:
parent
e0e6166cdf
commit
b40fe94147
@ -155,35 +155,32 @@ func (s *Server) handleFeedList(c *router.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
result, err := worker.DiscoverFeed(form.Url)
|
||||
switch {
|
||||
case err != nil:
|
||||
log.Printf("Faild to discover feed for %s: %s", form.Url, err)
|
||||
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
|
||||
return
|
||||
}
|
||||
|
||||
if feed != nil {
|
||||
storedFeed := s.db.CreateFeed(
|
||||
feed.Title,
|
||||
case len(result.Sources) > 0:
|
||||
c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": result.Sources})
|
||||
case result.Feed != nil:
|
||||
feed := s.db.CreateFeed(
|
||||
result.Feed.Title,
|
||||
"",
|
||||
feed.SiteURL,
|
||||
feedUrl,
|
||||
result.Feed.SiteURL,
|
||||
result.FeedLink,
|
||||
form.FolderID,
|
||||
)
|
||||
s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed))
|
||||
s.db.CreateItems(worker.ConvertItems(result.Feed.Items, *feed))
|
||||
|
||||
icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink)
|
||||
icon, err := worker.FindFavicon(feed.Link, feed.FeedLink)
|
||||
if icon != nil {
|
||||
s.db.UpdateFeedIcon(storedFeed.Id, icon)
|
||||
s.db.UpdateFeedIcon(feed.Id, icon)
|
||||
}
|
||||
if err != nil {
|
||||
log.Printf("Failed to find favicon for %s (%d): %s", storedFeed.FeedLink, storedFeed.Id, err)
|
||||
log.Printf("Failed to find favicon for %s (%d): %s", feed.FeedLink, feed.Id, err)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, map[string]string{"status": "success"})
|
||||
} else if sources != nil {
|
||||
c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": sources})
|
||||
} else {
|
||||
default:
|
||||
c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
|
||||
}
|
||||
}
|
||||
|
52
src/worker/client.go
Normal file
52
src/worker/client.go
Normal file
@ -0,0 +1,52 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"net"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
httpClient *http.Client
|
||||
userAgent string
|
||||
}
|
||||
|
||||
func (c *Client) get(url string) (*http.Response, error) {
|
||||
return c.getConditional(url, "", "")
|
||||
}
|
||||
|
||||
func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
if lastModified != "" {
|
||||
req.Header.Set("If-Modified-Since", lastModified)
|
||||
}
|
||||
if etag != "" {
|
||||
req.Header.Set("If-None-Match", etag)
|
||||
}
|
||||
return c.httpClient.Do(req)
|
||||
}
|
||||
|
||||
var client *Client
|
||||
|
||||
func init() {
|
||||
transport := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
}).DialContext,
|
||||
DisableKeepAlives: true,
|
||||
TLSHandshakeTimeout: time.Second * 10,
|
||||
}
|
||||
httpClient := &http.Client{
|
||||
Timeout: time.Second * 30,
|
||||
Transport: transport,
|
||||
}
|
||||
client = &Client{
|
||||
httpClient: httpClient,
|
||||
userAgent: "Yarr/1.0",
|
||||
}
|
||||
}
|
@ -5,11 +5,8 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"github.com/nkanaev/yarr/src/scraper"
|
||||
"github.com/nkanaev/yarr/src/parser"
|
||||
@ -22,102 +19,54 @@ type FeedSource struct {
|
||||
Url string `json:"url"`
|
||||
}
|
||||
|
||||
type Client struct {
|
||||
httpClient *http.Client
|
||||
userAgent string
|
||||
type DiscoverResult struct {
|
||||
Feed *parser.Feed
|
||||
FeedLink string
|
||||
Sources []FeedSource
|
||||
}
|
||||
|
||||
func (c *Client) get(url string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
return c.httpClient.Do(req)
|
||||
}
|
||||
|
||||
func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.userAgent)
|
||||
if lastModified != "" {
|
||||
req.Header.Set("If-Modified-Since", lastModified)
|
||||
}
|
||||
if etag != "" {
|
||||
req.Header.Set("If-None-Match", etag)
|
||||
}
|
||||
return c.httpClient.Do(req)
|
||||
}
|
||||
|
||||
var defaultClient *Client
|
||||
|
||||
func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
|
||||
sources := make([]FeedSource, 0, 0)
|
||||
for url, title := range scraper.FindFeeds(string(html), siteurl) {
|
||||
sources = append(sources, FeedSource{Title: title, Url: url})
|
||||
}
|
||||
return sources, nil
|
||||
}
|
||||
|
||||
func DiscoverFeed(candidateUrl string) (*parser.Feed, string, *[]FeedSource, error) {
|
||||
func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
|
||||
result := &DiscoverResult{}
|
||||
// Query URL
|
||||
res, err := defaultClient.get(candidateUrl)
|
||||
res, err := client.get(candidateUrl)
|
||||
if err != nil {
|
||||
return nil, "", nil, err
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != 200 {
|
||||
errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
|
||||
return nil, "", nil, errors.New(errmsg)
|
||||
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
||||
}
|
||||
content, err := ioutil.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
return nil, "", nil, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Try to feed into parser
|
||||
feed, err := parser.Parse(bytes.NewReader(content))
|
||||
if err == nil {
|
||||
/*
|
||||
// WILD: feeds may not always have link to themselves
|
||||
if len(feed.FeedLink) == 0 {
|
||||
feed.FeedLink = candidateUrl
|
||||
}
|
||||
*/
|
||||
|
||||
// WILD: resolve relative links (path, without host)
|
||||
/*
|
||||
base, _ := url.Parse(candidateUrl)
|
||||
if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
|
||||
feed.Link = base.ResolveReference(link).String()
|
||||
}
|
||||
if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
|
||||
feed.FeedLink = base.ResolveReference(link).String()
|
||||
}
|
||||
*/
|
||||
err := feed.TranslateURLs(candidateUrl)
|
||||
if err != nil {
|
||||
log.Printf("Failed to translate feed urls: %s", err)
|
||||
}
|
||||
|
||||
return feed, candidateUrl, nil, nil
|
||||
feed.TranslateURLs(candidateUrl)
|
||||
result.Feed = feed
|
||||
result.FeedLink = candidateUrl
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Possibly an html link. Search for feed links
|
||||
sources, err := searchFeedLinks(content, candidateUrl)
|
||||
if err != nil {
|
||||
return nil, "", nil, err
|
||||
} else if len(sources) == 0 {
|
||||
return nil, "", nil, errors.New("No feeds found at the given url")
|
||||
} else if len(sources) == 1 {
|
||||
sources := make([]FeedSource, 0)
|
||||
for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
|
||||
sources = append(sources, FeedSource{Title: title, Url: url})
|
||||
}
|
||||
switch {
|
||||
case len(sources) == 0:
|
||||
return nil, errors.New("No feeds found at the given url")
|
||||
case len(sources) == 1:
|
||||
if sources[0].Url == candidateUrl {
|
||||
return nil, "", nil, errors.New("Recursion!")
|
||||
return nil, errors.New("Recursion!")
|
||||
}
|
||||
return DiscoverFeed(sources[0].Url)
|
||||
}
|
||||
return nil, "", &sources, nil
|
||||
|
||||
result.Sources = sources
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
||||
@ -132,7 +81,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
||||
}
|
||||
|
||||
if len(websiteUrl) != 0 {
|
||||
res, err := defaultClient.get(websiteUrl)
|
||||
res, err := client.get(websiteUrl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -157,7 +106,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
|
||||
"image/gif",
|
||||
}
|
||||
for _, url := range candidateUrls {
|
||||
res, err := defaultClient.get(url)
|
||||
res, err := client.get(url)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@ -180,18 +129,6 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
|
||||
result := make([]storage.Item, len(items))
|
||||
for i, item := range items {
|
||||
item := item
|
||||
podcastUrl := item.PodcastURL
|
||||
|
||||
/*
|
||||
var podcastUrl *string
|
||||
if item.Enclosures != nil {
|
||||
for _, enclosure := range item.Enclosures {
|
||||
if strings.ToLower(enclosure.Type) == "audio/mpeg" {
|
||||
podcastUrl = &enclosure.URL
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
result[i] = storage.Item{
|
||||
GUID: item.GUID,
|
||||
FeedId: feed.Id,
|
||||
@ -203,33 +140,30 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
|
||||
Date: &item.Date,
|
||||
Status: storage.UNREAD,
|
||||
Image: item.ImageURL,
|
||||
PodcastURL: &podcastUrl,
|
||||
PodcastURL: nil,
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
||||
var res *http.Response
|
||||
var err error
|
||||
|
||||
httpState := db.GetHTTPState(f.Id)
|
||||
if httpState != nil {
|
||||
res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag)
|
||||
} else {
|
||||
res, err = defaultClient.get(f.FeedLink)
|
||||
lmod := ""
|
||||
etag := ""
|
||||
if state := db.GetHTTPState(f.Id); state != nil {
|
||||
lmod = state.LastModified
|
||||
etag = state.Etag
|
||||
}
|
||||
|
||||
res, err := client.getConditional(f.FeedLink, lmod, etag)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to get: %s", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
|
||||
switch {
|
||||
case res.StatusCode < 200 || res.StatusCode > 399:
|
||||
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
||||
}
|
||||
|
||||
if res.StatusCode == 304 {
|
||||
case res.StatusCode == http.StatusNotModified:
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@ -237,34 +171,17 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init response body: %s", err)
|
||||
}
|
||||
|
||||
feed, err := parser.Parse(body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse: %s", err)
|
||||
}
|
||||
|
||||
lastModified := res.Header.Get("Last-Modified")
|
||||
etag := res.Header.Get("Etag")
|
||||
if lastModified != "" || etag != "" {
|
||||
db.SetHTTPState(f.Id, lastModified, etag)
|
||||
}
|
||||
return ConvertItems(feed.Items, f), nil
|
||||
lmod = res.Header.Get("Last-Modified")
|
||||
etag = res.Header.Get("Etag")
|
||||
if lmod != "" || etag != "" {
|
||||
db.SetHTTPState(f.Id, lmod, etag)
|
||||
}
|
||||
|
||||
func init() {
|
||||
transport := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
}).DialContext,
|
||||
DisableKeepAlives: true,
|
||||
TLSHandshakeTimeout: time.Second * 10,
|
||||
}
|
||||
httpClient := &http.Client{
|
||||
Timeout: time.Second * 30,
|
||||
Transport: transport,
|
||||
}
|
||||
defaultClient = &Client{
|
||||
httpClient: httpClient,
|
||||
userAgent: "Yarr/1.0",
|
||||
}
|
||||
return ConvertItems(feed.Items, f), nil
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user