extract worker & crawler from server
src/worker/crawler.go (new file, 323 lines)
@@ -0,0 +1,323 @@
package worker

import (
    "bytes"
    "errors"
    "fmt"
    "github.com/PuerkitoBio/goquery"
    "github.com/mmcdole/gofeed"
    "github.com/nkanaev/yarr/src/storage"
    "io/ioutil"
    "net"
    "net/http"
    "net/url"
    "strings"
    "time"
)

type FeedSource struct {
    Title string `json:"title"`
    Url   string `json:"url"`
}

// selectors used to discover feed links inside an html page
const feedLinks = `
    link[type='application/rss+xml'],
    link[type='application/atom+xml'],
    a[href$="/feed"],
    a[href$="/feed/"],
    a[href$="feed.xml"],
    a[href$="atom.xml"],
    a[href$="rss.xml"],
    a:contains("rss"),
    a:contains("RSS"),
    a:contains("feed"),
    a:contains("FEED")
`

type Client struct {
    httpClient *http.Client
    userAgent  string
}

func (c *Client) get(url string) (*http.Response, error) {
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, err
    }
    req.Header.Set("User-Agent", c.userAgent)
    return c.httpClient.Do(req)
}

func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, err
    }
    req.Header.Set("User-Agent", c.userAgent)
    req.Header.Set("If-Modified-Since", lastModified)
    req.Header.Set("If-None-Match", etag)
    return c.httpClient.Do(req)
}

var defaultClient *Client

func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
    sources := make([]FeedSource, 0)

    doc, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
    if err != nil {
        return sources, err
    }
    base, err := url.Parse(siteurl)
    if err != nil {
        return sources, err
    }

    // feed {url: title} map
    feeds := make(map[string]string)

    doc.Find(feedLinks).Each(func(i int, s *goquery.Selection) {
        // Unlikely to happen, but don't get more than N links
        if len(feeds) > 10 {
            return
        }
        if href, ok := s.Attr("href"); ok {
            feedUrl, err := url.Parse(href)
            if err != nil {
                return
            }

            title := s.AttrOr("title", "")
            url := base.ResolveReference(feedUrl).String()

            // keep the first non-empty title seen for each url
            if feeds[url] == "" {
                feeds[url] = title
            }
        }
    })

    for url, title := range feeds {
        sources = append(sources, FeedSource{Title: title, Url: url})
    }
    return sources, nil
}

func DiscoverFeed(candidateUrl string) (*gofeed.Feed, *[]FeedSource, error) {
    // Query URL
    res, err := defaultClient.get(candidateUrl)
    if err != nil {
        return nil, nil, err
    }
    defer res.Body.Close()
    if res.StatusCode != 200 {
        errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
        return nil, nil, errors.New(errmsg)
    }
    content, err := ioutil.ReadAll(res.Body)
    if err != nil {
        return nil, nil, err
    }

    // Try to feed into parser
    feedparser := gofeed.NewParser()
    feed, err := feedparser.Parse(bytes.NewReader(content))
    if err == nil {
        // WILD: feeds may not always have link to themselves
        if len(feed.FeedLink) == 0 {
            feed.FeedLink = candidateUrl
        }

        // WILD: resolve relative links (path, without host)
        base, _ := url.Parse(candidateUrl)
        if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
            feed.Link = base.ResolveReference(link).String()
        }
        if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
            feed.FeedLink = base.ResolveReference(link).String()
        }

        return feed, nil, nil
    }

    // Possibly an html link. Search for feed links
    sources, err := searchFeedLinks(content, candidateUrl)
    if err != nil {
        return nil, nil, err
    } else if len(sources) == 0 {
        return nil, nil, errors.New("No feeds found at the given url")
    } else if len(sources) == 1 {
        if sources[0].Url == candidateUrl {
            return nil, nil, errors.New("Recursion!")
        }
        return DiscoverFeed(sources[0].Url)
    }
    return nil, &sources, nil
}

func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
    candidateUrls := make([]string, 0)

    favicon := func(link string) string {
        u, err := url.Parse(link)
        // guard against unparsable or host-less links, which would
        // otherwise yield a bogus "://favicon.ico" candidate
        if err != nil || u.Host == "" {
            return ""
        }
        return fmt.Sprintf("%s://%s/favicon.ico", u.Scheme, u.Host)
    }

    if len(websiteUrl) != 0 {
        base, err := url.Parse(websiteUrl)
        if err != nil {
            return nil, err
        }
        res, err := defaultClient.get(websiteUrl)
        if err != nil {
            return nil, err
        }
        defer res.Body.Close()
        doc, err := goquery.NewDocumentFromReader(res.Body)
        if err != nil {
            return nil, err
        }
        doc.Find(`link[rel=icon]`).EachWithBreak(func(i int, s *goquery.Selection) bool {
            if href, ok := s.Attr("href"); ok {
                if hrefUrl, err := url.Parse(href); err == nil {
                    faviconUrl := base.ResolveReference(hrefUrl).String()
                    candidateUrls = append(candidateUrls, faviconUrl)
                }
            }
            return true
        })

        if c := favicon(websiteUrl); len(c) != 0 {
            candidateUrls = append(candidateUrls, c)
        }
    }
    if c := favicon(feedUrl); len(c) != 0 {
        candidateUrls = append(candidateUrls, c)
    }

    imageTypes := [4]string{
        "image/x-icon",
        "image/png",
        "image/jpeg",
        "image/gif",
    }
    for _, candidate := range candidateUrls {
        res, err := defaultClient.get(candidate)
        if err != nil {
            continue
        }
        content, err := ioutil.ReadAll(res.Body)
        // close the body inside the loop; a deferred close here would
        // keep every response open until the function returns
        res.Body.Close()
        if err != nil || res.StatusCode != 200 {
            continue
        }
        ctype := http.DetectContentType(content)
        for _, itype := range imageTypes {
            if ctype == itype {
                return &content, nil
            }
        }
    }
    return nil, nil
}

func ConvertItems(items []*gofeed.Item, feed storage.Feed) []storage.Item {
    result := make([]storage.Item, len(items))
    for i, item := range items {
        imageURL := ""
        if item.Image != nil {
            imageURL = item.Image.URL
        }
        author := ""
        if item.Author != nil {
            author = item.Author.Name
        }
        var podcastUrl *string
        if item.Enclosures != nil {
            for _, enclosure := range item.Enclosures {
                if strings.ToLower(enclosure.Type) == "audio/mpeg" {
                    podcastUrl = &enclosure.URL
                }
            }
        }
        result[i] = storage.Item{
            GUID:        item.GUID,
            FeedId:      feed.Id,
            Title:       item.Title,
            Link:        item.Link,
            Description: item.Description,
            Content:     item.Content,
            Author:      author,
            Date:        item.PublishedParsed,
            DateUpdated: item.UpdatedParsed,
            Status:      storage.UNREAD,
            Image:       imageURL,
            PodcastURL:  podcastUrl,
        }
    }
    return result
}

func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
    var res *http.Response
    var err error

    httpState := db.GetHTTPState(f.Id)
    if httpState != nil {
        res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag)
    } else {
        res, err = defaultClient.get(f.FeedLink)
    }

    if err != nil {
        return nil, err
    }
    defer res.Body.Close()

    if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
        errmsg := fmt.Sprintf("Failed to list feed items for %s (status: %d)", f.FeedLink, res.StatusCode)
        return nil, errors.New(errmsg)
    }

    if res.StatusCode == 304 {
        // not modified since the last fetch: nothing new to store
        return nil, nil
    }

    lastModified := res.Header.Get("Last-Modified")
    etag := res.Header.Get("Etag")
    if lastModified != "" || etag != "" {
        db.SetHTTPState(f.Id, lastModified, etag)
    }

    feedparser := gofeed.NewParser()
    feed, err := feedparser.Parse(res.Body)
    if err != nil {
        return nil, err
    }
    return ConvertItems(feed.Items, f), nil
}

func init() {
    transport := &http.Transport{
        Proxy: http.ProxyFromEnvironment,
        DialContext: (&net.Dialer{
            Timeout: 10 * time.Second,
        }).DialContext,
        DisableKeepAlives:   true,
        TLSHandshakeTimeout: time.Second * 10,
    }
    httpClient := &http.Client{
        Timeout:   time.Second * 30,
        Transport: transport,
    }
    defaultClient = &Client{
        httpClient: httpClient,
        userAgent:  "Yarr/1.0",
    }
}
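For context (not part of the commit): a minimal sketch of how the discovery entry point can be driven. It assumes the package is importable as github.com/nkanaev/yarr/src/worker, mirroring the storage import path used above.

package main

import (
    "fmt"
    "log"

    "github.com/nkanaev/yarr/src/worker"
)

func main() {
    // DiscoverFeed either parses the url directly as a feed,
    // or scrapes the html page behind it for candidate feed links.
    feed, sources, err := worker.DiscoverFeed("https://example.com")
    if err != nil {
        log.Fatal(err)
    }
    if feed != nil {
        fmt.Println("resolved feed:", feed.Title, feed.FeedLink)
        return
    }
    if sources != nil {
        for _, src := range *sources {
            fmt.Printf("candidate: %s (%s)\n", src.Url, src.Title)
        }
    }
}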
src/worker/worker.go (new file, 132 lines)
@@ -0,0 +1,132 @@
package worker

import (
    "github.com/nkanaev/yarr/src/storage"
    "log"
    "runtime"
    "sync/atomic"
    "time"
)

type Worker struct {
    db *storage.Storage

    feedQueue   chan storage.Feed
    queueSize   *int32
    refreshRate chan int64
}

func NewWorker(db *storage.Storage) *Worker {
    queueSize := int32(0)
    return &Worker{
        db:          db,
        feedQueue:   make(chan storage.Feed, 3000),
        queueSize:   &queueSize,
        refreshRate: make(chan int64),
    }
}

func (w *Worker) Start() {
    delTicker := time.NewTicker(time.Hour * 24)

    syncSearchChannel := make(chan bool, 10)
    var syncSearchTimer *time.Timer // TODO: should this be atomic?

    // debounce search index syncs: reset a 2s timer instead of
    // syncing after every batch of created items
    syncSearch := func() {
        if syncSearchTimer == nil {
            syncSearchTimer = time.AfterFunc(time.Second*2, func() {
                syncSearchChannel <- true
            })
        } else {
            syncSearchTimer.Reset(time.Second * 2)
        }
    }

    worker := func() {
        for {
            select {
            case feed := <-w.feedQueue:
                items, err := listItems(feed, w.db)
                atomic.AddInt32(w.queueSize, -1)
                if err != nil {
                    log.Printf("Failed to fetch %s (%d): %s", feed.FeedLink, feed.Id, err)
                    w.db.SetFeedError(feed.Id, err)
                    continue
                }
                w.db.CreateItems(items)
                syncSearch()
                if !feed.HasIcon {
                    icon, err := FindFavicon(feed.Link, feed.FeedLink)
                    if icon != nil {
                        w.db.UpdateFeedIcon(feed.Id, icon)
                    }
                    if err != nil {
                        log.Printf("Failed to search favicon for %s (%s): %s", feed.Link, feed.FeedLink, err)
                    }
                }
            case <-delTicker.C:
                w.db.DeleteOldItems()
            case <-syncSearchChannel:
                w.db.SyncSearch()
            }
        }
    }

    num := runtime.NumCPU() - 1
    if num < 1 {
        num = 1
    }
    for i := 0; i < num; i++ {
        go worker()
    }
    go w.db.DeleteOldItems()
    go w.db.SyncSearch()

    go func() {
        var refreshTicker *time.Ticker
        // a channel that is never written to: scheduled refreshes stay
        // disabled until a positive rate arrives on w.refreshRate
        refreshTick := make(<-chan time.Time)
        for {
            select {
            case <-refreshTick:
                w.FetchAllFeeds()
            case val := <-w.refreshRate:
                if refreshTicker != nil {
                    refreshTicker.Stop()
                    if val == 0 {
                        refreshTick = make(<-chan time.Time)
                    }
                }
                if val > 0 {
                    refreshTicker = time.NewTicker(time.Duration(val) * time.Minute)
                    refreshTick = refreshTicker.C
                }
            }
        }
    }()
    refreshRate := w.db.GetSettingsValueInt64("refresh_rate")
    w.refreshRate <- refreshRate
    if refreshRate > 0 {
        w.FetchAllFeeds()
    }
}

func (w *Worker) FetchAllFeeds() {
    log.Print("Refreshing all feeds")
    w.db.ResetFeedErrors()
    for _, feed := range w.db.ListFeeds() {
        w.fetchFeed(feed)
    }
}

func (w *Worker) fetchFeed(feed storage.Feed) {
    atomic.AddInt32(w.queueSize, 1)
    w.feedQueue <- feed
}

func (w *Worker) FeedsPending() int32 {
    // load atomically: worker goroutines update the counter concurrently
    return atomic.LoadInt32(w.queueSize)
}

func (w *Worker) SetRefreshRate(val int64) {
    w.refreshRate <- val
}
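A rough sketch (not part of the commit) of how a server might own the worker after this extraction. storage.New and its signature are assumptions for illustration only; the real constructor in src/storage may differ.

package main

import (
    "log"

    "github.com/nkanaev/yarr/src/storage"
    "github.com/nkanaev/yarr/src/worker"
)

func main() {
    db, err := storage.New("yarr.db") // hypothetical constructor
    if err != nil {
        log.Fatal(err)
    }
    w := worker.NewWorker(db)
    w.Start() // spawns the fetch goroutines and the refresh-rate loop

    // the server can later adjust the refresh interval (in minutes)
    // and report progress without reaching into the queue:
    w.SetRefreshRate(60)
    log.Println("feeds pending:", w.FeedsPending())

    select {} // block forever; a real server would serve HTTP here
}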