extract worker & crawler from server

Nazar Kanaev
2021-03-17 10:36:18 +00:00
parent 682b0c1729
commit dfb32d4ebe
4 changed files with 152 additions and 123 deletions

View File

@@ -1,323 +0,0 @@
package server

import (
	"bytes"
	"errors"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"github.com/mmcdole/gofeed"
	"github.com/nkanaev/yarr/src/storage"
	"io/ioutil"
	"net"
	"net/http"
	"net/url"
	"strings"
	"time"
)

type FeedSource struct {
	Title string `json:"title"`
	Url   string `json:"url"`
}

const feedLinks = `
	link[type='application/rss+xml'],
	link[type='application/atom+xml'],
	a[href$="/feed"],
	a[href$="/feed/"],
	a[href$="feed.xml"],
	a[href$="atom.xml"],
	a[href$="rss.xml"],
	a:contains("rss"),
	a:contains("RSS"),
	a:contains("feed"),
	a:contains("FEED")
`

type Client struct {
	httpClient *http.Client
	userAgent  string
}

func (c *Client) get(url string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", c.userAgent)
	return c.httpClient.Do(req)
}

func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("If-Modified-Since", lastModified)
	req.Header.Set("If-None-Match", etag)
	return c.httpClient.Do(req)
}

var defaultClient *Client

func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
	sources := make([]FeedSource, 0, 0)

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
	if err != nil {
		return sources, err
	}
	base, err := url.Parse(siteurl)
	if err != nil {
		return sources, err
	}

	// feed {url: title} map
	feeds := make(map[string]string)

	doc.Find(feedLinks).Each(func(i int, s *goquery.Selection) {
		// Unlikely to happen, but don't get more than N links
		if len(feeds) > 10 {
			return
		}
		if href, ok := s.Attr("href"); ok {
			feedUrl, err := url.Parse(href)
			if err != nil {
				return
			}
			title := s.AttrOr("title", "")
			url := base.ResolveReference(feedUrl).String()
			if _, alreadyExists := feeds[url]; alreadyExists {
				if feeds[url] == "" {
					feeds[url] = title
				}
			} else {
				feeds[url] = title
			}
		}
	})
	for url, title := range feeds {
		sources = append(sources, FeedSource{Title: title, Url: url})
	}
	return sources, nil
}

func discoverFeed(candidateUrl string) (*gofeed.Feed, *[]FeedSource, error) {
	// Query URL
	res, err := defaultClient.get(candidateUrl)
	if err != nil {
		return nil, nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
		return nil, nil, errors.New(errmsg)
	}
	content, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, nil, err
	}

	// Try to feed into parser
	feedparser := gofeed.NewParser()
	feed, err := feedparser.Parse(bytes.NewReader(content))
	if err == nil {
		// WILD: feeds may not always have link to themselves
		if len(feed.FeedLink) == 0 {
			feed.FeedLink = candidateUrl
		}
		// WILD: resolve relative links (path, without host)
		base, _ := url.Parse(candidateUrl)
		if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
			feed.Link = base.ResolveReference(link).String()
		}
		if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
			feed.FeedLink = base.ResolveReference(link).String()
		}
		return feed, nil, nil
	}

	// Possibly an html link. Search for feed links
	sources, err := searchFeedLinks(content, candidateUrl)
	if err != nil {
		return nil, nil, err
	} else if len(sources) == 0 {
		return nil, nil, errors.New("No feeds found at the given url")
	} else if len(sources) == 1 {
		if sources[0].Url == candidateUrl {
			return nil, nil, errors.New("Recursion!")
		}
		return discoverFeed(sources[0].Url)
	}
	return nil, &sources, nil
}

func findFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
	candidateUrls := make([]string, 0)

	favicon := func(link string) string {
		u, err := url.Parse(link)
		if err != nil {
			return ""
		}
		return fmt.Sprintf("%s://%s/favicon.ico", u.Scheme, u.Host)
	}

	if len(websiteUrl) != 0 {
		base, err := url.Parse(websiteUrl)
		if err != nil {
			return nil, err
		}
		res, err := defaultClient.get(websiteUrl)
		if err != nil {
			return nil, err
		}
		defer res.Body.Close()
		doc, err := goquery.NewDocumentFromReader(res.Body)
		if err != nil {
			return nil, err
		}
		doc.Find(`link[rel=icon]`).EachWithBreak(func(i int, s *goquery.Selection) bool {
			if href, ok := s.Attr("href"); ok {
				if hrefUrl, err := url.Parse(href); err == nil {
					faviconUrl := base.ResolveReference(hrefUrl).String()
					candidateUrls = append(candidateUrls, faviconUrl)
				}
			}
			return true
		})
		if c := favicon(websiteUrl); len(c) != 0 {
			candidateUrls = append(candidateUrls, c)
		}
	}
	if c := favicon(feedUrl); len(c) != 0 {
		candidateUrls = append(candidateUrls, c)
	}

	imageTypes := [4]string{
		"image/x-icon",
		"image/png",
		"image/jpeg",
		"image/gif",
	}
	for _, url := range candidateUrls {
		res, err := defaultClient.get(url)
		if err != nil {
			continue
		}
		defer res.Body.Close()
		if res.StatusCode == 200 {
			if content, err := ioutil.ReadAll(res.Body); err == nil {
				ctype := http.DetectContentType(content)
				for _, itype := range imageTypes {
					if ctype == itype {
						return &content, nil
					}
				}
			}
		}
	}
	return nil, nil
}

func convertItems(items []*gofeed.Item, feed storage.Feed) []storage.Item {
	result := make([]storage.Item, len(items))
	for i, item := range items {
		imageURL := ""
		if item.Image != nil {
			imageURL = item.Image.URL
		}
		author := ""
		if item.Author != nil {
			author = item.Author.Name
		}
		var podcastUrl *string
		if item.Enclosures != nil {
			for _, enclosure := range item.Enclosures {
				if strings.ToLower(enclosure.Type) == "audio/mpeg" {
					podcastUrl = &enclosure.URL
				}
			}
		}
		result[i] = storage.Item{
			GUID:        item.GUID,
			FeedId:      feed.Id,
			Title:       item.Title,
			Link:        item.Link,
			Description: item.Description,
			Content:     item.Content,
			Author:      author,
			Date:        item.PublishedParsed,
			DateUpdated: item.UpdatedParsed,
			Status:      storage.UNREAD,
			Image:       imageURL,
			PodcastURL:  podcastUrl,
		}
	}
	return result
}

func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
	var res *http.Response
	var err error

	httpState := db.GetHTTPState(f.Id)
	if httpState != nil {
		res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag)
	} else {
		res, err = defaultClient.get(f.FeedLink)
	}
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 {
		errmsg := fmt.Sprintf("Failed to list feed items for %s (status: %d)", f.FeedLink, res.StatusCode)
		return nil, errors.New(errmsg)
	}
	if res.StatusCode == 304 {
		return nil, nil
	}

	lastModified := res.Header.Get("Last-Modified")
	etag := res.Header.Get("Etag")
	if lastModified != "" || etag != "" {
		db.SetHTTPState(f.Id, lastModified, etag)
	}

	feedparser := gofeed.NewParser()
	feed, err := feedparser.Parse(res.Body)
	if err != nil {
		return nil, err
	}
	return convertItems(feed.Items, f), nil
}

func init() {
	transport := &http.Transport{
		Proxy: http.ProxyFromEnvironment,
		DialContext: (&net.Dialer{
			Timeout: 10 * time.Second,
		}).DialContext,
		DisableKeepAlives:   true,
		TLSHandshakeTimeout: time.Second * 10,
	}
	httpClient := &http.Client{
		Timeout:   time.Second * 30,
		Transport: transport,
	}
	defaultClient = &Client{
		httpClient: httpClient,
		userAgent:  "Yarr/1.0",
	}
}
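
Everything in the deleted file above resurfaces in the new worker package: the hunks below call worker.DiscoverFeed, worker.ConvertItems and worker.FindFavicon, and drive a worker.Worker value. The added src/worker files are not shown on this page, so the following is only a sketch of the exported surface inferred from those call sites; every signature here is an assumption read off the diff, not the actual package.

// sketch.go: NOT the real src/worker sources. Names and signatures are
// inferred from the call sites in this commit. DiscoverFeed, ConvertItems
// and FindFavicon are actually called as package-level functions; they are
// grouped into an interface here only for compactness.
package sketch

import (
	"github.com/mmcdole/gofeed"
	"github.com/nkanaev/yarr/src/storage"
)

// FeedSource presumably moves over from the deleted file unchanged.
type FeedSource struct {
	Title string `json:"title"`
	Url   string `json:"url"`
}

// Crawler helpers, previously unexported in package server.
type crawlerAPI interface {
	DiscoverFeed(candidateUrl string) (*gofeed.Feed, *[]FeedSource, error)
	ConvertItems(items []*gofeed.Item, feed storage.Feed) []storage.Item
	FindFavicon(websiteUrl, feedUrl string) (*[]byte, error)
}

// Background jobs, previously inlined in Server.startJobs.
type workerAPI interface {
	Start()
	FetchAllFeeds()
	FeedsPending() int32 // return type assumed; the old code exposed *int32
	SetRefreshRate(val int64)
}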

View File

@@ -7,6 +7,7 @@ import (
 	"github.com/nkanaev/yarr/src/router"
 	"github.com/nkanaev/yarr/src/storage"
 	"github.com/nkanaev/yarr/src/opml"
+	"github.com/nkanaev/yarr/src/worker"
 	"io/ioutil"
 	"log"
 	"math"
@@ -67,7 +68,7 @@ func (s *Server) handleStatic(c *router.Context) {
 func (s *Server) handleStatus(c *router.Context) {
 	c.JSON(http.StatusOK, map[string]interface{}{
-		"running": *s.queueSize,
+		"running": s.worker.FeedsPending(),
 		"stats": s.db.FeedStats(),
 	})
 }
@@ -122,7 +123,7 @@ func (s *Server) handleFolder(c *router.Context) {
 func (s *Server) handleFeedRefresh(c *router.Context) {
 	if c.Req.Method == "POST" {
-		s.fetchAllFeeds()
+		s.worker.FetchAllFeeds()
 		c.Out.WriteHeader(http.StatusOK)
 	} else {
 		c.Out.WriteHeader(http.StatusMethodNotAllowed)
@@ -161,7 +162,7 @@ func (s *Server) handleFeedList(c *router.Context) {
 			return
 		}
-		feed, sources, err := discoverFeed(form.Url)
+		feed, sources, err := worker.DiscoverFeed(form.Url)
 		if err != nil {
 			log.Print(err)
 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
@@ -176,9 +177,9 @@
 			feed.FeedLink,
 			form.FolderID,
 		)
-		s.db.CreateItems(convertItems(feed.Items, *storedFeed))
+		s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed))
-		icon, err := findFavicon(storedFeed.Link, storedFeed.FeedLink)
+		icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink)
 		if icon != nil {
 			s.db.UpdateFeedIcon(storedFeed.Id, icon)
 		}
@@ -316,7 +317,7 @@
 		}
 		if s.db.UpdateSettings(settings) {
 			if _, ok := settings["refresh_rate"]; ok {
-				s.refreshRate <- s.db.GetSettingsValueInt64("refresh_rate")
+				s.worker.SetRefreshRate(s.db.GetSettingsValueInt64("refresh_rate"))
 			}
 			c.Out.WriteHeader(http.StatusOK)
 		} else {
@@ -347,7 +348,7 @@
 				}
 			}
 		}
-		s.fetchAllFeeds()
+		s.worker.FetchAllFeeds()
 		c.Out.WriteHeader(http.StatusOK)
 	} else {
 		c.Out.WriteHeader(http.StatusMethodNotAllowed)
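
A side note on the handleStatus hunk above: the "running" count switches from dereferencing a shared *int32 to calling s.worker.FeedsPending(). A plausible encapsulation of the old atomic counter, assuming the worker takes over the queue bookkeeping that fetchFeed and the queue goroutines did before (hypothetical; the real worker implementation is not part of this page):

package sketch

import (
	"sync/atomic"

	"github.com/nkanaev/yarr/src/storage"
)

// Worker here is hypothetical: it bundles the feedQueue and queueSize
// fields that the Server struct loses in the next file of this diff.
type Worker struct {
	pending int32
	queue   chan storage.Feed
}

// enqueue mirrors the deleted Server.fetchFeed: count, then queue.
func (w *Worker) enqueue(feed storage.Feed) {
	atomic.AddInt32(&w.pending, 1)
	w.queue <- feed
}

// FeedsPending replaces direct reads of *s.queueSize in handleStatus.
func (w *Worker) FeedsPending() int32 {
	return atomic.LoadInt32(&w.pending)
}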

View File

@@ -3,11 +3,9 @@ package server
 import (
 	"log"
 	"net/http"
-	"runtime"
-	"sync/atomic"
 	"time"

 	"github.com/nkanaev/yarr/src/storage"
+	"github.com/nkanaev/yarr/src/worker"
 )

 var BasePath string = ""
@@ -15,9 +13,7 @@ var BasePath string = ""
 type Server struct {
 	Addr string
 	db *storage.Storage
-	feedQueue chan storage.Feed
-	queueSize *int32
-	refreshRate chan int64
+	worker *worker.Worker
 	// auth
 	Username string
 	Password string
@@ -27,13 +23,10 @@
 }

 func NewServer(db *storage.Storage, addr string) *Server {
-	queueSize := int32(0)
 	return &Server{
-		db:          db,
-		feedQueue:   make(chan storage.Feed, 3000),
-		queueSize:   &queueSize,
-		Addr:        addr,
-		refreshRate: make(chan int64),
+		db:     db,
+		Addr:   addr,
+		worker: worker.NewWorker(db),
 	}
 }
@@ -46,7 +39,7 @@ func (h *Server) GetAddr() string {
 }

 func (s *Server) Start() {
-	s.startJobs()
+	s.worker.Start()

 	httpserver := &http.Server{Addr: s.Addr, Handler: s.handler()}
@@ -102,103 +95,6 @@ func (h Server) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
 }
 */

-func (h *Server) startJobs() {
-	delTicker := time.NewTicker(time.Hour * 24)
-
-	syncSearchChannel := make(chan bool, 10)
-	var syncSearchTimer *time.Timer // TODO: should this be atomic?
-	syncSearch := func() {
-		if syncSearchTimer == nil {
-			syncSearchTimer = time.AfterFunc(time.Second*2, func() {
-				syncSearchChannel <- true
-			})
-		} else {
-			syncSearchTimer.Reset(time.Second * 2)
-		}
-	}
-
-	worker := func() {
-		for {
-			select {
-			case feed := <-h.feedQueue:
-				items, err := listItems(feed, h.db)
-				atomic.AddInt32(h.queueSize, -1)
-				if err != nil {
-					log.Printf("Failed to fetch %s (%d): %s", feed.FeedLink, feed.Id, err)
-					h.db.SetFeedError(feed.Id, err)
-					continue
-				}
-				h.db.CreateItems(items)
-				syncSearch()
-				if !feed.HasIcon {
-					icon, err := findFavicon(feed.Link, feed.FeedLink)
-					if icon != nil {
-						h.db.UpdateFeedIcon(feed.Id, icon)
-					}
-					if err != nil {
-						log.Printf("Failed to search favicon for %s (%s): %s", feed.Link, feed.FeedLink, err)
-					}
-				}
-			case <-delTicker.C:
-				h.db.DeleteOldItems()
-			case <-syncSearchChannel:
-				h.db.SyncSearch()
-			}
-		}
-	}
-
-	num := runtime.NumCPU() - 1
-	if num < 1 {
-		num = 1
-	}
-	for i := 0; i < num; i++ {
-		go worker()
-	}
-
-	go h.db.DeleteOldItems()
-	go h.db.SyncSearch()
-
-	go func() {
-		var refreshTicker *time.Ticker
-		refreshTick := make(<-chan time.Time)
-		for {
-			select {
-			case <-refreshTick:
-				h.fetchAllFeeds()
-			case val := <-h.refreshRate:
-				if refreshTicker != nil {
-					refreshTicker.Stop()
-					if val == 0 {
-						refreshTick = make(<-chan time.Time)
-					}
-				}
-				if val > 0 {
-					refreshTicker = time.NewTicker(time.Duration(val) * time.Minute)
-					refreshTick = refreshTicker.C
-				}
-			}
-		}
-	}()
-
-	refreshRate := h.db.GetSettingsValueInt64("refresh_rate")
-	h.refreshRate <- refreshRate
-	if refreshRate > 0 {
-		h.fetchAllFeeds()
-	}
-}

 func (h Server) requiresAuth() bool {
 	return h.Username != "" && h.Password != ""
 }

-func (h *Server) fetchAllFeeds() {
-	log.Print("Refreshing all feeds")
-	h.db.ResetFeedErrors()
-	for _, feed := range h.db.ListFeeds() {
-		h.fetchFeed(feed)
-	}
-}
-
-func (h *Server) fetchFeed(feed storage.Feed) {
-	atomic.AddInt32(h.queueSize, 1)
-	h.feedQueue <- feed
-}