switch to internal feed parser

This commit is contained in:
Nazar Kanaev 2021-03-23 10:49:25 +00:00
parent c91b439878
commit 5b36530f67
2 changed files with 37 additions and 35 deletions

View File

@ -155,7 +155,7 @@ func (s *Server) handleFeedList(c *router.Context) {
return return
} }
feed, sources, err := worker.DiscoverFeed(form.Url) feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url)
if err != nil { if err != nil {
log.Print(err) log.Print(err)
c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) c.JSON(http.StatusOK, map[string]string{"status": "notfound"})
@ -165,9 +165,9 @@ func (s *Server) handleFeedList(c *router.Context) {
if feed != nil { if feed != nil {
storedFeed := s.db.CreateFeed( storedFeed := s.db.CreateFeed(
feed.Title, feed.Title,
feed.Description, "",
feed.Link, feed.SiteURL,
feed.FeedLink, feedUrl,
form.FolderID, form.FolderID,
) )
s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed)) s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed))

View File

@ -4,15 +4,16 @@ import (
"bytes" "bytes"
"errors" "errors"
"fmt" "fmt"
"github.com/mmcdole/gofeed"
"github.com/nkanaev/yarr/src/crawler"
"github.com/nkanaev/yarr/src/storage"
"io/ioutil" "io/ioutil"
"log"
"net" "net"
"net/http" "net/http"
"net/url" "net/url"
"strings"
"time" "time"
"github.com/nkanaev/yarr/src/crawler"
feedparser "github.com/nkanaev/yarr/src/feed"
"github.com/nkanaev/yarr/src/storage"
) )
type FeedSource struct { type FeedSource struct {
@ -55,32 +56,34 @@ func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) {
return sources, nil return sources, nil
} }
func DiscoverFeed(candidateUrl string) (*gofeed.Feed, *[]FeedSource, error) { func DiscoverFeed(candidateUrl string) (*feedparser.Feed, string, *[]FeedSource, error) {
// Query URL // Query URL
res, err := defaultClient.get(candidateUrl) res, err := defaultClient.get(candidateUrl)
if err != nil { if err != nil {
return nil, nil, err return nil, "", nil, err
} }
defer res.Body.Close() defer res.Body.Close()
if res.StatusCode != 200 { if res.StatusCode != 200 {
errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode) errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode)
return nil, nil, errors.New(errmsg) return nil, "", nil, errors.New(errmsg)
} }
content, err := ioutil.ReadAll(res.Body) content, err := ioutil.ReadAll(res.Body)
if err != nil { if err != nil {
return nil, nil, err return nil, "", nil, err
} }
// Try to feed into parser // Try to feed into parser
feedparser := gofeed.NewParser()
feed, err := feedparser.Parse(bytes.NewReader(content)) feed, err := feedparser.Parse(bytes.NewReader(content))
if err == nil { if err == nil {
/*
// WILD: feeds may not always have link to themselves // WILD: feeds may not always have link to themselves
if len(feed.FeedLink) == 0 { if len(feed.FeedLink) == 0 {
feed.FeedLink = candidateUrl feed.FeedLink = candidateUrl
} }
*/
// WILD: resolve relative links (path, without host) // WILD: resolve relative links (path, without host)
/*
base, _ := url.Parse(candidateUrl) base, _ := url.Parse(candidateUrl)
if link, err := url.Parse(feed.Link); err == nil && link.Host == "" { if link, err := url.Parse(feed.Link); err == nil && link.Host == "" {
feed.Link = base.ResolveReference(link).String() feed.Link = base.ResolveReference(link).String()
@ -88,23 +91,28 @@ func DiscoverFeed(candidateUrl string) (*gofeed.Feed, *[]FeedSource, error) {
if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" { if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" {
feed.FeedLink = base.ResolveReference(link).String() feed.FeedLink = base.ResolveReference(link).String()
} }
*/
err := feed.TranslateURLs(candidateUrl)
if err != nil {
log.Printf("Failed to translate feed urls: %s", err)
}
return feed, nil, nil return feed, candidateUrl, nil, nil
} }
// Possibly an html link. Search for feed links // Possibly an html link. Search for feed links
sources, err := searchFeedLinks(content, candidateUrl) sources, err := searchFeedLinks(content, candidateUrl)
if err != nil { if err != nil {
return nil, nil, err return nil, "", nil, err
} else if len(sources) == 0 { } else if len(sources) == 0 {
return nil, nil, errors.New("No feeds found at the given url") return nil, "", nil, errors.New("No feeds found at the given url")
} else if len(sources) == 1 { } else if len(sources) == 1 {
if sources[0].Url == candidateUrl { if sources[0].Url == candidateUrl {
return nil, nil, errors.New("Recursion!") return nil, "", nil, errors.New("Recursion!")
} }
return DiscoverFeed(sources[0].Url) return DiscoverFeed(sources[0].Url)
} }
return nil, &sources, nil return nil, "", &sources, nil
} }
func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
@ -163,17 +171,12 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
return nil, nil return nil, nil
} }
func ConvertItems(items []*gofeed.Item, feed storage.Feed) []storage.Item { func ConvertItems(items []feedparser.Item, feed storage.Feed) []storage.Item {
result := make([]storage.Item, len(items)) result := make([]storage.Item, len(items))
for i, item := range items { for i, item := range items {
imageURL := "" podcastUrl := item.PodcastURL
if item.Image != nil {
imageURL = item.Image.URL /*
}
author := ""
if item.Author != nil {
author = item.Author.Name
}
var podcastUrl *string var podcastUrl *string
if item.Enclosures != nil { if item.Enclosures != nil {
for _, enclosure := range item.Enclosures { for _, enclosure := range item.Enclosures {
@ -182,19 +185,19 @@ func ConvertItems(items []*gofeed.Item, feed storage.Feed) []storage.Item {
} }
} }
} }
*/
result[i] = storage.Item{ result[i] = storage.Item{
GUID: item.GUID, GUID: item.GUID,
FeedId: feed.Id, FeedId: feed.Id,
Title: item.Title, Title: item.Title,
Link: item.Link, Link: item.URL,
Description: item.Description, Description: "",
Content: item.Content, Content: item.Content,
Author: author, Author: "",
Date: item.PublishedParsed, Date: &item.Date,
DateUpdated: item.UpdatedParsed,
Status: storage.UNREAD, Status: storage.UNREAD,
Image: imageURL, Image: item.ImageURL,
PodcastURL: podcastUrl, PodcastURL: &podcastUrl,
} }
} }
return result return result
@ -231,7 +234,6 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
db.SetHTTPState(f.Id, lastModified, etag) db.SetHTTPState(f.Id, lastModified, etag)
} }
feedparser := gofeed.NewParser()
feed, err := feedparser.Parse(res.Body) feed, err := feedparser.Parse(res.Body)
if err != nil { if err != nil {
return nil, err return nil, err