rewrite icon crawling

This commit is contained in:
Nazar Kanaev 2021-05-28 10:27:56 +01:00
parent f38dcfba3b
commit 851aa1a136
3 changed files with 77 additions and 47 deletions

View File

@ -76,10 +76,10 @@ func (s *Storage) UpdateFeedIcon(feedId int64, icon *[]byte) bool {
}
func (s *Storage) ListFeeds() []Feed {
result := make([]Feed, 0, 0)
result := make([]Feed, 0)
rows, err := s.db.Query(`
select id, folder_id, title, description, link, feed_link,
ifnull(icon, '') != '' as has_icon
ifnull(length(icon), 0) > 0 as has_icon
from feeds
order by title collate nocase
`)
@ -107,6 +107,36 @@ func (s *Storage) ListFeeds() []Feed {
return result
}
// ListFeedsMissingIcons returns every feed whose icon has not been
// fetched yet (the icon column is NULL). Errors from the query, from
// scanning a row, or from iteration are logged and the rows collected
// so far are returned; the caller always gets a non-nil slice.
func (s *Storage) ListFeedsMissingIcons() []Feed {
	result := make([]Feed, 0)
	rows, err := s.db.Query(`
		select id, folder_id, title, description, link, feed_link
		from feeds
		where icon is null
	`)
	if err != nil {
		log.Print(err)
		return result
	}
	// Release the underlying statement/connection even when we bail
	// out early on a scan error (the original leaked it).
	defer rows.Close()
	for rows.Next() {
		var f Feed
		err = rows.Scan(
			&f.Id,
			&f.FolderId,
			&f.Title,
			&f.Description,
			&f.Link,
			&f.FeedLink,
		)
		if err != nil {
			log.Print(err)
			return result
		}
		result = append(result, f)
	}
	// rows.Next() returns false both at end-of-set and on error;
	// surface the latter instead of silently dropping it.
	if err = rows.Err(); err != nil {
		log.Print(err)
	}
	return result
}
func (s *Storage) GetFeed(id int64) *Feed {
var f Feed
err := s.db.QueryRow(`

View File

@ -76,8 +76,16 @@ func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
return result, nil
}
func findFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
candidateUrls := make([]string, 0)
var emptyIcon = make([]byte, 0)
var imageTypes = map[string]bool{
"image/x-icon": true,
"image/png": true,
"image/jpeg": true,
"image/gif": true,
}
func findFavicon(siteUrl, feedUrl string) (*[]byte, error) {
urls := make([]string, 0)
favicon := func(link string) string {
u, err := url.Parse(link)
@ -87,49 +95,43 @@ func findFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
return fmt.Sprintf("%s://%s/favicon.ico", u.Scheme, u.Host)
}
if len(websiteUrl) != 0 {
res, err := client.get(websiteUrl)
if err != nil {
return nil, err
}
body, err := ioutil.ReadAll(res.Body)
if siteUrl != "" {
if res, err := client.get(siteUrl); err == nil {
defer res.Body.Close()
if err != nil {
return nil, err
}
candidateUrls = append(candidateUrls, scraper.FindIcons(string(body), websiteUrl)...)
if c := favicon(websiteUrl); len(c) != 0 {
candidateUrls = append(candidateUrls, c)
if body, err := ioutil.ReadAll(res.Body); err == nil {
urls = append(urls, scraper.FindIcons(string(body), siteUrl)...)
if c := favicon(siteUrl); c != "" {
urls = append(urls, c)
}
}
}
if c := favicon(feedUrl); len(c) != 0 {
candidateUrls = append(candidateUrls, c)
}
imageTypes := [4]string{
"image/x-icon",
"image/png",
"image/jpeg",
"image/gif",
if c := favicon(feedUrl); c != "" {
urls = append(urls, c)
}
for _, url := range candidateUrls {
res, err := client.get(url)
for _, u := range urls {
res, err := client.get(u)
if err != nil {
continue
}
defer res.Body.Close()
if res.StatusCode == 200 {
if content, err := ioutil.ReadAll(res.Body); err == nil {
if res.StatusCode != 200 {
continue
}
content, err := ioutil.ReadAll(res.Body)
if err != nil {
continue
}
ctype := http.DetectContentType(content)
for _, itype := range imageTypes {
if ctype == itype {
if imageTypes[ctype] {
return &content, nil
}
}
}
}
}
return nil, nil
return &emptyIcon, nil
}
func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {

View File

@ -41,11 +41,9 @@ func (w *Worker) StartFeedCleaner() {
func (w *Worker) FindFavicons() {
go func() {
for _, feed := range w.db.ListFeeds() {
if !feed.HasIcon {
for _, feed := range w.db.ListFeedsMissingIcons() {
w.FindFeedFavicon(feed)
}
}
}()
}