mirror of
https://github.com/nkanaev/yarr.git
synced 2025-11-10 03:18:57 +00:00
reorganizing content-related packages
This commit is contained in:
97
src/content/scraper/finder.go
Normal file
97
src/content/scraper/finder.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
func FindFeeds(body string, base string) map[string]string {
|
||||
candidates := make(map[string]string)
|
||||
|
||||
doc, err := html.Parse(strings.NewReader(body))
|
||||
if err != nil {
|
||||
return candidates
|
||||
}
|
||||
|
||||
// find direct links
|
||||
// css: link[type=application/atom+xml]
|
||||
linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
|
||||
isFeedLink := func(n *html.Node) bool {
|
||||
if n.Type == html.ElementNode && n.Data == "link" {
|
||||
t := htmlutil.Attr(n, "type")
|
||||
for _, tt := range linkTypes {
|
||||
if tt == t {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
|
||||
href := htmlutil.Attr(node, "href")
|
||||
name := htmlutil.Attr(node, "title")
|
||||
link := htmlutil.AbsoluteUrl(href, base)
|
||||
if link != "" {
|
||||
candidates[link] = name
|
||||
}
|
||||
}
|
||||
|
||||
// guess by hyperlink properties
|
||||
if len(candidates) == 0 {
|
||||
// css: a[href="feed"]
|
||||
// css: a:contains("rss")
|
||||
feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
|
||||
feedTexts := []string{"rss", "feed"}
|
||||
isFeedHyperLink := func(n *html.Node) bool {
|
||||
if n.Type == html.ElementNode && n.Data == "a" {
|
||||
href := strings.Trim(htmlutil.Attr(n, "href"), "/")
|
||||
for _, feedHref := range feedHrefs {
|
||||
if strings.HasSuffix(href, feedHref) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
text := htmlutil.Text(n)
|
||||
for _, feedText := range feedTexts {
|
||||
if strings.EqualFold(text, feedText) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
|
||||
href := htmlutil.Attr(node, "href")
|
||||
link := htmlutil.AbsoluteUrl(href, base)
|
||||
if link != "" {
|
||||
candidates[link] = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return candidates
|
||||
}
|
||||
|
||||
func FindIcons(body string, base string) []string {
|
||||
icons := make([]string, 0)
|
||||
|
||||
doc, err := html.Parse(strings.NewReader(body))
|
||||
if err != nil {
|
||||
return icons
|
||||
}
|
||||
|
||||
// css: link[rel=icon]
|
||||
isLink := func(n *html.Node) bool {
|
||||
return n.Type == html.ElementNode && n.Data == "link"
|
||||
}
|
||||
for _, node := range htmlutil.FindNodes(doc, isLink) {
|
||||
rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
|
||||
for _, rel := range rels {
|
||||
if strings.EqualFold(rel, "icon") {
|
||||
icons = append(icons, htmlutil.AbsoluteUrl(htmlutil.Attr(node, "href"), base))
|
||||
}
|
||||
}
|
||||
}
|
||||
return icons
|
||||
}
|
||||
97
src/content/scraper/finder_test.go
Normal file
97
src/content/scraper/finder_test.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// base is the document URL the tests use to resolve relative links.
const base = "http://example.com"
|
||||
|
||||
func TestFindFeedsInvalidHTML(t *testing.T) {
|
||||
x := `some nonsense`
|
||||
r := FindFeeds(x, base)
|
||||
if len(r) != 0 {
|
||||
t.Fatal("not expecting results")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindFeedsLinks(t *testing.T) {
|
||||
x := `
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title></title>
|
||||
<link rel="alternate" href="/feed.xml" type="application/rss+xml" title="rss with title">
|
||||
<link rel="alternate" href="/atom.xml" type="application/atom+xml">
|
||||
<link rel="alternate" href="/feed.json" type="application/json">
|
||||
</head>
|
||||
<body>
|
||||
<a href="/feed.xml">rss</a>
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
have := FindFeeds(x, base)
|
||||
|
||||
want := map[string]string{
|
||||
base + "/feed.xml": "rss with title",
|
||||
base + "/atom.xml": "",
|
||||
base + "/feed.json": "",
|
||||
}
|
||||
if !reflect.DeepEqual(have, want) {
|
||||
t.Logf("want: %#v", want)
|
||||
t.Logf("have: %#v", have)
|
||||
t.Fatal("invalid result")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindFeedsGuess(t *testing.T) {
|
||||
body := `
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<body>
|
||||
<!-- negative -->
|
||||
<a href="/about">what is rss?</a>
|
||||
<a href="/feed/cows">moo</a>
|
||||
|
||||
<!-- positive -->
|
||||
<a href="/feed.xml">subscribe</a>
|
||||
<a href="/news">rss</a>
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
have := FindFeeds(body, base)
|
||||
want := map[string]string{
|
||||
base + "/feed.xml": "",
|
||||
base + "/news": "",
|
||||
}
|
||||
if !reflect.DeepEqual(want, have) {
|
||||
t.Logf("want: %#v", want)
|
||||
t.Logf("have: %#v", have)
|
||||
t.Fatal("invalid result")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindIcons(t *testing.T) {
|
||||
body := `
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title></title>
|
||||
<link rel="icon favicon" href="/favicon.ico">
|
||||
<link rel="icon macicon" href="path/to/favicon.png">
|
||||
</head>
|
||||
<body>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
have := FindIcons(body, base)
|
||||
want := []string{base + "/favicon.ico", base + "/path/to/favicon.png"}
|
||||
if !reflect.DeepEqual(have, want) {
|
||||
t.Logf("want: %#v", want)
|
||||
t.Logf("have: %#v", have)
|
||||
t.Fatal("invalid result")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user