diff --git a/src/crawler/finder.go b/src/crawler/finder.go
new file mode 100644
index 0000000..3ffb48c
--- /dev/null
+++ b/src/crawler/finder.go
@@ -0,0 +1,79 @@
+package crawler
+
+import (
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
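+// FindFeeds returns a map of feed URL to feed title (possibly empty)
+// discovered in the given HTML page. Feeds advertised via <link> tags
+// are preferred; if none are present, hyperlinks whose href or text
+// looks feed-like are used as a fallback guess.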
+func FindFeeds(body string, base string) map[string]string {
+	candidates := make(map[string]string)
+
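+	// html.Parse is lenient and rarely fails; non-HTML input simply yields a tree with no matching elements.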
+	doc, err := html.Parse(strings.NewReader(body))
+	if err != nil {
+		return candidates
+	}
+
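+	// prefer feeds explicitly advertised via <link type="..."> tags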
+	linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
+	isFeedLink := func(n *html.Node) bool {
+		if n.Type == html.ElementNode && n.Data == "link" {
+			t := getAttr(n, "type")
+			for _, tt := range linkTypes {
+				if tt == t {
+					return true
+				}
+			}
+		}
+		return false
+	}
+	for _, node := range getNodes(doc, isFeedLink) {
+		href := getAttr(node, "href")
+		name := getAttr(node, "title")
+		link := absoluteUrl(href, base)
+		if link != "" {
+			candidates[link] = name
+		}
+	}
+
+	if len(candidates) == 0 {
+		// guess by hyperlink properties:
+		// - a[href="feed"]
+		// - a:contains("rss")
+		// ...etc
+		feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
+		feedTexts := []string{"rss", "feed"}
+		isFeedHyperLink := func(n *html.Node) bool {
+			if n.Type == html.ElementNode && n.Data == "a" {
+				href := strings.Trim(getAttr(n, "href"), "/")
+				text := getText(n)
+
+				for _, feedHref := range feedHrefs {
+					if strings.HasSuffix(href, feedHref) {
+						return true
+					}
+				}
+				for _, feedText := range feedTexts {
+					if strings.EqualFold(text, feedText) {
+						return true
+					}
+				}
+			}
+			return false
+		}
+		for _, node := range getNodes(doc, isFeedHyperLink) {
+			href := getAttr(node, "href")
+			link := absoluteUrl(href, base)
+			if link != "" {
+				candidates[link] = ""
+			}
+		}
+	}
+
+	return candidates
+}
diff --git a/src/crawler/finder_test.go b/src/crawler/finder_test.go
new file mode 100644
index 0000000..90a5377
--- /dev/null
+++ b/src/crawler/finder_test.go
@@ -0,0 +1,74 @@
+package crawler
+
+import (
+ "testing"
+ "reflect"
+)
+
+const base = "http://example.com"
+
+func TestFindFeedsInvalidHTML(t *testing.T) {
+	x := `some nonsense`
+	r := FindFeeds(x, base)
+	if len(r) != 0 {
+		t.Fatal("not expecting results")
+	}
+}
+
+func TestFindFeedsLinks(t *testing.T) {
+	x := `
+	<html>
+	<head>
+		<title>example</title>
+		<link rel="alternate" type="application/rss+xml" href="/feed.xml" title="rss with title">
+		<link rel="alternate" type="application/atom+xml" href="/atom.xml">
+		<link rel="alternate" type="application/json" href="/feed.json">
+		<link rel="stylesheet" href="/style.css">
+		<link rel="shortcut icon" href="/favicon.ico">
+	</head>
+	<body>
+		<a href="/feed.xml">rss</a>
+	</body>
+	</html>
+	`
+	have := FindFeeds(x, base)
+
+	want := map[string]string{
+		base + "/feed.xml":  "rss with title",
+		base + "/atom.xml":  "",
+		base + "/feed.json": "",
+	}
+	if !reflect.DeepEqual(have, want) {
+		t.Logf("want: %#v", want)
+		t.Logf("have: %#v", have)
+		t.Fatal("invalid result")
+	}
+}
+
+func TestFindFeedsGuess(t *testing.T) {
+	x := `
+	<html>
+	<body>
+		<p>
+			<a href="/about">what is rss?</a>
+			<a href="/feed.xml">moo</a>
+		</p>
+		<p>
+			<a href="/articles">subscribe</a>
+			<a href="/news">rss</a>
+		</p>
+	</body>
+	</html>
+	`
+	have := FindFeeds(x, base)
+
+	want := map[string]string{
+		base + "/feed.xml": "",
+		base + "/news":     "",
+	}
+	if !reflect.DeepEqual(have, want) {
+		t.Logf("want: %#v", want)
+		t.Logf("have: %#v", have)
+		t.Fatal("invalid result")
+	}
+}
diff --git a/src/crawler/utils.go b/src/crawler/utils.go
new file mode 100644
index 0000000..7291c4d
--- /dev/null
+++ b/src/crawler/utils.go
@@ -0,0 +1,63 @@
+package crawler
+
+import (
+ "net/url"
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
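+// getAttr returns the value of the attribute named key, or "" if the node has no such attribute.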
+func getAttr(node *html.Node, key string) string {
+	for _, a := range node.Attr {
+		if a.Key == key {
+			return a.Val
+		}
+	}
+	return ""
+}
+
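+// getText returns the text content of node and its descendants, with each text node trimmed and the pieces joined by single spaces.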
+func getText(node *html.Node) string {
+	text := make([]string, 0)
+	isTextNode := func(n *html.Node) bool {
+		return n.Type == html.TextNode
+	}
+	for _, n := range getNodes(node, isTextNode) {
+		text = append(text, strings.TrimSpace(n.Data))
+	}
+	return strings.Join(text, " ")
+}
+
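+// getNodes walks the tree rooted at node in breadth-first order and
+// returns every node for which match returns true.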
+func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
+	nodes := make([]*html.Node, 0)
+
+	queue := make([]*html.Node, 0)
+	queue = append(queue, node)
+	for len(queue) > 0 {
+		var n *html.Node
+		n, queue = queue[0], queue[1:]
+		if match(n) {
+			nodes = append(nodes, n)
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			queue = append(queue, c)
+		}
+	}
+	return nodes
+}
+
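+// absoluteUrl resolves href against base, returning "" if either URL fails to parse.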
+func absoluteUrl(href, base string) string {
+	baseUrl, err := url.Parse(base)
+	if err != nil {
+		return ""
+	}
+	hrefUrl, err := url.Parse(href)
+	if err != nil {
+		return ""
+	}
+	return baseUrl.ResolveReference(hrefUrl).String()
+}
diff --git a/src/worker/crawl.go b/src/worker/crawl.go
deleted file mode 100644
index df29d14..0000000
--- a/src/worker/crawl.go
+++ /dev/null
@@ -1,111 +0,0 @@
-package worker
-
-import (
- "net/url"
- "strings"
-
- "golang.org/x/net/html"
-)
-
-func getAttr(node *html.Node, key string) string {
-	for _, a := range node.Attr {
-		if a.Key == key {
-			return a.Val
-		}
-	}
-	return ""
-}
-
-func getText(node *html.Node) string {
-	text := make([]string, 0)
-	isTextNode := func(n *html.Node) bool {
-		return n.Type == html.TextNode
-	}
-	for _, n := range getNodes(node, isTextNode) {
-		text = append(text, strings.TrimSpace(n.Data))
-	}
-	return strings.Join(text, " ")
-}
-
-func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
-	nodes := make([]*html.Node, 0)
-
-	queue := make([]*html.Node, 0)
-	queue = append(queue, node)
-	for len(queue) > 0 {
-		queue, n := queue[1:], queue[0]
-		if match(n) {
-			nodes = append(nodes, n)
-		}
-		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			queue = append(queue, c)
-		}
-	}
-	return nodes
-}
-
-func FindFeeds(doc *html.Node, baseUrl *url.URL) []*FeedSource {
-	candidates := make(map[string]string)
-
-	linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
-	isFeedLink := func(n *html.Node) bool {
-		if n.Type == html.ElementNode && n.Data == "link" {
-			t := getAttr(n, "type")
-			for _, tt := range linkTypes {
-				if tt == t {
-					return true
-				}
-			}
-		}
-		return false
-	}
-	for _, node := range getNodes(doc, isFeedLink) {
-		href := getAttr(node, "href")
-		name := getAttr(node, "title")
-		hrefUrl, _ := url.Parse(href)
-		link := baseUrl.ResolveReference(hrefUrl).String()
-
-		if href != "" {
-			candidates[link] = name
-		}
-	}
-
-	if len(candidates) == 0 {
-		// guess by hyperlink properties:
-		// - a[href="feed"]
-		// - a:contains("rss")
-		// ...etc
-		feedHrefs := []string{"feed", "feed.xml", "rss.xml", "atom.xml"}
-		feedTexts := []string{"rss", "feed"}
-		isFeedHyperLink := func(n *html.Node) bool {
-			if n.Type == html.ElementNode && n.Data == "a" {
-				href := strings.Trim(getAttr(n, "href"), "/")
-				text := strings.ToLower(getText(n))
-
-				for _, feedHref := range feedHrefs {
-					if strings.HasSuffix(href, feedHref) {
-						return true
-					}
-				}
-				for _, feedText := range feedTexts {
-					if strings.Contains(text, feedText) {
-						return true
-					}
-				}
-			}
-			return false
-		}
-		for _, node := range getNodes(doc, isFeedHyperLink) {
-			href := getAttr(node, "href")
-			hrefUrl, _ := url.Parse(href)
-			link := baseUrl.ResolveReference(hrefUrl).String()
-			candidates[link] = ""
-		}
-	}
-
-	sources := make([]*FeedSource, 0, len(candidates))
-	for url, title := range candidates {
-		sources = append(sources, &FeedSource{Title: title, Url: url})
-	}
-	return sources
-}