Commit e5920259b6 ("start rewriting readability"), parent 8c44d2fc87
Mirror of https://github.com/nkanaev/yarr.git, synced 2025-05-24 00:33:14 +00:00
bin/reader.go (new file, 29 lines)
@@ -0,0 +1,29 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+
+	"github.com/nkanaev/yarr/src/reader"
+)
+
+func main() {
+	if len(os.Args) != 2 {
+		fmt.Println("usage: <script> [url]")
+		return
+	}
+	url := os.Args[1]
+	res, err := http.Get(url)
+	if err != nil {
+		log.Fatalf("failed to get url %s: %s", url, err)
+	}
+	defer res.Body.Close()
+
+	content, err := reader.ExtractContent(res.Body)
+	if err != nil {
+		log.Fatalf("failed to extract content: %s", err)
+	}
+	fmt.Println(content)
+}
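
Usage note: assuming the repository layout shown here, this debugging helper is presumably run as "go run bin/reader.go <url>"; it fetches the page over HTTP and prints the readability-extracted content to stdout.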
src/htmlutil/query.go (new file, 76 lines)
@@ -0,0 +1,76 @@
+package htmlutil
+
+import (
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+var nodeNameRegex = regexp.MustCompile(`\w+`)
+
+func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
+	nodes := make([]*html.Node, 0)
+
+	queue := make([]*html.Node, 0)
+	queue = append(queue, node)
+	for len(queue) > 0 {
+		var n *html.Node
+		n, queue = queue[0], queue[1:]
+		if match(n) {
+			nodes = append(nodes, n)
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			queue = append(queue, c)
+		}
+	}
+	return nodes
+}
+
+func Query(node *html.Node, sel string) []*html.Node {
+	matcher := NewMatcher(sel)
+	return FindNodes(node, matcher.Match)
+}
+
+func NewMatcher(sel string) Matcher {
+	multi := MultiMatch{}
+	parts := strings.Split(sel, ",")
+	for _, part := range parts {
+		part := strings.TrimSpace(part)
+		if nodeNameRegex.MatchString(part) {
+			multi.Add(ElementMatch{Name: part})
+		} else {
+			panic("unsupported selector")
+		}
+	}
+	return multi
+}
+
+type Matcher interface {
+	Match(*html.Node) bool
+}
+
+type ElementMatch struct {
+	Name string
+}
+
+func (m ElementMatch) Match(n *html.Node) bool {
+	return n.Type == html.ElementNode && n.Data == m.Name
+}
+
+type MultiMatch struct {
+	matchers []Matcher
+}
+
+func (m *MultiMatch) Add(matcher Matcher) {
+	m.matchers = append(m.matchers, matcher)
+}
+
+func (m MultiMatch) Match(n *html.Node) bool {
+	for _, matcher := range m.matchers {
+		if matcher.Match(n) {
+			return true
+		}
+	}
+	return false
+}
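
For orientation, here is a minimal sketch of how the new query API composes (the sample document and selector are illustrative, not part of the commit):

package main

import (
	"fmt"
	"strings"

	"github.com/nkanaev/yarr/src/htmlutil"
	"golang.org/x/net/html"
)

func main() {
	// html.Parse wraps fragments in a full html/head/body tree.
	doc, err := html.Parse(strings.NewReader("<p>one</p><div><span>two</span></div>"))
	if err != nil {
		panic(err)
	}
	// "p, span" is split on commas into ElementMatch matchers combined
	// by MultiMatch; FindNodes then walks the tree breadth-first.
	for _, n := range htmlutil.Query(doc, "p, span") {
		fmt.Println(htmlutil.HTML(n)) // HTML comes from utils.go below
	}
}

NewMatcher recognizes only plain element names at this stage and panics on unsupported selectors, so the query language stays deliberately minimal.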
src/htmlutil/query_test.go (new file, 64 lines)
@@ -0,0 +1,64 @@
+package htmlutil
+
+import (
+	"strings"
+	"testing"
+
+	"golang.org/x/net/html"
+)
+
+func TestQuery(t *testing.T) {
+	node, _ := html.Parse(strings.NewReader(`
+		<!DOCTYPE html>
+		<html lang="en">
+		<head>
+			<meta charset="UTF-8">
+			<title></title>
+		</head>
+		<body>
+			<div>
+				<p>test</p>
+			</div>
+		</body>
+		</html>
+	`))
+	nodes := Query(node, "p")
+	match := (
+		len(nodes) == 1 &&
+		nodes[0].Type == html.ElementNode &&
+		nodes[0].Data == "p")
+	if !match {
+		t.Fatalf("incorrect match: %#v", nodes)
+	}
+}
+
+func TestQueryMulti(t *testing.T) {
+	node, _ := html.Parse(strings.NewReader(`
+		<!DOCTYPE html>
+		<html lang="en">
+		<head>
+			<meta charset="UTF-8">
+			<title></title>
+		</head>
+		<body>
+			<p>foo</p>
+			<div>
+				<p>bar</p>
+				<span>baz</span>
+			</div>
+		</body>
+		</html>
+	`))
+	nodes := Query(node, "p , span")
+	match := (
+		len(nodes) == 3 &&
+		nodes[0].Type == html.ElementNode && nodes[0].Data == "p" &&
+		nodes[1].Type == html.ElementNode && nodes[1].Data == "p" &&
+		nodes[2].Type == html.ElementNode && nodes[2].Data == "span")
+	if !match {
+		for i, n := range nodes {
+			t.Logf("%d: %s", i, HTML(n))
+		}
+		t.Fatal("incorrect match")
+	}
+}
src/htmlutil/utils.go (new file, 33 lines)
@@ -0,0 +1,33 @@
+package htmlutil
+
+import (
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+func HTML(node *html.Node) string {
+	writer := strings.Builder{}
+	html.Render(&writer, node)
+	return writer.String()
+}
+
+func Attr(node *html.Node, key string) string {
+	for _, a := range node.Attr {
+		if a.Key == key {
+			return a.Val
+		}
+	}
+	return ""
+}
+
+func Text(node *html.Node) string {
+	text := make([]string, 0)
+	isTextNode := func(n *html.Node) bool {
+		return n.Type == html.TextNode
+	}
+	for _, n := range FindNodes(node, isTextNode) {
+		text = append(text, strings.TrimSpace(n.Data))
+	}
+	return strings.Join(text, " ")
+}
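
A small sketch of these helpers in use (the markup is illustrative): Attr does a linear scan over a node's attribute list, and Text joins the trimmed contents of every descendant text node:

package main

import (
	"fmt"
	"strings"

	"github.com/nkanaev/yarr/src/htmlutil"
	"golang.org/x/net/html"
)

func main() {
	doc, _ := html.Parse(strings.NewReader(`<a href="/feed.xml">RSS <b>feed</b></a>`))
	link := htmlutil.Query(doc, "a")[0]
	fmt.Println(htmlutil.Attr(link, "href")) // -> /feed.xml
	fmt.Println(htmlutil.Text(link))         // -> RSS feed
}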
(modified file; name not preserved in this capture)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by the Apache 2.0
 // license that can be found in the LICENSE file.

-package scraper
+package reader

 import (
 	"bytes"
@@ -13,6 +13,7 @@ import (
 	"regexp"
 	"strings"

+	"github.com/nkanaev/yarr/src/htmlutil"
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
@@ -75,9 +76,12 @@ func ExtractContent(page io.Reader) (string, error) {
 		return "", err
 	}

-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		removeNodes(s)
-	})
+	root := document.Get(0)
+	for _, trash := range htmlutil.Query(root, "script,style") {
+		if trash.Parent != nil {
+			trash.Parent.RemoveChild(trash)
+		}
+	}

 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@@ -142,7 +146,10 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 		str := class + id

 		if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
-			removeNodes(s)
+			node := s.Get(0)
+			if node.Parent != nil {
+				node.Parent.RemoveChild(node)
+			}
 		}
 	})
 }
@@ -294,12 +301,3 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 		}
 	})
 }
-
-func removeNodes(s *goquery.Selection) {
-	s.Each(func(i int, s *goquery.Selection) {
-		parent := s.Parent()
-		if parent.Length() > 0 {
-			parent.Get(0).RemoveChild(s.Get(0))
-		}
-	})
-}
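
Net effect of the hunks above: ExtractContent now strips script/style elements and unlikely candidates by detaching nodes directly (Parent.RemoveChild on the underlying html.Node tree), which lets the goquery-specific removeNodes helper be deleted. Removing nodes mid-loop is safe here because htmlutil.Query collects all matches into a slice before iteration begins.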
(modified file; name not preserved in this capture)
@@ -3,6 +3,7 @@ package scraper
 import (
 	"strings"

+	"github.com/nkanaev/yarr/src/htmlutil"
 	"golang.org/x/net/html"
 )

@@ -19,7 +20,7 @@ func FindFeeds(body string, base string) map[string]string {
 	linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
 	isFeedLink := func(n *html.Node) bool {
 		if n.Type == html.ElementNode && n.Data == "link" {
-			t := getAttr(n, "type")
+			t := htmlutil.Attr(n, "type")
 			for _, tt := range linkTypes {
 				if tt == t {
 					return true
@@ -28,9 +29,9 @@ func FindFeeds(body string, base string) map[string]string {
 		}
 		return false
 	}
-	for _, node := range getNodes(doc, isFeedLink) {
-		href := getAttr(node, "href")
-		name := getAttr(node, "title")
+	for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
+		href := htmlutil.Attr(node, "href")
+		name := htmlutil.Attr(node, "title")
 		link := absoluteUrl(href, base)
 		if link != "" {
 			candidates[link] = name
@@ -45,17 +46,23 @@ func FindFeeds(body string, base string) map[string]string {
 	feedTexts := []string{"rss", "feed"}
 	isFeedHyperLink := func(n *html.Node) bool {
 		if n.Type == html.ElementNode && n.Data == "a" {
-			if any(feedHrefs, strings.Trim(getAttr(n, "href"), "/"), strings.HasSuffix) {
-				return true
+			href := strings.Trim(htmlutil.Attr(n, "href"), "/")
+			for _, feedHref := range feedHrefs {
+				if strings.HasSuffix(href, feedHref) {
+					return true
+				}
 			}
-			if any(feedTexts, getText(n), strings.EqualFold) {
-				return true
+			text := htmlutil.Text(n)
+			for _, feedText := range feedTexts {
+				if strings.EqualFold(text, feedText) {
+					return true
+				}
 			}
 		}
 		return false
 	}
-	for _, node := range getNodes(doc, isFeedHyperLink) {
-		href := getAttr(node, "href")
+	for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
+		href := htmlutil.Attr(node, "href")
 		link := absoluteUrl(href, base)
 		if link != "" {
 			candidates[link] = ""
@@ -78,9 +85,12 @@ func FindIcons(body string, base string) []string {
 	isLink := func(n *html.Node) bool {
 		return n.Type == html.ElementNode && n.Data == "link"
 	}
-	for _, node := range getNodes(doc, isLink) {
-		if any(strings.Split(getAttr(node, "rel"), " "), "icon", strings.EqualFold) {
-			icons = append(icons, absoluteUrl(getAttr(node, "href"), base))
+	for _, node := range htmlutil.FindNodes(doc, isLink) {
+		rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
+		for _, rel := range rels {
+			if strings.EqualFold(rel, "icon") {
+				icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base))
+			}
 		}
 	}
 	return icons
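
The feed-finder hunks above replace the package-local getAttr/getText/getNodes helpers with their htmlutil counterparts (Attr, Text, FindNodes) and unroll the generic any(...) calls into explicit loops; the now-unused helpers are deleted from the scraper package in the last diff below.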
(modified file; name not preserved, and the one-line change to splitSrcsetRegex renders identically in this capture)
@@ -15,7 +15,7 @@ import (
 	"golang.org/x/net/html"
 )

-var splitSrcsetRegex = regexp.MustCompile(`,\s+`)
+var splitSrcsetRegex = regexp.MustCompile(`,\s+`)

 // Sanitize returns safe HTML.
 func Sanitize(baseURL, input string) string {
(modified file; name not preserved in this capture)
@@ -2,9 +2,6 @@ package scraper

 import (
 	"net/url"
-	"strings"
-
-	"golang.org/x/net/html"
 )

@@ -16,44 +13,6 @@ func any(els []string, el string, match func(string, string) bool) bool {
 	return false
 }

-func getAttr(node *html.Node, key string) string {
-	for _, a := range node.Attr {
-		if a.Key == key {
-			return a.Val
-		}
-	}
-	return ""
-}
-
-func getText(node *html.Node) string {
-	text := make([]string, 0)
-	isTextNode := func(n *html.Node) bool {
-		return n.Type == html.TextNode
-	}
-	for _, n := range getNodes(node, isTextNode) {
-		text = append(text, strings.TrimSpace(n.Data))
-	}
-	return strings.Join(text, " ")
-}
-
-func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
-	nodes := make([]*html.Node, 0)
-
-	queue := make([]*html.Node, 0)
-	queue = append(queue, node)
-	for len(queue) > 0 {
-		var n *html.Node
-		n, queue = queue[0], queue[1:]
-		if match(n) {
-			nodes = append(nodes, n)
-		}
-		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			queue = append(queue, c)
-		}
-	}
-	return nodes
-}
-
 func absoluteUrl(href, base string) string {
 	baseUrl, err := url.Parse(base)
 	if err != nil {