start rewriting readability

Nazar Kanaev 2021-03-30 11:47:36 +01:00
parent 8c44d2fc87
commit e5920259b6
8 changed files with 238 additions and 69 deletions

bin/reader.go Normal file

@ -0,0 +1,29 @@
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"

	"github.com/nkanaev/yarr/src/reader"
)

func main() {
	if len(os.Args) != 2 {
		fmt.Println("usage: <script> [url]")
		return
	}
	url := os.Args[1]
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("failed to get url %s: %s", url, err)
	}
	defer res.Body.Close()

	content, err := reader.ExtractContent(res.Body)
	if err != nil {
		log.Fatalf("failed to extract content: %s", err)
	}
	fmt.Println(content)
}
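A usage note (an assumption, not part of the commit): since bin/reader.go is a standalone main package, it can presumably be run from a repository checkout as `go run bin/reader.go <url>`; the program fetches the page at the given URL and prints whatever reader.ExtractContent recovers from it.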

src/htmlutil/query.go Normal file

@ -0,0 +1,76 @@
package htmlutil

import (
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

var nodeNameRegex = regexp.MustCompile(`\w+`)

func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
	nodes := make([]*html.Node, 0)

	queue := make([]*html.Node, 0)
	queue = append(queue, node)
	for len(queue) > 0 {
		var n *html.Node
		n, queue = queue[0], queue[1:]
		if match(n) {
			nodes = append(nodes, n)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			queue = append(queue, c)
		}
	}
	return nodes
}

func Query(node *html.Node, sel string) []*html.Node {
	matcher := NewMatcher(sel)
	return FindNodes(node, matcher.Match)
}

func NewMatcher(sel string) Matcher {
	multi := MultiMatch{}
	parts := strings.Split(sel, ",")
	for _, part := range parts {
		part := strings.TrimSpace(part)
		if nodeNameRegex.MatchString(part) {
			multi.Add(ElementMatch{Name: part})
		} else {
			panic("unsupported selector")
		}
	}
	return multi
}

type Matcher interface {
	Match(*html.Node) bool
}

type ElementMatch struct {
	Name string
}

func (m ElementMatch) Match(n *html.Node) bool {
	return n.Type == html.ElementNode && n.Data == m.Name
}

type MultiMatch struct {
	matchers []Matcher
}

func (m *MultiMatch) Add(matcher Matcher) {
	m.matchers = append(m.matchers, matcher)
}

func (m MultiMatch) Match(n *html.Node) bool {
	for _, matcher := range m.matchers {
		if matcher.Match(n) {
			return true
		}
	}
	return false
}
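For orientation, a minimal sketch (not in the commit itself; the markup, predicate and variable names are illustrative) of calling FindNodes directly with a custom match function, which is how the feed-finder code further down uses it:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"

	"github.com/nkanaev/yarr/src/htmlutil"
)

func main() {
	// Illustrative input: two images, only one of which carries a src attribute.
	doc, err := html.Parse(strings.NewReader(`<img src="a.png"><img alt="b">`))
	if err != nil {
		panic(err)
	}
	// FindNodes walks the tree breadth-first and collects every node
	// for which the predicate returns true.
	withSrc := func(n *html.Node) bool {
		return n.Type == html.ElementNode && n.Data == "img" && htmlutil.Attr(n, "src") != ""
	}
	for _, img := range htmlutil.FindNodes(doc, withSrc) {
		fmt.Println(htmlutil.Attr(img, "src")) // prints "a.png"
	}
}

(Attr is defined in utils.go, shown further down in this commit.)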

@ -0,0 +1,64 @@
package htmlutil

import (
	"strings"
	"testing"

	"golang.org/x/net/html"
)

func TestQuery(t *testing.T) {
	node, _ := html.Parse(strings.NewReader(`
		<!DOCTYPE html>
		<html lang="en">
		<head>
			<meta charset="UTF-8">
			<title></title>
		</head>
		<body>
			<div>
				<p>test</p>
			</div>
		</body>
		</html>
	`))
	nodes := Query(node, "p")
	match := (
		len(nodes) == 1 &&
		nodes[0].Type == html.ElementNode &&
		nodes[0].Data == "p")
	if !match {
		t.Fatalf("incorrect match: %#v", nodes)
	}
}

func TestQueryMulti(t *testing.T) {
	node, _ := html.Parse(strings.NewReader(`
		<!DOCTYPE html>
		<html lang="en">
		<head>
			<meta charset="UTF-8">
			<title></title>
		</head>
		<body>
			<p>foo</p>
			<div>
				<p>bar</p>
				<span>baz</span>
			</div>
		</body>
		</html>
	`))
	nodes := Query(node, "p , span")
	match := (
		len(nodes) == 3 &&
		nodes[0].Type == html.ElementNode && nodes[0].Data == "p" &&
		nodes[1].Type == html.ElementNode && nodes[1].Data == "p" &&
		nodes[2].Type == html.ElementNode && nodes[2].Data == "span")
	if !match {
		for i, n := range nodes {
			t.Logf("%d: %s", i, HTML(n))
		}
		t.Fatal("incorrect match")
	}
}

src/htmlutil/utils.go Normal file

@ -0,0 +1,33 @@
package htmlutil

import (
	"strings"

	"golang.org/x/net/html"
)

func HTML(node *html.Node) string {
	writer := strings.Builder{}
	html.Render(&writer, node)
	return writer.String()
}

func Attr(node *html.Node, key string) string {
	for _, a := range node.Attr {
		if a.Key == key {
			return a.Val
		}
	}
	return ""
}

func Text(node *html.Node) string {
	text := make([]string, 0)
	isTextNode := func(n *html.Node) bool {
		return n.Type == html.TextNode
	}
	for _, n := range FindNodes(node, isTextNode) {
		text = append(text, strings.TrimSpace(n.Data))
	}
	return strings.Join(text, " ")
}
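Taken together, query.go and utils.go give the scraper a small query layer over x/net/html trees. A minimal sketch (not in the commit; the markup is illustrative) of how Query, Attr, Text and HTML compose:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"

	"github.com/nkanaev/yarr/src/htmlutil"
)

func main() {
	doc, err := html.Parse(strings.NewReader(`<p>read <a href="/feed.xml">the feed</a></p>`))
	if err != nil {
		panic(err)
	}
	// Query selects elements by tag name; Attr, Text and HTML read the matches.
	for _, a := range htmlutil.Query(doc, "a") {
		fmt.Println(htmlutil.Attr(a, "href")) // "/feed.xml"
		fmt.Println(htmlutil.Text(a))         // "the feed"
		fmt.Println(htmlutil.HTML(a))         // the serialized <a> element
	}
}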

@ -2,7 +2,7 @@
 // Use of this source code is governed by the Apache 2.0
 // license that can be found in the LICENSE file.
 
-package scraper
+package reader
 
 import (
 	"bytes"
@ -13,6 +13,7 @@ import (
 	"regexp"
 	"strings"
 
+	"github.com/nkanaev/yarr/src/htmlutil"
 	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
@ -75,9 +76,12 @@ func ExtractContent(page io.Reader) (string, error) {
 		return "", err
 	}
 
-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		removeNodes(s)
-	})
+	root := document.Get(0)
+	for _, trash := range htmlutil.Query(root, "script,style") {
+		if trash.Parent != nil {
+			trash.Parent.RemoveChild(trash)
+		}
+	}
 
 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@ -142,7 +146,10 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 		str := class + id
 		if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
-			removeNodes(s)
+			node := s.Get(0)
+			if node.Parent != nil {
+				node.Parent.RemoveChild(node)
+			}
 		}
 	})
 }
@ -294,12 +301,3 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 		}
 	})
 }
-
-func removeNodes(s *goquery.Selection) {
-	s.Each(func(i int, s *goquery.Selection) {
-		parent := s.Parent()
-		if parent.Length() > 0 {
-			parent.Get(0).RemoveChild(s.Get(0))
-		}
-	})
-}

@ -3,6 +3,7 @@ package scraper
 import (
 	"strings"
 
+	"github.com/nkanaev/yarr/src/htmlutil"
 	"golang.org/x/net/html"
 )
@ -19,7 +20,7 @@ func FindFeeds(body string, base string) map[string]string {
 	linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
 	isFeedLink := func(n *html.Node) bool {
 		if n.Type == html.ElementNode && n.Data == "link" {
-			t := getAttr(n, "type")
+			t := htmlutil.Attr(n, "type")
 			for _, tt := range linkTypes {
 				if tt == t {
 					return true
@ -28,9 +29,9 @@ func FindFeeds(body string, base string) map[string]string {
 		}
 		return false
 	}
-	for _, node := range getNodes(doc, isFeedLink) {
-		href := getAttr(node, "href")
-		name := getAttr(node, "title")
+	for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
+		href := htmlutil.Attr(node, "href")
+		name := htmlutil.Attr(node, "title")
 		link := absoluteUrl(href, base)
 		if link != "" {
 			candidates[link] = name
@ -45,17 +46,23 @@ func FindFeeds(body string, base string) map[string]string {
 	feedTexts := []string{"rss", "feed"}
 	isFeedHyperLink := func(n *html.Node) bool {
 		if n.Type == html.ElementNode && n.Data == "a" {
-			if any(feedHrefs, strings.Trim(getAttr(n, "href"), "/"), strings.HasSuffix) {
-				return true
+			href := strings.Trim(htmlutil.Attr(n, "href"), "/")
+			for _, feedHref := range feedHrefs {
+				if strings.HasSuffix(href, feedHref) {
+					return true
+				}
 			}
-			if any(feedTexts, getText(n), strings.EqualFold) {
-				return true
+			text := htmlutil.Text(n)
+			for _, feedText := range feedTexts {
+				if strings.EqualFold(text, feedText) {
+					return true
+				}
 			}
 		}
 		return false
 	}
-	for _, node := range getNodes(doc, isFeedHyperLink) {
-		href := getAttr(node, "href")
+	for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
+		href := htmlutil.Attr(node, "href")
 		link := absoluteUrl(href, base)
 		if link != "" {
 			candidates[link] = ""
@ -78,9 +85,12 @@ func FindIcons(body string, base string) []string {
 	isLink := func(n *html.Node) bool {
 		return n.Type == html.ElementNode && n.Data == "link"
 	}
-	for _, node := range getNodes(doc, isLink) {
-		if any(strings.Split(getAttr(node, "rel"), " "), "icon", strings.EqualFold) {
-			icons = append(icons, absoluteUrl(getAttr(node, "href"), base))
+	for _, node := range htmlutil.FindNodes(doc, isLink) {
+		rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
+		for _, rel := range rels {
+			if strings.EqualFold(rel, "icon") {
+				icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base))
+			}
 		}
 	}
 	return icons

@ -15,7 +15,7 @@ import (
 	"golang.org/x/net/html"
 )
 
 var splitSrcsetRegex = regexp.MustCompile(`,\s+`)
 
 // Sanitize returns safe HTML.
 func Sanitize(baseURL, input string) string {

@ -2,9 +2,6 @@ package scraper
 import (
 	"net/url"
-	"strings"
-
-	"golang.org/x/net/html"
 )
 
 func any(els []string, el string, match func(string, string) bool) bool {
@ -16,44 +13,6 @@ func any(els []string, el string, match func(string, string) bool) bool {
 	return false
 }
 
-func getAttr(node *html.Node, key string) string {
-	for _, a := range node.Attr {
-		if a.Key == key {
-			return a.Val
-		}
-	}
-	return ""
-}
-
-func getText(node *html.Node) string {
-	text := make([]string, 0)
-	isTextNode := func(n *html.Node) bool {
-		return n.Type == html.TextNode
-	}
-	for _, n := range getNodes(node, isTextNode) {
-		text = append(text, strings.TrimSpace(n.Data))
-	}
-	return strings.Join(text, " ")
-}
-
-func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
-	nodes := make([]*html.Node, 0)
-	queue := make([]*html.Node, 0)
-	queue = append(queue, node)
-	for len(queue) > 0 {
-		var n *html.Node
-		n, queue = queue[0], queue[1:]
-		if match(n) {
-			nodes = append(nodes, n)
-		}
-		for c := n.FirstChild; c != nil; c = c.NextSibling {
-			queue = append(queue, c)
-		}
-	}
-	return nodes
-}
-
 func absoluteUrl(href, base string) string {
 	baseUrl, err := url.Parse(base)
 	if err != nil {