start rewriting readability
commit e5920259b6, parent 8c44d2fc87 (https://github.com/nkanaev/yarr.git)
bin/reader.go (new file, 29 lines)
@@ -0,0 +1,29 @@
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"

    "github.com/nkanaev/yarr/src/reader"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("usage: <script> [url]")
        return
    }
    url := os.Args[1]
    res, err := http.Get(url)
    if err != nil {
        log.Fatalf("failed to get url %s: %s", url, err)
    }
    defer res.Body.Close()

    content, err := reader.ExtractContent(res.Body)
    if err != nil {
        log.Fatalf("failed to extract content: %s", err)
    }
    fmt.Println(content)
}
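A quick way to exercise the new command, assuming a local checkout with its dependencies fetched (the URL is just an example):

    go run ./bin/reader.go https://example.com/some-article

It prints the extracted article content to stdout, or exits through log.Fatalf if the fetch or the extraction fails.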
src/htmlutil/query.go (new file, 76 lines)
@@ -0,0 +1,76 @@
package htmlutil

import (
    "regexp"
    "strings"

    "golang.org/x/net/html"
)

var nodeNameRegex = regexp.MustCompile(`\w+`)

func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
    nodes := make([]*html.Node, 0)

    queue := make([]*html.Node, 0)
    queue = append(queue, node)
    for len(queue) > 0 {
        var n *html.Node
        n, queue = queue[0], queue[1:]
        if match(n) {
            nodes = append(nodes, n)
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            queue = append(queue, c)
        }
    }
    return nodes
}

func Query(node *html.Node, sel string) []*html.Node {
    matcher := NewMatcher(sel)
    return FindNodes(node, matcher.Match)
}

func NewMatcher(sel string) Matcher {
    multi := MultiMatch{}
    parts := strings.Split(sel, ",")
    for _, part := range parts {
        part := strings.TrimSpace(part)
        if nodeNameRegex.MatchString(part) {
            multi.Add(ElementMatch{Name: part})
        } else {
            panic("unsupported selector")
        }
    }
    return multi
}

type Matcher interface {
    Match(*html.Node) bool
}

type ElementMatch struct {
    Name string
}

func (m ElementMatch) Match(n *html.Node) bool {
    return n.Type == html.ElementNode && n.Data == m.Name
}

type MultiMatch struct {
    matchers []Matcher
}

func (m *MultiMatch) Add(matcher Matcher) {
    m.matchers = append(m.matchers, matcher)
}

func (m MultiMatch) Match(n *html.Node) bool {
    for _, matcher := range m.matchers {
        if matcher.Match(n) {
            return true
        }
    }
    return false
}
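FindNodes is a breadth-first walk over the parsed tree, and Query wraps it with a deliberately tiny selector parser: comma-separated element names only, anything fancier panics. A minimal usage sketch, not part of the commit (the main package and HTML snippet are made up for illustration):

    package main

    import (
        "fmt"
        "strings"

        "golang.org/x/net/html"

        "github.com/nkanaev/yarr/src/htmlutil"
    )

    func main() {
        doc, _ := html.Parse(strings.NewReader(
            `<ul><li><a href="/feed.xml">feed</a></li></ul>`))

        // Element-name selectors, comma-separated. Results come back in
        // breadth-first order, so the <li> precedes the <a> it contains.
        for _, n := range htmlutil.Query(doc, "li, a") {
            fmt.Println(n.Data) // li, then a
        }

        // FindNodes accepts an arbitrary predicate.
        isAnchor := func(n *html.Node) bool {
            return n.Type == html.ElementNode && n.Data == "a"
        }
        fmt.Println(len(htmlutil.FindNodes(doc, isAnchor))) // 1
    }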
src/htmlutil/query_test.go (new file, 64 lines)
@@ -0,0 +1,64 @@
package htmlutil

import (
    "strings"
    "testing"

    "golang.org/x/net/html"
)

func TestQuery(t *testing.T) {
    node, _ := html.Parse(strings.NewReader(`
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title></title>
        </head>
        <body>
            <div>
                <p>test</p>
            </div>
        </body>
        </html>
    `))
    nodes := Query(node, "p")
    match := (
        len(nodes) == 1 &&
            nodes[0].Type == html.ElementNode &&
            nodes[0].Data == "p")
    if !match {
        t.Fatalf("incorrect match: %#v", nodes)
    }
}

func TestQueryMulti(t *testing.T) {
    node, _ := html.Parse(strings.NewReader(`
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title></title>
        </head>
        <body>
            <p>foo</p>
            <div>
                <p>bar</p>
                <span>baz</span>
            </div>
        </body>
        </html>
    `))
    nodes := Query(node, "p , span")
    match := (
        len(nodes) == 3 &&
            nodes[0].Type == html.ElementNode && nodes[0].Data == "p" &&
            nodes[1].Type == html.ElementNode && nodes[1].Data == "p" &&
            nodes[2].Type == html.ElementNode && nodes[2].Data == "span")
    if !match {
        for i, n := range nodes {
            t.Logf("%d: %s", i, HTML(n))
        }
        t.Fatal("incorrect match")
    }
}
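The order asserted in TestQueryMulti (the top-level <p>, then the <p> and <span> inside the <div>) follows directly from the breadth-first traversal in FindNodes: the body's own children are dequeued before the div's children are ever enqueued.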
src/htmlutil/utils.go (new file, 33 lines)
@@ -0,0 +1,33 @@
package htmlutil

import (
    "strings"

    "golang.org/x/net/html"
)

func HTML(node *html.Node) string {
    writer := strings.Builder{}
    html.Render(&writer, node)
    return writer.String()
}

func Attr(node *html.Node, key string) string {
    for _, a := range node.Attr {
        if a.Key == key {
            return a.Val
        }
    }
    return ""
}

func Text(node *html.Node) string {
    text := make([]string, 0)
    isTextNode := func(n *html.Node) bool {
        return n.Type == html.TextNode
    }
    for _, n := range FindNodes(node, isTextNode) {
        text = append(text, strings.TrimSpace(n.Data))
    }
    return strings.Join(text, " ")
}
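Attr and Text, together with FindNodes above, replace the private getAttr, getText, and getNodes helpers that the scraper package drops later in this commit. A hypothetical snippet showing their behavior:

    package main

    import (
        "fmt"
        "strings"

        "golang.org/x/net/html"

        "github.com/nkanaev/yarr/src/htmlutil"
    )

    func main() {
        doc, _ := html.Parse(strings.NewReader(`<a href="/rss">RSS <b>feed</b></a>`))

        link := htmlutil.Query(doc, "a")[0]
        fmt.Println(htmlutil.Attr(link, "href")) // "/rss"

        // Text trims each text node in the subtree and joins them with spaces.
        fmt.Println(htmlutil.Text(link)) // "RSS feed"
    }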
@@ -2,7 +2,7 @@
 // Use of this source code is governed by the Apache 2.0
 // license that can be found in the LICENSE file.

-package scraper
+package reader

 import (
     "bytes"
@@ -13,6 +13,7 @@ import (
     "regexp"
     "strings"

+    "github.com/nkanaev/yarr/src/htmlutil"
     "github.com/PuerkitoBio/goquery"
     "golang.org/x/net/html"
 )
@@ -75,9 +76,12 @@ func ExtractContent(page io.Reader) (string, error) {
         return "", err
     }

-    document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-        removeNodes(s)
-    })
+    root := document.Get(0)
+    for _, trash := range htmlutil.Query(root, "script,style") {
+        if trash.Parent != nil {
+            trash.Parent.RemoveChild(trash)
+        }
+    }

     transformMisusedDivsIntoParagraphs(document)
     removeUnlikelyCandidates(document)
@@ -142,7 +146,10 @@ func removeUnlikelyCandidates(document *goquery.Document) {
         str := class + id

         if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
-            removeNodes(s)
+            node := s.Get(0)
+            if node.Parent != nil {
+                node.Parent.RemoveChild(node)
+            }
         }
     })
 }
@@ -294,12 +301,3 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
         }
     })
 }
-
-func removeNodes(s *goquery.Selection) {
-    s.Each(func(i int, s *goquery.Selection) {
-        parent := s.Parent()
-        if parent.Length() > 0 {
-            parent.Get(0).RemoveChild(s.Get(0))
-        }
-    })
-}
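The rewritten cleanup works on x/net/html nodes directly: htmlutil.Query returns raw *html.Node values, and each one is detached through its parent. The Parent != nil guard is not decorative; (*html.Node).RemoveChild panics if the node is not a child of the receiver, and a node that is the root, or was already detached by an earlier iteration, has no parent at all.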
@@ -3,6 +3,7 @@ package scraper
 import (
     "strings"

+    "github.com/nkanaev/yarr/src/htmlutil"
     "golang.org/x/net/html"
 )

@@ -19,7 +20,7 @@ func FindFeeds(body string, base string) map[string]string {
     linkTypes := []string{"application/atom+xml", "application/rss+xml", "application/json"}
     isFeedLink := func(n *html.Node) bool {
         if n.Type == html.ElementNode && n.Data == "link" {
-            t := getAttr(n, "type")
+            t := htmlutil.Attr(n, "type")
             for _, tt := range linkTypes {
                 if tt == t {
                     return true
@@ -28,9 +29,9 @@ func FindFeeds(body string, base string) map[string]string {
         }
         return false
     }
-    for _, node := range getNodes(doc, isFeedLink) {
-        href := getAttr(node, "href")
-        name := getAttr(node, "title")
+    for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
+        href := htmlutil.Attr(node, "href")
+        name := htmlutil.Attr(node, "title")
         link := absoluteUrl(href, base)
         if link != "" {
             candidates[link] = name
@@ -45,17 +46,23 @@ func FindFeeds(body string, base string) map[string]string {
     feedTexts := []string{"rss", "feed"}
     isFeedHyperLink := func(n *html.Node) bool {
         if n.Type == html.ElementNode && n.Data == "a" {
-            if any(feedHrefs, strings.Trim(getAttr(n, "href"), "/"), strings.HasSuffix) {
-                return true
+            href := strings.Trim(htmlutil.Attr(n, "href"), "/")
+            for _, feedHref := range feedHrefs {
+                if strings.HasSuffix(href, feedHref) {
+                    return true
+                }
             }
-            if any(feedTexts, getText(n), strings.EqualFold) {
-                return true
+            text := htmlutil.Text(n)
+            for _, feedText := range feedTexts {
+                if strings.EqualFold(text, feedText) {
+                    return true
+                }
             }
         }
         return false
     }
-    for _, node := range getNodes(doc, isFeedHyperLink) {
-        href := getAttr(node, "href")
+    for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
+        href := htmlutil.Attr(node, "href")
         link := absoluteUrl(href, base)
         if link != "" {
             candidates[link] = ""
@@ -78,9 +85,12 @@ func FindIcons(body string, base string) []string {
     isLink := func(n *html.Node) bool {
         return n.Type == html.ElementNode && n.Data == "link"
     }
-    for _, node := range getNodes(doc, isLink) {
-        if any(strings.Split(getAttr(node, "rel"), " "), "icon", strings.EqualFold) {
-            icons = append(icons, absoluteUrl(getAttr(node, "href"), base))
+    for _, node := range htmlutil.FindNodes(doc, isLink) {
+        rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
+        for _, rel := range rels {
+            if strings.EqualFold(rel, "icon") {
+                icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base))
+            }
         }
     }
     return icons
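Two things change in the feed finder: traversal and attribute access now come from htmlutil, and the any(slice, value, match) combinator calls are unrolled into explicit loops, so each call site states its question plainly (does the href end in a known feed path, does the link text equal "rss" or "feed", is one of the rel tokens "icon").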
@@ -15,7 +15,7 @@ import (
     "golang.org/x/net/html"
 )

 var splitSrcsetRegex = regexp.MustCompile(`,\s+`)

 // Sanitize returns safe HTML.
 func Sanitize(baseURL, input string) string {
@@ -2,9 +2,6 @@ package scraper

 import (
     "net/url"
-    "strings"
-
-    "golang.org/x/net/html"
 )

 func any(els []string, el string, match func(string, string) bool) bool {
@@ -16,44 +13,6 @@ func any(els []string, el string, match func(string, string) bool) bool {
     return false
 }
-
-func getAttr(node *html.Node, key string) string {
-    for _, a := range node.Attr {
-        if a.Key == key {
-            return a.Val
-        }
-    }
-    return ""
-}
-
-func getText(node *html.Node) string {
-    text := make([]string, 0)
-    isTextNode := func(n *html.Node) bool {
-        return n.Type == html.TextNode
-    }
-    for _, n := range getNodes(node, isTextNode) {
-        text = append(text, strings.TrimSpace(n.Data))
-    }
-    return strings.Join(text, " ")
-}
-
-func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
-    nodes := make([]*html.Node, 0)
-
-    queue := make([]*html.Node, 0)
-    queue = append(queue, node)
-    for len(queue) > 0 {
-        var n *html.Node
-        n, queue = queue[0], queue[1:]
-        if match(n) {
-            nodes = append(nodes, n)
-        }
-        for c := n.FirstChild; c != nil; c = c.NextSibling {
-            queue = append(queue, c)
-        }
-    }
-    return nodes
-}

 func absoluteUrl(href, base string) string {
     baseUrl, err := url.Parse(base)
     if err != nil {
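Net effect of these final hunks: the scraper package sheds its private DOM helpers (getAttr, getText, getNodes), which now live in src/htmlutil as Attr, Text, and FindNodes, so the scraper and the new reader package share a single traversal implementation. The any() combinator itself remains defined in the scraper's utils.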