mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
do not strip out content inside table & code
This commit is contained in:
parent
fa2fad0ff6
commit
f590c358d2
@ -1,5 +1,4 @@
|
|||||||
- fix: use only 1 item content field
|
- fix: use only 1 item content field
|
||||||
- etc: test new parser extensively
|
- etc: test new parser extensively
|
||||||
- etc: readability strips out code comments in https://2ality.com/2021/01/looping-over-arrays.html
|
|
||||||
- fix: loading items (by scrolling down) is glitching while feeds are refreshing
|
- fix: loading items (by scrolling down) is glitching while feeds are refreshing
|
||||||
|
|
||||||
|
@ -32,6 +32,16 @@ func Query(node *html.Node, sel string) []*html.Node {
|
|||||||
return FindNodes(node, matcher.Match)
|
return FindNodes(node, matcher.Match)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Closest(node *html.Node, sel string) *html.Node {
|
||||||
|
matcher := NewMatcher(sel)
|
||||||
|
for cur := node; cur != nil; cur = cur.Parent {
|
||||||
|
if matcher.Match(cur) {
|
||||||
|
return cur
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func NewMatcher(sel string) Matcher {
|
func NewMatcher(sel string) Matcher {
|
||||||
multi := MultiMatch{}
|
multi := MultiMatch{}
|
||||||
parts := strings.Split(sel, ",")
|
parts := strings.Split(sel, ",")
|
||||||
|
@ -62,3 +62,28 @@ func TestQueryMulti(t *testing.T) {
|
|||||||
t.Fatal("incorrect match")
|
t.Fatal("incorrect match")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestClosest(t *testing.T) {
|
||||||
|
html, _ := html.Parse(strings.NewReader(`
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title></title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="foo">
|
||||||
|
<p><a class="bar" href=""></a></p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`))
|
||||||
|
link := Query(html, "a")
|
||||||
|
if link == nil || Attr(link[0], "class") != "bar" {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
wrap := Closest(link[0], "div")
|
||||||
|
if wrap == nil || Attr(wrap, "class") != "foo" {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -8,7 +8,6 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
//"log"
|
|
||||||
"math"
|
"math"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
@ -56,7 +55,10 @@ func ExtractContent(page io.Reader) (string, error) {
|
|||||||
|
|
||||||
best := getTopCandidate(scores)
|
best := getTopCandidate(scores)
|
||||||
if best == nil {
|
if best == nil {
|
||||||
best = root
|
for _, body := range htmlutil.Query(root, "body") {
|
||||||
|
best = body
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
|
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
|
||||||
|
|
||||||
@ -123,6 +125,10 @@ func removeUnlikelyCandidates(root *html.Node) {
|
|||||||
for _, node := range htmlutil.Query(body[0], "*") {
|
for _, node := range htmlutil.Query(body[0], "*") {
|
||||||
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
|
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
|
||||||
|
|
||||||
|
if htmlutil.Closest(node, "table,code") != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
blacklisted := (
|
blacklisted := (
|
||||||
blacklistCandidatesRegexp.MatchString(str) ||
|
blacklistCandidatesRegexp.MatchString(str) ||
|
||||||
(unlikelyCandidatesRegexp.MatchString(str) &&
|
(unlikelyCandidatesRegexp.MatchString(str) &&
|
||||||
|
Loading…
x
Reference in New Issue
Block a user