mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
do not strip out content inside table & code
This commit is contained in:
parent
fa2fad0ff6
commit
f590c358d2
@ -1,5 +1,4 @@
|
||||
- fix: use only 1 item content field
|
||||
- etc: test new parser extensively
|
||||
- etc: readability strips out code comments in https://2ality.com/2021/01/looping-over-arrays.html
|
||||
- fix: loading items (by scrolling down) is glitching while feeds are refreshing
|
||||
|
||||
|
@ -32,6 +32,16 @@ func Query(node *html.Node, sel string) []*html.Node {
|
||||
return FindNodes(node, matcher.Match)
|
||||
}
|
||||
|
||||
func Closest(node *html.Node, sel string) *html.Node {
|
||||
matcher := NewMatcher(sel)
|
||||
for cur := node; cur != nil; cur = cur.Parent {
|
||||
if matcher.Match(cur) {
|
||||
return cur
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func NewMatcher(sel string) Matcher {
|
||||
multi := MultiMatch{}
|
||||
parts := strings.Split(sel, ",")
|
||||
|
@ -62,3 +62,28 @@ func TestQueryMulti(t *testing.T) {
|
||||
t.Fatal("incorrect match")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClosest(t *testing.T) {
|
||||
html, _ := html.Parse(strings.NewReader(`
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title></title>
|
||||
</head>
|
||||
<body>
|
||||
<div class="foo">
|
||||
<p><a class="bar" href=""></a></p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
`))
|
||||
link := Query(html, "a")
|
||||
if link == nil || Attr(link[0], "class") != "bar" {
|
||||
t.FailNow()
|
||||
}
|
||||
wrap := Closest(link[0], "div")
|
||||
if wrap == nil || Attr(wrap, "class") != "foo" {
|
||||
t.FailNow()
|
||||
}
|
||||
}
|
||||
|
@ -8,7 +8,6 @@ import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
//"log"
|
||||
"math"
|
||||
"regexp"
|
||||
"strings"
|
||||
@ -56,7 +55,10 @@ func ExtractContent(page io.Reader) (string, error) {
|
||||
|
||||
best := getTopCandidate(scores)
|
||||
if best == nil {
|
||||
best = root
|
||||
for _, body := range htmlutil.Query(root, "body") {
|
||||
best = body
|
||||
break
|
||||
}
|
||||
}
|
||||
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
|
||||
|
||||
@ -123,6 +125,10 @@ func removeUnlikelyCandidates(root *html.Node) {
|
||||
for _, node := range htmlutil.Query(body[0], "*") {
|
||||
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
|
||||
|
||||
if htmlutil.Closest(node, "table,code") != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
blacklisted := (
|
||||
blacklistCandidatesRegexp.MatchString(str) ||
|
||||
(unlikelyCandidatesRegexp.MatchString(str) &&
|
||||
|
Loading…
x
Reference in New Issue
Block a user