do not strip out content inside table & code

This commit is contained in:
Nazar Kanaev 2021-04-05 11:04:24 +01:00
parent fa2fad0ff6
commit f590c358d2
4 changed files with 43 additions and 3 deletions

View File

@ -1,5 +1,4 @@
- fix: use only 1 item content field
- etc: test new parser extensively
- etc: readability strips out code comments in https://2ality.com/2021/01/looping-over-arrays.html
- fix: loading items (by scrolling down) is glitching while feeds are refreshing

View File

@ -32,6 +32,16 @@ func Query(node *html.Node, sel string) []*html.Node {
return FindNodes(node, matcher.Match)
}
// Closest returns the nearest node matching the selector sel, starting the
// search at node itself and then moving up through its Parent links. It
// returns nil when neither node nor any of its ancestors matches.
func Closest(node *html.Node, sel string) *html.Node {
	m := NewMatcher(sel)
	candidate := node
	// Climb until a match is found or the parent chain is exhausted; in the
	// latter case candidate is nil, which is exactly the "not found" result.
	for candidate != nil && !m.Match(candidate) {
		candidate = candidate.Parent
	}
	return candidate
}
func NewMatcher(sel string) Matcher {
multi := MultiMatch{}
parts := strings.Split(sel, ",")

View File

@ -62,3 +62,28 @@ func TestQueryMulti(t *testing.T) {
t.Fatal("incorrect match")
}
}
// TestClosest checks that Closest resolves a selector against the start node
// and its ancestors: it must find the enclosing <div class="foo"> for a nested
// link, match the start node itself, and return nil when nothing matches.
func TestClosest(t *testing.T) {
	// Named doc (not html) so the imported html package is not shadowed.
	doc, err := html.Parse(strings.NewReader(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div class="foo">
<p><a class="bar" href=""></a></p>
</div>
</body>
</html>
`))
	if err != nil {
		t.Fatalf("parsing fixture: %v", err)
	}

	// Check the length rather than nil-ness: an empty non-nil slice would
	// otherwise panic on the index below.
	links := Query(doc, "a")
	if len(links) == 0 {
		t.Fatal("fixture link not found")
	}
	link := links[0]
	if got := Attr(link, "class"); got != "bar" {
		t.Fatalf("link class = %q, want %q", got, "bar")
	}

	wrap := Closest(link, "div")
	if wrap == nil {
		t.Fatal("Closest(link, \"div\") = nil, want the enclosing div")
	}
	if got := Attr(wrap, "class"); got != "foo" {
		t.Fatalf("wrapper class = %q, want %q", got, "foo")
	}

	// The search is inclusive: the start node is tested before its ancestors.
	if self := Closest(link, "a"); self != link {
		t.Fatalf("Closest(link, \"a\") = %v, want the link itself", self)
	}

	// No element in the ancestor chain is a <section>.
	if none := Closest(link, "section"); none != nil {
		t.Fatalf("Closest(link, \"section\") = %v, want nil", none)
	}
}

View File

@ -8,7 +8,6 @@ import (
"bytes"
"fmt"
"io"
//"log"
"math"
"regexp"
"strings"
@ -56,7 +55,10 @@ func ExtractContent(page io.Reader) (string, error) {
best := getTopCandidate(scores)
if best == nil {
best = root
for _, body := range htmlutil.Query(root, "body") {
best = body
break
}
}
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
@ -123,6 +125,10 @@ func removeUnlikelyCandidates(root *html.Node) {
for _, node := range htmlutil.Query(body[0], "*") {
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
if htmlutil.Closest(node, "table,code") != nil {
continue
}
blacklisted := (
blacklistCandidatesRegexp.MatchString(str) ||
(unlikelyCandidatesRegexp.MatchString(str) &&