do not strip out content inside table & code

This commit is contained in:
Nazar Kanaev
2021-04-05 11:04:24 +01:00
parent fa2fad0ff6
commit f590c358d2
4 changed files with 43 additions and 3 deletions

View File

@@ -32,6 +32,16 @@ func Query(node *html.Node, sel string) []*html.Node {
return FindNodes(node, matcher.Match)
}
// Closest returns the nearest node accepted by the matcher built from sel,
// starting at node itself and then following Parent links upward.
// It returns nil when node is nil or when no node on that chain matches.
func Closest(node *html.Node, sel string) *html.Node {
	m := NewMatcher(sel)
	candidate := node
	for {
		if candidate == nil {
			// Ran off the top of the tree without a match.
			return nil
		}
		if m.Match(candidate) {
			return candidate
		}
		candidate = candidate.Parent
	}
}
func NewMatcher(sel string) Matcher {
multi := MultiMatch{}
parts := strings.Split(sel, ",")

View File

@@ -62,3 +62,28 @@ func TestQueryMulti(t *testing.T) {
t.Fatal("incorrect match")
}
}
// TestClosest verifies that Closest climbs from a nested <a> element to its
// nearest enclosing <div>, and that it returns nil when no ancestor matches
// the selector.
func TestClosest(t *testing.T) {
	// Named doc (not html) so the imported html package is not shadowed.
	doc, err := html.Parse(strings.NewReader(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div class="foo">
<p><a class="bar" href=""></a></p>
</div>
</body>
</html>
`))
	if err != nil {
		t.Fatalf("parsing fixture: %v", err)
	}

	links := Query(doc, "a")
	// Test the length rather than nil-ness: an empty but non-nil slice
	// would otherwise panic on the index expression below.
	if len(links) == 0 {
		t.Fatal("no <a> element found in fixture")
	}
	if got := Attr(links[0], "class"); got != "bar" {
		t.Fatalf("unexpected class on <a>: got %q, want %q", got, "bar")
	}

	wrap := Closest(links[0], "div")
	if wrap == nil {
		t.Fatal("Closest found no enclosing <div>")
	}
	if got := Attr(wrap, "class"); got != "foo" {
		t.Fatalf("unexpected class on closest <div>: got %q, want %q", got, "foo")
	}

	// The fixture contains no <table>, so the walk must reach the root
	// and report no match.
	if got := Closest(links[0], "table"); got != nil {
		t.Fatalf("Closest matched a <table> that is not in the fixture: %v", got)
	}
}

View File

@@ -8,7 +8,6 @@ import (
"bytes"
"fmt"
"io"
//"log"
"math"
"regexp"
"strings"
@@ -56,7 +55,10 @@ func ExtractContent(page io.Reader) (string, error) {
best := getTopCandidate(scores)
if best == nil {
best = root
for _, body := range htmlutil.Query(root, "body") {
best = body
break
}
}
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
@@ -123,6 +125,10 @@ func removeUnlikelyCandidates(root *html.Node) {
for _, node := range htmlutil.Query(body[0], "*") {
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
if htmlutil.Closest(node, "table,code") != nil {
continue
}
blacklisted := (
blacklistCandidatesRegexp.MatchString(str) ||
(unlikelyCandidatesRegexp.MatchString(str) &&