Do not strip out content inside <table> and <code> elements

This commit is contained in:
Nazar Kanaev
2021-04-05 11:04:24 +01:00
parent fa2fad0ff6
commit f590c358d2
4 changed files with 43 additions and 3 deletions

View File

@@ -8,7 +8,6 @@ import (
"bytes"
"fmt"
"io"
//"log"
"math"
"regexp"
"strings"
@@ -56,7 +55,10 @@ func ExtractContent(page io.Reader) (string, error) {
best := getTopCandidate(scores)
if best == nil {
best = root
for _, body := range htmlutil.Query(root, "body") {
best = body
break
}
}
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
@@ -123,6 +125,10 @@ func removeUnlikelyCandidates(root *html.Node) {
for _, node := range htmlutil.Query(body[0], "*") {
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
if htmlutil.Closest(node, "table,code") != nil {
continue
}
blacklisted := (
blacklistCandidatesRegexp.MatchString(str) ||
(unlikelyCandidatesRegexp.MatchString(str) &&