From f590c358d21f212d3ed7f86b02fe09e3c7487b3d Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Mon, 5 Apr 2021 11:04:24 +0100 Subject: [PATCH] do not strip out content inside table & code --- doc/todo.txt | 1 - src/content/htmlutil/query.go | 10 ++++++++++ src/content/htmlutil/query_test.go | 25 +++++++++++++++++++++++++ src/content/readability/readability.go | 10 ++++++++-- 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/doc/todo.txt b/doc/todo.txt index f87a029..ef7682d 100644 --- a/doc/todo.txt +++ b/doc/todo.txt @@ -1,5 +1,4 @@ - fix: use only 1 item content field - etc: test new parser extensively -- etc: readability strips out code comments in https://2ality.com/2021/01/looping-over-arrays.html - fix: loading items (by scrolling down) is glitching while feeds are refreshing diff --git a/src/content/htmlutil/query.go b/src/content/htmlutil/query.go index bf17644..8c88814 100644 --- a/src/content/htmlutil/query.go +++ b/src/content/htmlutil/query.go @@ -32,6 +32,16 @@ func Query(node *html.Node, sel string) []*html.Node { return FindNodes(node, matcher.Match) } +func Closest(node *html.Node, sel string) *html.Node { + matcher := NewMatcher(sel) + for cur := node; cur != nil; cur = cur.Parent { + if matcher.Match(cur) { + return cur + } + } + return nil +} + func NewMatcher(sel string) Matcher { multi := MultiMatch{} parts := strings.Split(sel, ",") diff --git a/src/content/htmlutil/query_test.go b/src/content/htmlutil/query_test.go index 60c88ac..b0ab50e 100644 --- a/src/content/htmlutil/query_test.go +++ b/src/content/htmlutil/query_test.go @@ -62,3 +62,28 @@ func TestQueryMulti(t *testing.T) { t.Fatal("incorrect match") } } + +func TestClosest(t *testing.T) { + html, _ := html.Parse(strings.NewReader(` + + + + + + + +
+

+
+ + + `)) + link := Query(html, "a") + if link == nil || Attr(link[0], "class") != "bar" { + t.FailNow() + } + wrap := Closest(link[0], "div") + if wrap == nil || Attr(wrap, "class") != "foo" { + t.FailNow() + } +} diff --git a/src/content/readability/readability.go b/src/content/readability/readability.go index 37b7304..fd4c46d 100644 --- a/src/content/readability/readability.go +++ b/src/content/readability/readability.go @@ -8,7 +8,6 @@ import ( "bytes" "fmt" "io" - //"log" "math" "regexp" "strings" @@ -56,7 +55,10 @@ func ExtractContent(page io.Reader) (string, error) { best := getTopCandidate(scores) if best == nil { - best = root + for _, body := range htmlutil.Query(root, "body") { + best = body + break + } } //log.Printf("[Readability] TopCandidate: %v", topCandidate) @@ -123,6 +125,10 @@ func removeUnlikelyCandidates(root *html.Node) { for _, node := range htmlutil.Query(body[0], "*") { str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id") + if htmlutil.Closest(node, "table,code") != nil { + continue + } + blacklisted := ( blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) &&