finally getting rid of goquery in readability

This commit is contained in:
Nazar Kanaev 2021-03-30 21:50:04 +01:00
parent 37ddde1765
commit 401668e413
4 changed files with 45 additions and 78 deletions

1
go.mod
View File

@@ -3,7 +3,6 @@ module github.com/nkanaev/yarr
go 1.16 go 1.16
require ( require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/mattn/go-sqlite3 v1.14.0 github.com/mattn/go-sqlite3 v1.14.0
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13 golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13

2
go.sum
View File

@@ -1,6 +1,4 @@
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA= github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus= github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=

View File

@@ -14,7 +14,6 @@ import (
"strings" "strings"
"github.com/nkanaev/yarr/src/htmlutil" "github.com/nkanaev/yarr/src/htmlutil"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html" "golang.org/x/net/html"
) )
@@ -34,51 +33,15 @@ var (
positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`) positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
) )
type candidate struct { type nodeScores map[*html.Node]float32
selection *goquery.Selection
score float32
}
func (c *candidate) Node() *html.Node {
return c.selection.Get(0)
}
type scorelist map[*html.Node]float32
func (c *candidate) String() string {
id, _ := c.selection.Attr("id")
class, _ := c.selection.Attr("class")
if id != "" && class != "" {
return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
} else if id != "" {
return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
} else if class != "" {
return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
}
return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
}
type candidateList map[*html.Node]*candidate
func (c candidateList) String() string {
var output []string
for _, candidate := range c {
output = append(output, candidate.String())
}
return strings.Join(output, ", ")
}
// ExtractContent returns relevant content. // ExtractContent returns relevant content.
func ExtractContent(page io.Reader) (string, error) { func ExtractContent(page io.Reader) (string, error) {
document, err := goquery.NewDocumentFromReader(page) root, err := html.Parse(page)
if err != nil { if err != nil {
return "", err return "", err
} }
root := document.Get(0)
for _, trash := range htmlutil.Query(root, "script,style") { for _, trash := range htmlutil.Query(root, "script,style") {
if trash.Parent != nil { if trash.Parent != nil {
trash.Parent.RemoveChild(trash) trash.Parent.RemoveChild(trash)
@@ -97,31 +60,39 @@ func ExtractContent(page io.Reader) (string, error) {
} }
//log.Printf("[Readability] TopCandidate: %v", topCandidate) //log.Printf("[Readability] TopCandidate: %v", topCandidate)
output := getArticle(root, best, scores) output := getArticle(best, scores)
return output, nil return output, nil
} }
// Now that we have the top candidate, look through its siblings for content that might also be related. // Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc. // Things like preambles, content split by ads that we removed, etc.
func getArticle(root, best *html.Node, scores scorelist) string { func getArticle(best *html.Node, scores nodeScores) string {
selection := goquery.NewDocumentFromNode(root).FindNodes(best)
topCandidate := &candidate{selection: selection, score: scores[best]}
output := bytes.NewBufferString("<div>") output := bytes.NewBufferString("<div>")
siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2))) siblingScoreThreshold := float32(math.Max(10, float64(scores[best]*.2)))
topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) { nodelist := make([]*html.Node, 0)
append := false nodelist = append(nodelist, best)
node := s.Get(0)
if node == topCandidate.Node() { // Get the candidate's siblings
append = true for n := best.NextSibling; n != nil; n = n.NextSibling {
} else if score, ok := scores[node]; ok && score >= siblingScoreThreshold { nodelist = append(nodelist, n)
append = true }
for n := best.PrevSibling; n != nil; n = n.PrevSibling {
nodelist = append(nodelist, n)
} }
if s.Is("p") { for _, node := range nodelist {
linkDensity := getLinkDensity(s.Get(0)) append := false
content := s.Text() isP := node.Data == "p"
if node == best {
append = true
} else if scores[node] >= siblingScoreThreshold {
append = true
} else {
if isP {
linkDensity := getLinkDensity(node)
content := htmlutil.Text(node)
contentLength := len(content) contentLength := len(content)
if contentLength >= 80 && linkDensity < .25 { if contentLength >= 80 && linkDensity < .25 {
@@ -130,17 +101,15 @@ func getArticle(root, best *html.Node, scores scorelist) string {
append = true append = true
} }
} }
}
if append { if append {
tag := "div" tag := "div"
if s.Is("p") { if isP {
tag = node.Data tag = "p"
}
fmt.Fprintf(output, "<%s>%s</%s>", tag, htmlutil.InnerHTML(node), tag)
} }
html, _ := s.Html()
fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
} }
})
output.Write([]byte("</div>")) output.Write([]byte("</div>"))
return output.String() return output.String()
@@ -164,26 +133,26 @@ func removeUnlikelyCandidates(root *html.Node) {
} }
} }
func getTopCandidate(scores scorelist) *html.Node { func getTopCandidate(scores nodeScores) *html.Node {
var best *html.Node var top *html.Node
var maxScore float32 var max float32
for node, score := range scores { for node, score := range scores {
if score > maxScore { if score > max {
best = node top = node
maxScore = score max = score
} }
} }
return best return top
} }
// Loop through all paragraphs, and assign a score to them based on how content-y they look. // Loop through all paragraphs, and assign a score to them based on how content-y they look.
// Then add their score to their parent node. // Then add their score to their parent node.
// A score is determined by things like number of commas, class names, etc. // A score is determined by things like number of commas, class names, etc.
// Maybe eventually link density. // Maybe eventually link density.
func getCandidates(root *html.Node) scorelist { func getCandidates(root *html.Node) nodeScores {
scores := make(scorelist) scores := make(nodeScores)
for _, node := range htmlutil.Query(root, defaultTagsToScore) { for _, node := range htmlutil.Query(root, defaultTagsToScore) {
text := htmlutil.Text(node) text := htmlutil.Text(node)

View File

@@ -10,6 +10,7 @@ import (
"github.com/nkanaev/yarr/src/assets" "github.com/nkanaev/yarr/src/assets"
"github.com/nkanaev/yarr/src/auth" "github.com/nkanaev/yarr/src/auth"
"github.com/nkanaev/yarr/src/opml" "github.com/nkanaev/yarr/src/opml"
"github.com/nkanaev/yarr/src/reader"
"github.com/nkanaev/yarr/src/router" "github.com/nkanaev/yarr/src/router"
"github.com/nkanaev/yarr/src/scraper" "github.com/nkanaev/yarr/src/scraper"
"github.com/nkanaev/yarr/src/storage" "github.com/nkanaev/yarr/src/storage"
@@ -415,7 +416,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
return return
} }
defer res.Body.Close() defer res.Body.Close()
content, err := scraper.ExtractContent(res.Body) content, err := reader.ExtractContent(res.Body)
if err != nil { if err != nil {
log.Print(err) log.Print(err)
c.Out.WriteHeader(http.StatusNoContent) c.Out.WriteHeader(http.StatusNoContent)