Repository: https://github.com/nkanaev/yarr.git
Commit 401668e413 (parent 37ddde1765)

    finally getting rid of goquery in readability
go.mod (1 deletion)

@@ -3,7 +3,6 @@ module github.com/nkanaev/yarr
 
 go 1.16
 
 require (
-	github.com/PuerkitoBio/goquery v1.5.1
 	github.com/mattn/go-sqlite3 v1.14.0
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
 	golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13
go.sum (2 deletions)

@@ -1,6 +1,4 @@
-github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
 github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
-github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
 github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
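
Note: dropping goquery also retires its selector engine, andybalholm/cascadia. In go.sum only the h1: content hashes disappear; the leftover /go.mod hash lines are harmless to the go tool, and a later go mod tidy can drop them. The replacement parsing entry point is golang.org/x/net/html. A minimal self-contained sketch of the API swap (the input string is illustrative):

    package main

    import (
        "fmt"
        "log"
        "strings"

        "golang.org/x/net/html"
    )

    func main() {
        // html.Parse returns the document root as a bare *html.Node --
        // the same node that goquery.NewDocumentFromReader(page).Get(0)
        // used to wrap.
        root, err := html.Parse(strings.NewReader("<p>hello</p>"))
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(root.Type == html.DocumentNode) // true
    }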
@@ -14,7 +14,6 @@ import (
 	"strings"
 
 	"github.com/nkanaev/yarr/src/htmlutil"
-	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
 
@@ -34,51 +33,15 @@ var (
 	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
 )
 
-type candidate struct {
-	selection *goquery.Selection
-	score     float32
-}
-
-func (c *candidate) Node() *html.Node {
-	return c.selection.Get(0)
-}
-
-type scorelist map[*html.Node]float32
-
-func (c *candidate) String() string {
-	id, _ := c.selection.Attr("id")
-	class, _ := c.selection.Attr("class")
-
-	if id != "" && class != "" {
-		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
-	} else if id != "" {
-		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
-	} else if class != "" {
-		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
-	}
-
-	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
-}
-
-type candidateList map[*html.Node]*candidate
-
-func (c candidateList) String() string {
-	var output []string
-	for _, candidate := range c {
-		output = append(output, candidate.String())
-	}
-
-	return strings.Join(output, ", ")
-}
+type nodeScores map[*html.Node]float32
 
 // ExtractContent returns relevant content.
 func ExtractContent(page io.Reader) (string, error) {
-	document, err := goquery.NewDocumentFromReader(page)
+	root, err := html.Parse(page)
 	if err != nil {
 		return "", err
 	}
 
-	root := document.Get(0)
 	for _, trash := range htmlutil.Query(root, "script,style") {
 		if trash.Parent != nil {
 			trash.Parent.RemoveChild(trash)
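
Note: with goquery gone, tree surgery happens directly on *html.Node. htmlutil.Query above is yarr's own helper and its body is not part of this commit; the sketch below substitutes a hypothetical matcher to show the same detach pattern, including why the trash.Parent != nil guard and the collect-then-remove order matter (removing while walking would skip siblings):

    package main

    import (
        "os"
        "strings"

        "golang.org/x/net/html"
    )

    // removeAll detaches every element whose tag is in the set -- a
    // stand-in for htmlutil.Query(root, "script,style") plus removal.
    func removeAll(root *html.Node, tags map[string]bool) {
        var doomed []*html.Node
        var walk func(n *html.Node)
        walk = func(n *html.Node) {
            if n.Type == html.ElementNode && tags[n.Data] {
                doomed = append(doomed, n)
                return
            }
            for c := n.FirstChild; c != nil; c = c.NextSibling {
                walk(c)
            }
        }
        walk(root)
        // Detach only after the walk, so the sibling list being
        // iterated is never mutated underneath us.
        for _, n := range doomed {
            if n.Parent != nil {
                n.Parent.RemoveChild(n)
            }
        }
    }

    func main() {
        root, _ := html.Parse(strings.NewReader("<p>hi<script>x()</script></p>"))
        removeAll(root, map[string]bool{"script": true, "style": true})
        html.Render(os.Stdout, root) // <html><head></head><body><p>hi</p></body></html>
    }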
@@ -97,50 +60,56 @@ func ExtractContent(page io.Reader) (string, error) {
 	}
 	//log.Printf("[Readability] TopCandidate: %v", topCandidate)
 
-	output := getArticle(root, best, scores)
+	output := getArticle(best, scores)
 	return output, nil
 }
 
 // Now that we have the top candidate, look through its siblings for content that might also be related.
 // Things like preambles, content split by ads that we removed, etc.
-func getArticle(root, best *html.Node, scores scorelist) string {
-	selection := goquery.NewDocumentFromNode(root).FindNodes(best)
-	topCandidate := &candidate{selection: selection, score: scores[best]}
+func getArticle(best *html.Node, scores nodeScores) string {
 	output := bytes.NewBufferString("<div>")
-	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+	siblingScoreThreshold := float32(math.Max(10, float64(scores[best]*.2)))
 
-	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
+	nodelist := make([]*html.Node, 0)
+	nodelist = append(nodelist, best)
+
+	// Get the candidate's siblings
+	for n := best.NextSibling; n != nil; n = n.NextSibling {
+		nodelist = append(nodelist, n)
+	}
+	for n := best.PrevSibling; n != nil; n = n.PrevSibling {
+		nodelist = append(nodelist, n)
+	}
+
+	for _, node := range nodelist {
 		append := false
-		node := s.Get(0)
+		isP := node.Data == "p"
 
-		if node == topCandidate.Node() {
+		if node == best {
 			append = true
-		} else if score, ok := scores[node]; ok && score >= siblingScoreThreshold {
+		} else if scores[node] >= siblingScoreThreshold {
 			append = true
-		}
-
-		if s.Is("p") {
-			linkDensity := getLinkDensity(s.Get(0))
-			content := s.Text()
-			contentLength := len(content)
-
-			if contentLength >= 80 && linkDensity < .25 {
-				append = true
-			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
-				append = true
+		} else {
+			if isP {
+				linkDensity := getLinkDensity(node)
+				content := htmlutil.Text(node)
+				contentLength := len(content)
+
+				if contentLength >= 80 && linkDensity < .25 {
+					append = true
+				} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
+					append = true
+				}
 			}
 		}
 
 		if append {
 			tag := "div"
-			if s.Is("p") {
-				tag = node.Data
+			if isP {
+				tag = "p"
 			}
-
-			html, _ := s.Html()
-			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
+			fmt.Fprintf(output, "<%s>%s</%s>", tag, htmlutil.InnerHTML(node), tag)
 		}
-	})
+	}
 
 	output.Write([]byte("</div>"))
 	return output.String()
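
Note: the rewritten getArticle gathers the candidate and its siblings by walking NextSibling and then PrevSibling directly, so preceding siblings are appended in reverse document order and qualifying content from before the candidate lands after it in the output. htmlutil.InnerHTML replaces the old s.Html() call; its implementation is not in this diff, but a helper honoring that contract is small (a hedged sketch, not yarr's actual code):

    package main

    import (
        "fmt"
        "strings"

        "golang.org/x/net/html"
    )

    // innerHTML renders a node's children, concatenated -- the contract
    // implied by the htmlutil.InnerHTML call in getArticle.
    func innerHTML(n *html.Node) string {
        var sb strings.Builder
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            html.Render(&sb, c) // render errors ignored in this sketch
        }
        return sb.String()
    }

    func main() {
        doc, _ := html.Parse(strings.NewReader("<p>a <b>b</b></p>"))
        p := doc.FirstChild.LastChild.FirstChild // html -> body -> p
        fmt.Println(innerHTML(p))                // a <b>b</b>
    }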
@@ -164,26 +133,26 @@ func removeUnlikelyCandidates(root *html.Node) {
 		}
 	}
 }
 
-func getTopCandidate(scores scorelist) *html.Node {
-	var best *html.Node
-	var maxScore float32
+func getTopCandidate(scores nodeScores) *html.Node {
+	var top *html.Node
+	var max float32
 
 	for node, score := range scores {
-		if score > maxScore {
-			best = node
-			maxScore = score
+		if score > max {
+			top = node
+			max = score
 		}
 	}
 
-	return best
+	return top
 }
 
 // Loop through all paragraphs, and assign a score to them based on how content-y they look.
 // Then add their score to their parent node.
 // A score is determined by things like number of commas, class names, etc.
 // Maybe eventually link density.
-func getCandidates(root *html.Node) scorelist {
-	scores := make(scorelist)
+func getCandidates(root *html.Node) nodeScores {
+	scores := make(nodeScores)
 	for _, node := range htmlutil.Query(root, defaultTagsToScore) {
 		text := htmlutil.Text(node)
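
Note: the scorelist-to-nodeScores rename is mechanical, but the zero-initialized max in getTopCandidate is easy to miss: only score > max updates the result, so a candidate set whose scores are all non-positive returns nil. A self-contained illustration (the nodes and scores are made up):

    package main

    import (
        "fmt"

        "golang.org/x/net/html"
    )

    type nodeScores map[*html.Node]float32

    // Same max-scan as the diff's getTopCandidate.
    func getTopCandidate(scores nodeScores) *html.Node {
        var top *html.Node
        var max float32
        for node, score := range scores {
            if score > max {
                top = node
                max = score
            }
        }
        return top
    }

    func main() {
        div := &html.Node{Type: html.ElementNode, Data: "div"}
        article := &html.Node{Type: html.ElementNode, Data: "article"}
        fmt.Println(getTopCandidate(nodeScores{div: 4, article: 27.5}).Data) // article
        fmt.Println(getTopCandidate(nodeScores{div: -1}) == nil)             // true
    }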
@@ -10,6 +10,7 @@ import (
 	"github.com/nkanaev/yarr/src/assets"
 	"github.com/nkanaev/yarr/src/auth"
 	"github.com/nkanaev/yarr/src/opml"
+	"github.com/nkanaev/yarr/src/reader"
 	"github.com/nkanaev/yarr/src/router"
 	"github.com/nkanaev/yarr/src/scraper"
 	"github.com/nkanaev/yarr/src/storage"
@@ -415,7 +416,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
 		return
 	}
 	defer res.Body.Close()
-	content, err := scraper.ExtractContent(res.Body)
+	content, err := reader.ExtractContent(res.Body)
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusNoContent)
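
Note: callers see only a package move: handlePageCrawl now reaches the extractor through reader.ExtractContent instead of scraper.ExtractContent, with the signature unchanged (io.Reader in, string and error out). A caller-side sketch; the URL and the bare log.Fatal handling are illustrative, not yarr's code:

    package main

    import (
        "fmt"
        "log"
        "net/http"

        "github.com/nkanaev/yarr/src/reader"
    )

    func main() {
        res, err := http.Get("https://example.com/article") // placeholder URL
        if err != nil {
            log.Fatal(err)
        }
        defer res.Body.Close()

        // New call site: reader.ExtractContent replaces scraper.ExtractContent.
        content, err := reader.ExtractContent(res.Body)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(content)
    }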