diff --git a/go.mod b/go.mod
index cbc46b2..c55fecf 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,6 @@ module github.com/nkanaev/yarr
 go 1.16
 
 require (
-	github.com/PuerkitoBio/goquery v1.5.1
 	github.com/mattn/go-sqlite3 v1.14.0
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
 	golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13
diff --git a/go.sum b/go.sum
index 02859e5..054e8f1 100644
--- a/go.sum
+++ b/go.sum
@@ -1,6 +1,4 @@
-github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
 github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
-github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
 github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
diff --git a/src/reader/readability.go b/src/reader/readability.go
index 8442f49..992a5e7 100644
--- a/src/reader/readability.go
+++ b/src/reader/readability.go
@@ -14,7 +14,6 @@ import (
 	"strings"
 
 	"github.com/nkanaev/yarr/src/htmlutil"
-	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
 
@@ -34,51 +33,15 @@
 	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
 )
 
-type candidate struct {
-	selection *goquery.Selection
-	score     float32
-}
-
-func (c *candidate) Node() *html.Node {
-	return c.selection.Get(0)
-}
-
-type scorelist map[*html.Node]float32
-
-func (c *candidate) String() string {
-	id, _ := c.selection.Attr("id")
-	class, _ := c.selection.Attr("class")
-
-	if id != "" && class != "" {
-		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
-	} else if id != "" {
-		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
-	} else if class != "" {
-		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
-	}
-
-	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
-}
-
-type candidateList map[*html.Node]*candidate
-
-func (c candidateList) String() string {
-	var output []string
-	for _, candidate := range c {
-		output = append(output, candidate.String())
-	}
-
-	return strings.Join(output, ", ")
-}
+type nodeScores map[*html.Node]float32
 
 // ExtractContent returns relevant content.
 func ExtractContent(page io.Reader) (string, error) {
-	document, err := goquery.NewDocumentFromReader(page)
+	root, err := html.Parse(page)
 	if err != nil {
 		return "", err
 	}
-	root := document.Get(0)
 
 	for _, trash := range htmlutil.Query(root, "script,style") {
 		if trash.Parent != nil {
 			trash.Parent.RemoveChild(trash)
@@ -97,50 +60,56 @@ func ExtractContent(page io.Reader) (string, error) {
 	}
 
 	//log.Printf("[Readability] TopCandidate: %v", topCandidate)
-	output := getArticle(root, best, scores)
+	output := getArticle(best, scores)
 	return output, nil
 }
 
 // Now that we have the top candidate, look through its siblings for content that might also be related.
 // Things like preambles, content split by ads that we removed, etc.
-func getArticle(root, best *html.Node, scores scorelist) string {
-	selection := goquery.NewDocumentFromNode(root).FindNodes(best)
-	topCandidate := &candidate{selection: selection, score: scores[best]}
+func getArticle(best *html.Node, scores nodeScores) string {
 	output := bytes.NewBufferString("<div>")
-	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+	siblingScoreThreshold := float32(math.Max(10, float64(scores[best]*.2)))
 
-	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
+	nodelist := make([]*html.Node, 0)
+	nodelist = append(nodelist, best)
+
+	// Get the candidate's siblings
+	for n := best.NextSibling; n != nil; n = n.NextSibling {
+		nodelist = append(nodelist, n)
+	}
+	for n := best.PrevSibling; n != nil; n = n.PrevSibling {
+		nodelist = append(nodelist, n)
+	}
+
+	for _, node := range nodelist {
 		append := false
-		node := s.Get(0)
+		isP := node.Data == "p"
 
-		if node == topCandidate.Node() {
+		if node == best {
 			append = true
-		} else if score, ok := scores[node]; ok && score >= siblingScoreThreshold {
+		} else if scores[node] >= siblingScoreThreshold {
 			append = true
-		}
+		} else {
+			if isP {
+				linkDensity := getLinkDensity(node)
+				content := htmlutil.Text(node)
+				contentLength := len(content)
 
-		if s.Is("p") {
-			linkDensity := getLinkDensity(s.Get(0))
-			content := s.Text()
-			contentLength := len(content)
-
-			if contentLength >= 80 && linkDensity < .25 {
-				append = true
-			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
-				append = true
+				if contentLength >= 80 && linkDensity < .25 {
+					append = true
+				} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
+					append = true
+				}
 			}
 		}
-
 		if append {
 			tag := "div"
-			if s.Is("p") {
-				tag = node.Data
+			if isP {
+				tag = "p"
 			}
-
-			html, _ := s.Html()
-			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
+			fmt.Fprintf(output, "<%s>%s</%s>", tag, htmlutil.InnerHTML(node), tag)
 		}
-	})
+	}
 	output.Write([]byte("</div>"))
 	return output.String()
 }
@@ -164,26 +133,26 @@ func removeUnlikelyCandidates(root *html.Node) {
 	}
 }
 
-func getTopCandidate(scores scorelist) *html.Node {
-	var best *html.Node
-	var maxScore float32
+func getTopCandidate(scores nodeScores) *html.Node {
+	var top *html.Node
+	var max float32
 
 	for node, score := range scores {
-		if score > maxScore {
-			best = node
-			maxScore = score
+		if score > max {
+			top = node
+			max = score
 		}
 	}
 
-	return best
+	return top
 }
 
 // Loop through all paragraphs, and assign a score to them based on how content-y they look.
 // Then add their score to their parent node.
 // A score is determined by things like number of commas, class names, etc.
 // Maybe eventually link density.
-func getCandidates(root *html.Node) scorelist {
-	scores := make(scorelist)
+func getCandidates(root *html.Node) nodeScores {
+	scores := make(nodeScores)
 
 	for _, node := range htmlutil.Query(root, defaultTagsToScore) {
 		text := htmlutil.Text(node)
diff --git a/src/server/routes.go b/src/server/routes.go
index b27b9b2..376c0d0 100644
--- a/src/server/routes.go
+++ b/src/server/routes.go
@@ -10,6 +10,7 @@ import (
 	"github.com/nkanaev/yarr/src/assets"
 	"github.com/nkanaev/yarr/src/auth"
 	"github.com/nkanaev/yarr/src/opml"
+	"github.com/nkanaev/yarr/src/reader"
 	"github.com/nkanaev/yarr/src/router"
 	"github.com/nkanaev/yarr/src/scraper"
 	"github.com/nkanaev/yarr/src/storage"
@@ -415,7 +416,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
 		return
 	}
 	defer res.Body.Close()
-	content, err := scraper.ExtractContent(res.Body)
+	content, err := reader.ExtractContent(res.Body)
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusNoContent)