Mirror of https://github.com/nkanaev/yarr.git
finally getting rid of goquery in readability
commit 401668e413 (parent 37ddde1765)
go.mod (1 changed line)
@@ -3,7 +3,6 @@ module github.com/nkanaev/yarr
 go 1.16
 
 require (
-	github.com/PuerkitoBio/goquery v1.5.1
 	github.com/mattn/go-sqlite3 v1.14.0
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
 	golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13
go.sum (2 changed lines)
@@ -1,6 +1,4 @@
-github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
-github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
 github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
 github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
@@ -14,7 +14,6 @@ import (
 	"strings"
 
 	"github.com/nkanaev/yarr/src/htmlutil"
-	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
 
@@ -34,51 +33,15 @@ var (
 	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
 )
 
-type candidate struct {
-	selection *goquery.Selection
-	score     float32
-}
-
-func (c *candidate) Node() *html.Node {
-	return c.selection.Get(0)
-}
-
-type scorelist map[*html.Node]float32
-
-func (c *candidate) String() string {
-	id, _ := c.selection.Attr("id")
-	class, _ := c.selection.Attr("class")
-
-	if id != "" && class != "" {
-		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
-	} else if id != "" {
-		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
-	} else if class != "" {
-		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
-	}
-
-	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
-}
-
-type candidateList map[*html.Node]*candidate
-
-func (c candidateList) String() string {
-	var output []string
-	for _, candidate := range c {
-		output = append(output, candidate.String())
-	}
-
-	return strings.Join(output, ", ")
-}
+type nodeScores map[*html.Node]float32
 
 // ExtractContent returns relevant content.
 func ExtractContent(page io.Reader) (string, error) {
-	document, err := goquery.NewDocumentFromReader(page)
+	root, err := html.Parse(page)
 	if err != nil {
 		return "", err
 	}
 
-	root := document.Get(0)
 	for _, trash := range htmlutil.Query(root, "script,style") {
 		if trash.Parent != nil {
 			trash.Parent.RemoveChild(trash)
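
For reference, a minimal standalone sketch of what the new parsing path in ExtractContent boils down to with plain golang.org/x/net/html: parse the page into a node tree, then drop every script and style element before scoring. The stripTags helper and the sample page are illustrative stand-ins for yarr's htmlutil.Query call, not code from this commit.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// stripTags walks the tree and removes every element whose tag name is in tags.
// Illustrative stand-in for the htmlutil.Query("script,style") + RemoveChild step.
func stripTags(n *html.Node, tags map[string]bool) {
	for c := n.FirstChild; c != nil; {
		next := c.NextSibling
		if c.Type == html.ElementNode && tags[c.Data] {
			n.RemoveChild(c)
		} else {
			stripTags(c, tags)
		}
		c = next
	}
}

func main() {
	page := `<html><body><script>alert(1)</script><p>hello</p></body></html>`
	root, err := html.Parse(strings.NewReader(page))
	if err != nil {
		panic(err)
	}
	stripTags(root, map[string]bool{"script": true, "style": true})

	var buf strings.Builder
	html.Render(&buf, root)
	fmt.Println(buf.String()) // <html><head></head><body><p>hello</p></body></html>
}
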
@@ -97,31 +60,39 @@ func ExtractContent(page io.Reader) (string, error) {
 	}
 	//log.Printf("[Readability] TopCandidate: %v", topCandidate)
 
-	output := getArticle(root, best, scores)
+	output := getArticle(best, scores)
 	return output, nil
 }
 
 // Now that we have the top candidate, look through its siblings for content that might also be related.
 // Things like preambles, content split by ads that we removed, etc.
-func getArticle(root, best *html.Node, scores scorelist) string {
-	selection := goquery.NewDocumentFromNode(root).FindNodes(best)
-	topCandidate := &candidate{selection: selection, score: scores[best]}
+func getArticle(best *html.Node, scores nodeScores) string {
 	output := bytes.NewBufferString("<div>")
-	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+	siblingScoreThreshold := float32(math.Max(10, float64(scores[best]*.2)))
 
-	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
-		append := false
-		node := s.Get(0)
+	nodelist := make([]*html.Node, 0)
+	nodelist = append(nodelist, best)
 
-		if node == topCandidate.Node() {
-			append = true
-		} else if score, ok := scores[node]; ok && score >= siblingScoreThreshold {
-			append = true
+	// Get the candidate's siblings
+	for n := best.NextSibling; n != nil; n = n.NextSibling {
+		nodelist = append(nodelist, n)
+	}
+	for n := best.PrevSibling; n != nil; n = n.PrevSibling {
+		nodelist = append(nodelist, n)
+	}
 
-		if s.Is("p") {
-			linkDensity := getLinkDensity(s.Get(0))
-			content := s.Text()
+	for _, node := range nodelist {
+		append := false
+		isP := node.Data == "p"
+
+		if node == best {
+			append = true
+		} else if scores[node] >= siblingScoreThreshold {
+			append = true
+		} else {
+			if isP {
+				linkDensity := getLinkDensity(node)
+				content := htmlutil.Text(node)
 				contentLength := len(content)
 
 				if contentLength >= 80 && linkDensity < .25 {
@@ -130,17 +101,15 @@ func getArticle(root, best *html.Node, scores scorelist) string {
 					append = true
 				}
 			}
+		}
 
 		if append {
 			tag := "div"
-			if s.Is("p") {
-				tag = node.Data
+			if isP {
+				tag = "p"
 			}
-
-			html, _ := s.Html()
-			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
+			fmt.Fprintf(output, "<%s>%s</%s>", tag, htmlutil.InnerHTML(node), tag)
 		}
-	})
+	}
 
 	output.Write([]byte("</div>"))
 	return output.String()
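
The comment above ("look through its siblings for content that might also be related") is what the goquery-free version implements by walking NextSibling/PrevSibling directly. Below is a condensed sketch of that selection step; textOf is a simplified stand-in for htmlutil.Text, the link-density check from the real code is omitted for brevity, and only the thresholds visible in the diff (10, 20% of the best score, 80 characters) are carried over.

package sketch

import (
	"math"
	"strings"

	"golang.org/x/net/html"
)

// keepNodes gathers the best candidate and its siblings, then keeps the ones
// that score above the sibling threshold or look like substantial paragraphs.
// Simplified: the real getArticle also checks link density and renders HTML.
func keepNodes(best *html.Node, scores map[*html.Node]float32) []*html.Node {
	threshold := float32(math.Max(10, float64(scores[best]*.2)))

	nodelist := []*html.Node{best}
	for n := best.NextSibling; n != nil; n = n.NextSibling {
		nodelist = append(nodelist, n)
	}
	for n := best.PrevSibling; n != nil; n = n.PrevSibling {
		nodelist = append(nodelist, n)
	}

	var kept []*html.Node
	for _, node := range nodelist {
		switch {
		case node == best:
			kept = append(kept, node)
		case scores[node] >= threshold:
			kept = append(kept, node)
		case node.Data == "p" && len(textOf(node)) >= 80:
			kept = append(kept, node)
		}
	}
	return kept
}

// textOf concatenates the text nodes under n (stand-in for htmlutil.Text).
func textOf(n *html.Node) string {
	var sb strings.Builder
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.TextNode {
			sb.WriteString(c.Data)
		}
		sb.WriteString(textOf(c))
	}
	return sb.String()
}
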
@@ -164,26 +133,26 @@ func removeUnlikelyCandidates(root *html.Node) {
 	}
 }
 
-func getTopCandidate(scores scorelist) *html.Node {
-	var best *html.Node
-	var maxScore float32
+func getTopCandidate(scores nodeScores) *html.Node {
+	var top *html.Node
+	var max float32
 
 	for node, score := range scores {
-		if score > maxScore {
-			best = node
-			maxScore = score
+		if score > max {
+			top = node
+			max = score
 		}
 	}
 
-	return best
+	return top
 }
 
 // Loop through all paragraphs, and assign a score to them based on how content-y they look.
 // Then add their score to their parent node.
 // A score is determined by things like number of commas, class names, etc.
 // Maybe eventually link density.
-func getCandidates(root *html.Node) scorelist {
-	scores := make(scorelist)
+func getCandidates(root *html.Node) nodeScores {
+	scores := make(nodeScores)
 	for _, node := range htmlutil.Query(root, defaultTagsToScore) {
 		text := htmlutil.Text(node)
 
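
The scoring loop continues past this hunk, but the comments describe the heuristic: a paragraph earns points for how "content-y" it looks and the points roll up to its parent. A rough illustration of that idea follows; the weights are illustrative guesses, not yarr's actual numbers.

package sketch

import (
	"math"
	"strings"

	"golang.org/x/net/html"
)

// scoreNode credits a paragraph's score to its parent (and half to its
// grandparent): one base point, one point per comma, and up to three points
// for length. Weights are illustrative only, not taken from this commit.
func scoreNode(node *html.Node, text string, scores map[*html.Node]float32) {
	score := float32(1)
	score += float32(strings.Count(text, ","))
	score += float32(math.Min(float64(len(text))/100, 3))

	if node.Parent != nil {
		scores[node.Parent] += score
		if node.Parent.Parent != nil {
			scores[node.Parent.Parent] += score / 2
		}
	}
}
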
@@ -10,6 +10,7 @@ import (
 	"github.com/nkanaev/yarr/src/assets"
 	"github.com/nkanaev/yarr/src/auth"
 	"github.com/nkanaev/yarr/src/opml"
+	"github.com/nkanaev/yarr/src/reader"
 	"github.com/nkanaev/yarr/src/router"
 	"github.com/nkanaev/yarr/src/scraper"
 	"github.com/nkanaev/yarr/src/storage"
@@ -415,7 +416,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
 		return
 	}
 	defer res.Body.Close()
-	content, err := scraper.ExtractContent(res.Body)
+	content, err := reader.ExtractContent(res.Body)
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusNoContent)
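
At the call site the change is mechanical: the handler still reads the fetched body and hands it to an ExtractContent(io.Reader) (string, error) function, only now from the reader package. A minimal standalone version of that flow is below; the URL and the fatal error handling are illustrative, since the real handler takes the URL from the request and answers 204 on failure.

package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/nkanaev/yarr/src/reader"
)

func main() {
	// Illustrative URL; handlePageCrawl gets it from the incoming request.
	res, err := http.Get("https://example.com/article")
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()

	// Same call the handler makes after this commit (reader instead of scraper).
	content, err := reader.ExtractContent(res.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(content)
}
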