Repository: https://github.com/nkanaev/yarr.git
Commit 401668e413 (parent 37ddde1765)

    finally getting rid of goquery in readability
go.mod (1 deletion)

@@ -3,7 +3,6 @@ module github.com/nkanaev/yarr
 
 go 1.16
 
 require (
-	github.com/PuerkitoBio/goquery v1.5.1
 	github.com/mattn/go-sqlite3 v1.14.0
 	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
 	golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13
go.sum (2 deletions)

@@ -1,6 +1,4 @@
-github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
 github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
-github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
 github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
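
Note: dropping goquery also retires its selector engine, andybalholm/cascadia. In go.sum only the h1: content hashes disappear; the leftover /go.mod hash lines are harmless to the go tool, and a later go mod tidy can drop them. The replacement parsing entry point is golang.org/x/net/html. A minimal self-contained sketch of the API swap (the input string is illustrative):

    package main

    import (
        "fmt"
        "log"
        "strings"

        "golang.org/x/net/html"
    )

    func main() {
        // html.Parse returns the document root as a bare *html.Node --
        // the same node that goquery.NewDocumentFromReader(page).Get(0)
        // used to wrap.
        root, err := html.Parse(strings.NewReader("<p>hello</p>"))
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(root.Type == html.DocumentNode) // true
    }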
@@ -14,7 +14,6 @@ import (
 	"strings"
 
 	"github.com/nkanaev/yarr/src/htmlutil"
-	"github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 )
 
@@ -34,51 +33,15 @@ var (
 	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
 )
 
-type candidate struct {
-	selection *goquery.Selection
-	score     float32
-}
-
-func (c *candidate) Node() *html.Node {
-	return c.selection.Get(0)
-}
-
-type scorelist map[*html.Node]float32
-
-func (c *candidate) String() string {
-	id, _ := c.selection.Attr("id")
-	class, _ := c.selection.Attr("class")
-
-	if id != "" && class != "" {
-		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
-	} else if id != "" {
-		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
-	} else if class != "" {
-		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
-	}
-
-	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
-}
-
-type candidateList map[*html.Node]*candidate
-
-func (c candidateList) String() string {
-	var output []string
-	for _, candidate := range c {
-		output = append(output, candidate.String())
-	}
-
-	return strings.Join(output, ", ")
-}
+type nodeScores map[*html.Node]float32
 
 // ExtractContent returns relevant content.
 func ExtractContent(page io.Reader) (string, error) {
-	document, err := goquery.NewDocumentFromReader(page)
+	root, err := html.Parse(page)
 	if err != nil {
 		return "", err
 	}
 
-	root := document.Get(0)
 	for _, trash := range htmlutil.Query(root, "script,style") {
 		if trash.Parent != nil {
 			trash.Parent.RemoveChild(trash)
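
Note: with goquery gone, tree surgery happens directly on *html.Node. htmlutil.Query above is yarr's own helper and its body is not part of this commit; the sketch below substitutes a hypothetical matcher to show the same detach pattern, including why the trash.Parent != nil guard and the collect-then-remove order matter (removing while walking would skip siblings):

    package main

    import (
        "os"
        "strings"

        "golang.org/x/net/html"
    )

    // removeAll detaches every element whose tag is in the set -- a
    // stand-in for htmlutil.Query(root, "script,style") plus removal.
    func removeAll(root *html.Node, tags map[string]bool) {
        var doomed []*html.Node
        var walk func(n *html.Node)
        walk = func(n *html.Node) {
            if n.Type == html.ElementNode && tags[n.Data] {
                doomed = append(doomed, n)
                return
            }
            for c := n.FirstChild; c != nil; c = c.NextSibling {
                walk(c)
            }
        }
        walk(root)
        // Detach only after the walk, so the sibling list being
        // iterated is never mutated underneath us.
        for _, n := range doomed {
            if n.Parent != nil {
                n.Parent.RemoveChild(n)
            }
        }
    }

    func main() {
        root, _ := html.Parse(strings.NewReader("<p>hi<script>x()</script></p>"))
        removeAll(root, map[string]bool{"script": true, "style": true})
        html.Render(os.Stdout, root) // <html><head></head><body><p>hi</p></body></html>
    }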
@@ -97,50 +60,56 @@ func ExtractContent(page io.Reader) (string, error) {
 	}
 	//log.Printf("[Readability] TopCandidate: %v", topCandidate)
 
-	output := getArticle(root, best, scores)
+	output := getArticle(best, scores)
 	return output, nil
 }
 
 // Now that we have the top candidate, look through its siblings for content that might also be related.
 // Things like preambles, content split by ads that we removed, etc.
-func getArticle(root, best *html.Node, scores scorelist) string {
-	selection := goquery.NewDocumentFromNode(root).FindNodes(best)
-	topCandidate := &candidate{selection: selection, score: scores[best]}
+func getArticle(best *html.Node, scores nodeScores) string {
 	output := bytes.NewBufferString("<div>")
-	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+	siblingScoreThreshold := float32(math.Max(10, float64(scores[best]*.2)))
 
-	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
+	nodelist := make([]*html.Node, 0)
+	nodelist = append(nodelist, best)
+
+	// Get the candidate's siblings
+	for n := best.NextSibling; n != nil; n = n.NextSibling {
+		nodelist = append(nodelist, n)
+	}
+	for n := best.PrevSibling; n != nil; n = n.PrevSibling {
+		nodelist = append(nodelist, n)
+	}
+
+	for _, node := range nodelist {
 		append := false
-		node := s.Get(0)
+		isP := node.Data == "p"
 
-		if node == topCandidate.Node() {
+		if node == best {
 			append = true
-		} else if score, ok := scores[node]; ok && score >= siblingScoreThreshold {
+		} else if scores[node] >= siblingScoreThreshold {
 			append = true
-		}
-
-		if s.Is("p") {
-			linkDensity := getLinkDensity(s.Get(0))
-			content := s.Text()
-			contentLength := len(content)
-
-			if contentLength >= 80 && linkDensity < .25 {
-				append = true
-			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
-				append = true
+		} else {
+			if isP {
+				linkDensity := getLinkDensity(node)
+				content := htmlutil.Text(node)
+				contentLength := len(content)
+
+				if contentLength >= 80 && linkDensity < .25 {
+					append = true
+				} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
+					append = true
+				}
 			}
 		}
 
 		if append {
 			tag := "div"
-			if s.Is("p") {
-				tag = node.Data
+			if isP {
+				tag = "p"
 			}
-
-			html, _ := s.Html()
-			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
+			fmt.Fprintf(output, "<%s>%s</%s>", tag, htmlutil.InnerHTML(node), tag)
 		}
-	})
+	}
 
 	output.Write([]byte("</div>"))
 	return output.String()
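
Note: the rewritten getArticle gathers the candidate and its siblings by walking NextSibling and then PrevSibling directly, so preceding siblings are appended in reverse document order and qualifying content from before the candidate lands after it in the output. htmlutil.InnerHTML replaces the old s.Html() call; its implementation is not in this diff, but a helper honoring that contract is small (a hedged sketch, not yarr's actual code):

    package main

    import (
        "fmt"
        "strings"

        "golang.org/x/net/html"
    )

    // innerHTML renders a node's children, concatenated -- the contract
    // implied by the htmlutil.InnerHTML call in getArticle.
    func innerHTML(n *html.Node) string {
        var sb strings.Builder
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            html.Render(&sb, c) // render errors ignored in this sketch
        }
        return sb.String()
    }

    func main() {
        doc, _ := html.Parse(strings.NewReader("<p>a <b>b</b></p>"))
        p := doc.FirstChild.LastChild.FirstChild // html -> body -> p
        fmt.Println(innerHTML(p))                // a <b>b</b>
    }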
@@ -164,26 +133,26 @@ func removeUnlikelyCandidates(root *html.Node) {
 		}
 	}
 }
 
-func getTopCandidate(scores scorelist) *html.Node {
-	var best *html.Node
-	var maxScore float32
+func getTopCandidate(scores nodeScores) *html.Node {
+	var top *html.Node
+	var max float32
 
 	for node, score := range scores {
-		if score > maxScore {
-			best = node
-			maxScore = score
+		if score > max {
+			top = node
+			max = score
 		}
 	}
 
-	return best
+	return top
 }
 
 // Loop through all paragraphs, and assign a score to them based on how content-y they look.
 // Then add their score to their parent node.
 // A score is determined by things like number of commas, class names, etc.
 // Maybe eventually link density.
-func getCandidates(root *html.Node) scorelist {
-	scores := make(scorelist)
+func getCandidates(root *html.Node) nodeScores {
+	scores := make(nodeScores)
 	for _, node := range htmlutil.Query(root, defaultTagsToScore) {
 		text := htmlutil.Text(node)
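
Note: the scorelist-to-nodeScores rename is mechanical, but the zero-initialized max in getTopCandidate is easy to miss: only score > max updates the result, so a candidate set whose scores are all non-positive returns nil. A self-contained illustration (the nodes and scores are made up):

    package main

    import (
        "fmt"

        "golang.org/x/net/html"
    )

    type nodeScores map[*html.Node]float32

    // Same max-scan as the diff's getTopCandidate.
    func getTopCandidate(scores nodeScores) *html.Node {
        var top *html.Node
        var max float32
        for node, score := range scores {
            if score > max {
                top = node
                max = score
            }
        }
        return top
    }

    func main() {
        div := &html.Node{Type: html.ElementNode, Data: "div"}
        article := &html.Node{Type: html.ElementNode, Data: "article"}
        fmt.Println(getTopCandidate(nodeScores{div: 4, article: 27.5}).Data) // article
        fmt.Println(getTopCandidate(nodeScores{div: -1}) == nil)             // true
    }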
@@ -10,6 +10,7 @@ import (
 	"github.com/nkanaev/yarr/src/assets"
 	"github.com/nkanaev/yarr/src/auth"
 	"github.com/nkanaev/yarr/src/opml"
+	"github.com/nkanaev/yarr/src/reader"
 	"github.com/nkanaev/yarr/src/router"
 	"github.com/nkanaev/yarr/src/scraper"
 	"github.com/nkanaev/yarr/src/storage"
@@ -415,7 +416,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
 		return
 	}
 	defer res.Body.Close()
-	content, err := scraper.ExtractContent(res.Body)
+	content, err := reader.ExtractContent(res.Body)
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusNoContent)
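
Note: callers see only a package move: handlePageCrawl now reaches the extractor through reader.ExtractContent instead of scraper.ExtractContent, with the signature unchanged (io.Reader in, string and error out). A caller-side sketch; the URL and the bare log.Fatal handling are illustrative, not yarr's code:

    package main

    import (
        "fmt"
        "log"
        "net/http"

        "github.com/nkanaev/yarr/src/reader"
    )

    func main() {
        res, err := http.Get("https://example.com/article") // placeholder URL
        if err != nil {
            log.Fatal(err)
        }
        defer res.Body.Close()

        // New call site: reader.ExtractContent replaces scraper.ExtractContent.
        content, err := reader.ExtractContent(res.Body)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(content)
    }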