still rewriting readability

This commit is contained in:
Nazar Kanaev 2021-03-30 15:41:24 +01:00
parent 82586dedff
commit 37ddde1765

View File

@ -43,6 +43,8 @@ func (c *candidate) Node() *html.Node {
return c.selection.Get(0) return c.selection.Get(0)
} }
type scorelist map[*html.Node]float32
func (c *candidate) String() string { func (c *candidate) String() string {
id, _ := c.selection.Attr("id") id, _ := c.selection.Attr("id")
class, _ := c.selection.Attr("class") class, _ := c.selection.Attr("class")
@ -86,19 +88,24 @@ func ExtractContent(page io.Reader) (string, error) {
transformMisusedDivsIntoParagraphs(root) transformMisusedDivsIntoParagraphs(root)
removeUnlikelyCandidates(root) removeUnlikelyCandidates(root)
candidates := getCandidates(document) scores := getCandidates(root)
//log.Printf("[Readability] Candidates: %v", candidates) //log.Printf("[Readability] Candidates: %v", candidates)
topCandidate := getTopCandidate(document, candidates) best := getTopCandidate(scores)
if best == nil {
best = root
}
//log.Printf("[Readability] TopCandidate: %v", topCandidate) //log.Printf("[Readability] TopCandidate: %v", topCandidate)
output := getArticle(topCandidate, candidates) output := getArticle(root, best, scores)
return output, nil return output, nil
} }
// Now that we have the top candidate, look through its siblings for content that might also be related. // Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc. // Things like preambles, content split by ads that we removed, etc.
func getArticle(topCandidate *candidate, candidates candidateList) string { func getArticle(root, best *html.Node, scores scorelist) string {
selection := goquery.NewDocumentFromNode(root).FindNodes(best)
topCandidate := &candidate{selection: selection, score: scores[best]}
output := bytes.NewBufferString("<div>") output := bytes.NewBufferString("<div>")
siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2))) siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
@ -108,12 +115,12 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
if node == topCandidate.Node() { if node == topCandidate.Node() {
append = true append = true
} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold { } else if score, ok := scores[node]; ok && score >= siblingScoreThreshold {
append = true append = true
} }
if s.Is("p") { if s.Is("p") {
linkDensity := getLinkDensity(s) linkDensity := getLinkDensity(s.Get(0))
content := s.Text() content := s.Text()
contentLength := len(content) contentLength := len(content)
@ -157,21 +164,17 @@ func removeUnlikelyCandidates(root *html.Node) {
} }
} }
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate { func getTopCandidate(scores scorelist) *html.Node {
var best *candidate var best *html.Node
var maxScore float32
for _, c := range candidates { for node, score := range scores {
if best == nil { if score > maxScore {
best = c best = node
} else if best.score < c.score { maxScore = score
best = c
} }
} }
if best == nil {
best = &candidate{document.Find("body"), 0}
}
return best return best
} }
@ -179,33 +182,26 @@ func getTopCandidate(document *goquery.Document, candidates candidateList) *cand
// Then add their score to their parent node. // Then add their score to their parent node.
// A score is determined by things like number of commas, class names, etc. // A score is determined by things like number of commas, class names, etc.
// Maybe eventually link density. // Maybe eventually link density.
func getCandidates(document *goquery.Document) candidateList { func getCandidates(root *html.Node) scorelist {
candidates := make(candidateList) scores := make(scorelist)
for _, node := range htmlutil.Query(root, defaultTagsToScore) {
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) { text := htmlutil.Text(node)
text := s.Text()
// If this paragraph is less than 25 characters, don't even count it. // If this paragraph is less than 25 characters, don't even count it.
if len(text) < 25 { if len(text) < 25 {
return continue
} }
parent := s.Parent() parentNode := node.Parent
parentNode := parent.Get(0) grandParentNode := parentNode.Parent
grandParent := parent.Parent() if _, found := scores[parentNode]; !found {
var grandParentNode *html.Node scores[parentNode] = scoreNode(parentNode)
if grandParent.Length() > 0 {
grandParentNode = grandParent.Get(0)
}
if _, found := candidates[parentNode]; !found {
candidates[parentNode] = scoreNode(parent)
} }
if grandParentNode != nil { if grandParentNode != nil {
if _, found := candidates[grandParentNode]; !found { if _, found := scores[grandParentNode]; !found {
candidates[grandParentNode] = scoreNode(grandParent) scores[grandParentNode] = scoreNode(grandParentNode)
} }
} }
@ -218,50 +214,52 @@ func getCandidates(document *goquery.Document) candidateList {
// For every 100 characters in this paragraph, add another point. Up to 3 points. // For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3)) contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))
candidates[parentNode].score += contentScore scores[parentNode] += contentScore
if grandParentNode != nil { if grandParentNode != nil {
candidates[grandParentNode].score += contentScore / 2.0 scores[grandParentNode] += contentScore / 2.0
} }
}) }
// Scale the final candidates score based on link density. Good content // Scale the final candidates score based on link density. Good content
// should have a relatively small link density (5% or less) and be mostly // should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation // unaffected by this operation
for _, candidate := range candidates { for node, _ := range scores {
candidate.score = candidate.score * (1 - getLinkDensity(candidate.selection)) scores[node] *= (1 - getLinkDensity(node))
} }
return candidates return scores
} }
func scoreNode(s *goquery.Selection) *candidate { func scoreNode(node *html.Node) float32 {
c := &candidate{selection: s, score: 0} var score float32
switch s.Get(0).DataAtom.String() { switch node.Data {
case "div": case "div":
c.score += 5 score += 5
case "pre", "td", "blockquote", "img": case "pre", "td", "blockquote", "img":
c.score += 3 score += 3
case "address", "ol", "ul", "dl", "dd", "dt", "li", "form": case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
c.score -= 3 score -= 3
case "h1", "h2", "h3", "h4", "h5", "h6", "th": case "h1", "h2", "h3", "h4", "h5", "h6", "th":
c.score -= 5 score -= 5
} }
c.score += getClassWeight(s.Get(0)) return score + getClassWeight(node)
return c
} }
// Get the density of links as a percentage of the content // Get the density of links as a percentage of the content
// This is the amount of text that is inside a link divided by the total text in the node. // This is the amount of text that is inside a link divided by the total text in the node.
func getLinkDensity(s *goquery.Selection) float32 { func getLinkDensity(n *html.Node) float32 {
linkLength := len(s.Find("a").Text()) textLength := len(htmlutil.Text(n))
textLength := len(s.Text())
if textLength == 0 { if textLength == 0 {
return 0 return 0
} }
linkLength := 0.0
for _, a := range htmlutil.Query(n, "a") {
linkLength += float64(len(htmlutil.Text(a)))
}
return float32(linkLength) / float32(textLength) return float32(linkLength) / float32(textLength)
} }