mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
still rewriting readability
This commit is contained in:
parent
82586dedff
commit
37ddde1765
@ -43,6 +43,8 @@ func (c *candidate) Node() *html.Node {
|
|||||||
return c.selection.Get(0)
|
return c.selection.Get(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type scorelist map[*html.Node]float32
|
||||||
|
|
||||||
func (c *candidate) String() string {
|
func (c *candidate) String() string {
|
||||||
id, _ := c.selection.Attr("id")
|
id, _ := c.selection.Attr("id")
|
||||||
class, _ := c.selection.Attr("class")
|
class, _ := c.selection.Attr("class")
|
||||||
@ -86,19 +88,24 @@ func ExtractContent(page io.Reader) (string, error) {
|
|||||||
transformMisusedDivsIntoParagraphs(root)
|
transformMisusedDivsIntoParagraphs(root)
|
||||||
removeUnlikelyCandidates(root)
|
removeUnlikelyCandidates(root)
|
||||||
|
|
||||||
candidates := getCandidates(document)
|
scores := getCandidates(root)
|
||||||
//log.Printf("[Readability] Candidates: %v", candidates)
|
//log.Printf("[Readability] Candidates: %v", candidates)
|
||||||
|
|
||||||
topCandidate := getTopCandidate(document, candidates)
|
best := getTopCandidate(scores)
|
||||||
|
if best == nil {
|
||||||
|
best = root
|
||||||
|
}
|
||||||
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
|
//log.Printf("[Readability] TopCandidate: %v", topCandidate)
|
||||||
|
|
||||||
output := getArticle(topCandidate, candidates)
|
output := getArticle(root, best, scores)
|
||||||
return output, nil
|
return output, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
||||||
// Things like preambles, content split by ads that we removed, etc.
|
// Things like preambles, content split by ads that we removed, etc.
|
||||||
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
func getArticle(root, best *html.Node, scores scorelist) string {
|
||||||
|
selection := goquery.NewDocumentFromNode(root).FindNodes(best)
|
||||||
|
topCandidate := &candidate{selection: selection, score: scores[best]}
|
||||||
output := bytes.NewBufferString("<div>")
|
output := bytes.NewBufferString("<div>")
|
||||||
siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
|
siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
|
||||||
|
|
||||||
@ -108,12 +115,12 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|||||||
|
|
||||||
if node == topCandidate.Node() {
|
if node == topCandidate.Node() {
|
||||||
append = true
|
append = true
|
||||||
} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
|
} else if score, ok := scores[node]; ok && score >= siblingScoreThreshold {
|
||||||
append = true
|
append = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.Is("p") {
|
if s.Is("p") {
|
||||||
linkDensity := getLinkDensity(s)
|
linkDensity := getLinkDensity(s.Get(0))
|
||||||
content := s.Text()
|
content := s.Text()
|
||||||
contentLength := len(content)
|
contentLength := len(content)
|
||||||
|
|
||||||
@ -157,21 +164,17 @@ func removeUnlikelyCandidates(root *html.Node) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
|
func getTopCandidate(scores scorelist) *html.Node {
|
||||||
var best *candidate
|
var best *html.Node
|
||||||
|
var maxScore float32
|
||||||
|
|
||||||
for _, c := range candidates {
|
for node, score := range scores {
|
||||||
if best == nil {
|
if score > maxScore {
|
||||||
best = c
|
best = node
|
||||||
} else if best.score < c.score {
|
maxScore = score
|
||||||
best = c
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if best == nil {
|
|
||||||
best = &candidate{document.Find("body"), 0}
|
|
||||||
}
|
|
||||||
|
|
||||||
return best
|
return best
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,33 +182,26 @@ func getTopCandidate(document *goquery.Document, candidates candidateList) *cand
|
|||||||
// Then add their score to their parent node.
|
// Then add their score to their parent node.
|
||||||
// A score is determined by things like number of commas, class names, etc.
|
// A score is determined by things like number of commas, class names, etc.
|
||||||
// Maybe eventually link density.
|
// Maybe eventually link density.
|
||||||
func getCandidates(document *goquery.Document) candidateList {
|
func getCandidates(root *html.Node) scorelist {
|
||||||
candidates := make(candidateList)
|
scores := make(scorelist)
|
||||||
|
for _, node := range htmlutil.Query(root, defaultTagsToScore) {
|
||||||
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
|
text := htmlutil.Text(node)
|
||||||
text := s.Text()
|
|
||||||
|
|
||||||
// If this paragraph is less than 25 characters, don't even count it.
|
// If this paragraph is less than 25 characters, don't even count it.
|
||||||
if len(text) < 25 {
|
if len(text) < 25 {
|
||||||
return
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
parent := s.Parent()
|
parentNode := node.Parent
|
||||||
parentNode := parent.Get(0)
|
grandParentNode := parentNode.Parent
|
||||||
|
|
||||||
grandParent := parent.Parent()
|
if _, found := scores[parentNode]; !found {
|
||||||
var grandParentNode *html.Node
|
scores[parentNode] = scoreNode(parentNode)
|
||||||
if grandParent.Length() > 0 {
|
|
||||||
grandParentNode = grandParent.Get(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, found := candidates[parentNode]; !found {
|
|
||||||
candidates[parentNode] = scoreNode(parent)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if grandParentNode != nil {
|
if grandParentNode != nil {
|
||||||
if _, found := candidates[grandParentNode]; !found {
|
if _, found := scores[grandParentNode]; !found {
|
||||||
candidates[grandParentNode] = scoreNode(grandParent)
|
scores[grandParentNode] = scoreNode(grandParentNode)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -218,50 +214,52 @@ func getCandidates(document *goquery.Document) candidateList {
|
|||||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||||
contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))
|
contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))
|
||||||
|
|
||||||
candidates[parentNode].score += contentScore
|
scores[parentNode] += contentScore
|
||||||
if grandParentNode != nil {
|
if grandParentNode != nil {
|
||||||
candidates[grandParentNode].score += contentScore / 2.0
|
scores[grandParentNode] += contentScore / 2.0
|
||||||
}
|
}
|
||||||
})
|
}
|
||||||
|
|
||||||
// Scale the final candidates score based on link density. Good content
|
// Scale the final candidates score based on link density. Good content
|
||||||
// should have a relatively small link density (5% or less) and be mostly
|
// should have a relatively small link density (5% or less) and be mostly
|
||||||
// unaffected by this operation
|
// unaffected by this operation
|
||||||
for _, candidate := range candidates {
|
for node, _ := range scores {
|
||||||
candidate.score = candidate.score * (1 - getLinkDensity(candidate.selection))
|
scores[node] *= (1 - getLinkDensity(node))
|
||||||
}
|
}
|
||||||
|
|
||||||
return candidates
|
return scores
|
||||||
}
|
}
|
||||||
|
|
||||||
func scoreNode(s *goquery.Selection) *candidate {
|
func scoreNode(node *html.Node) float32 {
|
||||||
c := &candidate{selection: s, score: 0}
|
var score float32
|
||||||
|
|
||||||
switch s.Get(0).DataAtom.String() {
|
switch node.Data {
|
||||||
case "div":
|
case "div":
|
||||||
c.score += 5
|
score += 5
|
||||||
case "pre", "td", "blockquote", "img":
|
case "pre", "td", "blockquote", "img":
|
||||||
c.score += 3
|
score += 3
|
||||||
case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
|
case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
|
||||||
c.score -= 3
|
score -= 3
|
||||||
case "h1", "h2", "h3", "h4", "h5", "h6", "th":
|
case "h1", "h2", "h3", "h4", "h5", "h6", "th":
|
||||||
c.score -= 5
|
score -= 5
|
||||||
}
|
}
|
||||||
|
|
||||||
c.score += getClassWeight(s.Get(0))
|
return score + getClassWeight(node)
|
||||||
return c
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the density of links as a percentage of the content
|
// Get the density of links as a percentage of the content
|
||||||
// This is the amount of text that is inside a link divided by the total text in the node.
|
// This is the amount of text that is inside a link divided by the total text in the node.
|
||||||
func getLinkDensity(s *goquery.Selection) float32 {
|
func getLinkDensity(n *html.Node) float32 {
|
||||||
linkLength := len(s.Find("a").Text())
|
textLength := len(htmlutil.Text(n))
|
||||||
textLength := len(s.Text())
|
|
||||||
|
|
||||||
if textLength == 0 {
|
if textLength == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
linkLength := 0.0
|
||||||
|
for _, a := range htmlutil.Query(n, "a") {
|
||||||
|
linkLength += float64(len(htmlutil.Text(a)))
|
||||||
|
}
|
||||||
|
|
||||||
return float32(linkLength) / float32(textLength)
|
return float32(linkLength) / float32(textLength)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user