rewriting readability

This commit is contained in:
Nazar Kanaev 2021-03-30 12:22:59 +01:00
parent e5920259b6
commit c958ee9116
3 changed files with 31 additions and 22 deletions

View File

@ -7,7 +7,7 @@ import (
"golang.org/x/net/html" "golang.org/x/net/html"
) )
var nodeNameRegex = regexp.MustCompile(`\w+`) var nodeNameRegex = regexp.MustCompile(`\w+|\*`)
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node { func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
nodes := make([]*html.Node, 0) nodes := make([]*html.Node, 0)
@ -40,7 +40,7 @@ func NewMatcher(sel string) Matcher {
if nodeNameRegex.MatchString(part) { if nodeNameRegex.MatchString(part) {
multi.Add(ElementMatch{Name: part}) multi.Add(ElementMatch{Name: part})
} else { } else {
panic("unsupported selector") panic("unsupported selector: " + part)
} }
} }
return multi return multi
@ -55,7 +55,7 @@ type ElementMatch struct {
} }
func (m ElementMatch) Match(n *html.Node) bool { func (m ElementMatch) Match(n *html.Node) bool {
return n.Type == html.ElementNode && n.Data == m.Name return n.Type == html.ElementNode && (n.Data == m.Name || m.Name == "*")
} }
type MultiMatch struct { type MultiMatch struct {

View File

@ -12,6 +12,14 @@ func HTML(node *html.Node) string {
return writer.String() return writer.String()
} }
// InnerHTML renders every child of node, in sibling order, and returns the
// concatenated markup. The node itself is not rendered, only its children.
func InnerHTML(node *html.Node) string {
	var out strings.Builder
	child := node.FirstChild
	for child != nil {
		// Best effort: the error returned by html.Render is deliberately ignored.
		_ = html.Render(&out, child)
		child = child.NextSibling
	}
	return out.String()
}
func Attr(node *html.Node, key string) string { func Attr(node *html.Node, key string) string {
for _, a := range node.Attr { for _, a := range node.Attr {
if a.Key == key { if a.Key == key {

View File

@ -83,8 +83,8 @@ func ExtractContent(page io.Reader) (string, error) {
} }
} }
transformMisusedDivsIntoParagraphs(document) transformMisusedDivsIntoParagraphs(root)
removeUnlikelyCandidates(document) removeUnlikelyCandidates(root)
candidates := getCandidates(document) candidates := getCandidates(document)
//log.Printf("[Readability] Candidates: %v", candidates) //log.Printf("[Readability] Candidates: %v", candidates)
@ -139,19 +139,22 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
return output.String() return output.String()
} }
func removeUnlikelyCandidates(document *goquery.Document) { func removeUnlikelyCandidates(root *html.Node) {
document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) { body := htmlutil.Query(root, "body")
class, _ := s.Attr("class") if len(body) == 0 {
id, _ := s.Attr("id") return
str := class + id }
for _, node := range htmlutil.Query(body[0], "*") {
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) { blacklisted := (
node := s.Get(0) blacklistCandidatesRegexp.MatchString(str) ||
if node.Parent != nil { (unlikelyCandidatesRegexp.MatchString(str) &&
!okMaybeItsACandidateRegexp.MatchString(str)))
if blacklisted && node.Parent != nil {
node.Parent.RemoveChild(node) node.Parent.RemoveChild(node)
} }
} }
})
} }
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate { func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
@ -292,12 +295,10 @@ func getClassWeight(s *goquery.Selection) float32 {
return float32(weight) return float32(weight)
} }
func transformMisusedDivsIntoParagraphs(document *goquery.Document) { func transformMisusedDivsIntoParagraphs(root *html.Node) {
document.Find("div").Each(func(i int, s *goquery.Selection) { for _, node := range htmlutil.Query(root, "div") {
html, _ := s.Html() if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) {
if !divToPElementsRegexp.MatchString(html) {
node := s.Get(0)
node.Data = "p" node.Data = "p"
} }
}) }
} }