rewriting readability

This commit is contained in:
Nazar Kanaev 2021-03-30 12:22:59 +01:00
parent e5920259b6
commit c958ee9116
3 changed files with 31 additions and 22 deletions

View File

@ -7,7 +7,7 @@ import (
"golang.org/x/net/html"
)
// nodeNameRegex recognises the selector parts that NewMatcher turns into an
// ElementMatch: a run of word characters (a tag name) or the "*" wildcard.
//
// NOTE(review): the pattern is unanchored, so MatchString succeeds for any
// part that merely contains a word character or "*" — confirm that this is
// the intended strictness before relying on the panic in NewMatcher.
var nodeNameRegex = regexp.MustCompile(`\w+|\*`)
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
nodes := make([]*html.Node, 0)
@ -40,7 +40,7 @@ func NewMatcher(sel string) Matcher {
if nodeNameRegex.MatchString(part) {
multi.Add(ElementMatch{Name: part})
} else {
panic("unsupported selector")
panic("unsupported selector: " + part)
}
}
return multi
@ -55,7 +55,7 @@ type ElementMatch struct {
}
// Match reports whether n is an element node selected by m: either the
// node's tag name (n.Data) equals m.Name, or m.Name is the wildcard "*",
// which accepts every element node. Non-element nodes never match.
func (m ElementMatch) Match(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return m.Name == "*" || n.Data == m.Name
}
type MultiMatch struct {

View File

@ -12,6 +12,14 @@ func HTML(node *html.Node) string {
return writer.String()
}
// InnerHTML renders every child of node, in sibling order, and returns the
// concatenated markup. The tag of node itself is not part of the result.
func InnerHTML(node *html.Node) string {
	var out strings.Builder
	child := node.FirstChild
	for child != nil {
		// The render error is discarded: this helper has no error return.
		html.Render(&out, child)
		child = child.NextSibling
	}
	return out.String()
}
func Attr(node *html.Node, key string) string {
for _, a := range node.Attr {
if a.Key == key {

View File

@ -83,8 +83,8 @@ func ExtractContent(page io.Reader) (string, error) {
}
}
transformMisusedDivsIntoParagraphs(document)
removeUnlikelyCandidates(document)
transformMisusedDivsIntoParagraphs(root)
removeUnlikelyCandidates(root)
candidates := getCandidates(document)
//log.Printf("[Readability] Candidates: %v", candidates)
@ -139,19 +139,22 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
return output.String()
}
func removeUnlikelyCandidates(document *goquery.Document) {
document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
class, _ := s.Attr("class")
id, _ := s.Attr("id")
str := class + id
func removeUnlikelyCandidates(root *html.Node) {
body := htmlutil.Query(root, "body")
if len(body) == 0 {
return
}
for _, node := range htmlutil.Query(body[0], "*") {
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
node := s.Get(0)
if node.Parent != nil {
node.Parent.RemoveChild(node)
}
blacklisted := (
blacklistCandidatesRegexp.MatchString(str) ||
(unlikelyCandidatesRegexp.MatchString(str) &&
!okMaybeItsACandidateRegexp.MatchString(str)))
if blacklisted && node.Parent != nil {
node.Parent.RemoveChild(node)
}
})
}
}
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
@ -292,12 +295,10 @@ func getClassWeight(s *goquery.Selection) float32 {
return float32(weight)
}
func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
document.Find("div").Each(func(i int, s *goquery.Selection) {
html, _ := s.Html()
if !divToPElementsRegexp.MatchString(html) {
node := s.Get(0)
func transformMisusedDivsIntoParagraphs(root *html.Node) {
for _, node := range htmlutil.Query(root, "div") {
if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) {
node.Data = "p"
}
})
}
}