mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
rewriting readability
This commit is contained in:
parent
e5920259b6
commit
c958ee9116
@ -7,7 +7,7 @@ import (
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// nodeNameRegex matches a run of word characters.
// NOTE(review): this page is a rendered diff without +/- markers; this first
// declaration looks like the removed (pre-commit) side — confirm against the
// repository.
var nodeNameRegex = regexp.MustCompile(`\w+`)
|
||||
// nodeNameRegex matches a run of word characters or a literal "*".
// NOTE(review): presumably the added (post-commit) side of the same hunk.
// The pattern is not anchored, so a MatchString call succeeds for any input
// that merely contains a match.
var nodeNameRegex = regexp.MustCompile(`\w+|\*`)
|
||||
|
||||
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
|
||||
nodes := make([]*html.Node, 0)
|
||||
@ -40,7 +40,7 @@ func NewMatcher(sel string) Matcher {
|
||||
if nodeNameRegex.MatchString(part) {
|
||||
multi.Add(ElementMatch{Name: part})
|
||||
} else {
|
||||
panic("unsupported selector")
|
||||
panic("unsupported selector: " + part)
|
||||
}
|
||||
}
|
||||
return multi
|
||||
@ -55,7 +55,7 @@ type ElementMatch struct {
|
||||
}
|
||||
|
||||
// Match reports whether n is an element node selected by m.Name.
// NOTE(review): this span holds two versions of the return statement from a
// rendered diff; the first looks like the removed line and the second like
// the added one — confirm against the repository.
func (m ElementMatch) Match(n *html.Node) bool {
|
||||
// Element nodes whose tag name (n.Data) equals m.Name exactly.
return n.Type == html.ElementNode && n.Data == m.Name
|
||||
// Same check, except that a Name of "*" accepts every element node.
return n.Type == html.ElementNode && (n.Data == m.Name || m.Name == "*")
|
||||
}
|
||||
|
||||
type MultiMatch struct {
|
||||
|
@ -12,6 +12,14 @@ func HTML(node *html.Node) string {
|
||||
return writer.String()
|
||||
}
|
||||
|
||||
// InnerHTML renders every child of node, in sibling order, into a single
// string and returns it. The node itself is not rendered, and a node without
// children yields the empty string. node must be non-nil.
func InnerHTML(node *html.Node) string {
|
||||
writer := strings.Builder{}
|
||||
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
||||
// The value returned by html.Render is not checked here.
html.Render(&writer, c)
|
||||
}
|
||||
return writer.String()
|
||||
}
|
||||
|
||||
func Attr(node *html.Node, key string) string {
|
||||
for _, a := range node.Attr {
|
||||
if a.Key == key {
|
||||
|
@ -83,8 +83,8 @@ func ExtractContent(page io.Reader) (string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
transformMisusedDivsIntoParagraphs(document)
|
||||
removeUnlikelyCandidates(document)
|
||||
transformMisusedDivsIntoParagraphs(root)
|
||||
removeUnlikelyCandidates(root)
|
||||
|
||||
candidates := getCandidates(document)
|
||||
//log.Printf("[Readability] Candidates: %v", candidates)
|
||||
@ -139,19 +139,22 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
||||
return output.String()
|
||||
}
|
||||
|
||||
// removeUnlikelyCandidates detaches from the tree every selected element whose
// concatenated class and id attributes match blacklistCandidatesRegexp, or
// match unlikelyCandidatesRegexp without also matching
// okMaybeItsACandidateRegexp.
// NOTE(review): this span is a rendered diff hunk with the +/- markers lost.
// The goquery-based lines look like the removed version and the
// htmlutil/*html.Node lines like the added one — confirm against the
// repository.
func removeUnlikelyCandidates(document *goquery.Document) {
|
||||
// goquery version: visit every element except html and body.
document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
|
||||
class, _ := s.Attr("class")
|
||||
id, _ := s.Attr("id")
|
||||
str := class + id
|
||||
// htmlutil version: same purpose, operating on a parsed *html.Node tree.
func removeUnlikelyCandidates(root *html.Node) {
|
||||
body := htmlutil.Query(root, "body")
|
||||
// Nothing to do when the query finds no body element.
if len(body) == 0 {
|
||||
return
|
||||
}
|
||||
// Only nodes returned by querying "*" on the first body match are examined.
// NOTE(review): whether Query can return body[0] itself is not visible here.
for _, node := range htmlutil.Query(body[0], "*") {
|
||||
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
|
||||
|
||||
// goquery version of the removal test and detach step.
if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
|
||||
node := s.Get(0)
|
||||
if node.Parent != nil {
|
||||
node.Parent.RemoveChild(node)
|
||||
}
|
||||
// htmlutil version: the same predicate, named before use.
blacklisted := (
|
||||
blacklistCandidatesRegexp.MatchString(str) ||
|
||||
(unlikelyCandidatesRegexp.MatchString(str) &&
|
||||
!okMaybeItsACandidateRegexp.MatchString(str)))
|
||||
// RemoveChild is only called on nodes that still have a parent.
if blacklisted && node.Parent != nil {
|
||||
node.Parent.RemoveChild(node)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
|
||||
@ -292,12 +295,10 @@ func getClassWeight(s *goquery.Selection) float32 {
|
||||
return float32(weight)
|
||||
}
|
||||
|
||||
// transformMisusedDivsIntoParagraphs renames a div element to "p" when the
// div's inner HTML does not match divToPElementsRegexp.
// NOTE(review): this span is a rendered diff hunk with the +/- markers lost.
// The goquery-based lines look like the removed version and the
// htmlutil/*html.Node lines like the added one — confirm against the
// repository.
func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
|
||||
// goquery version: iterate over every div in the document.
document.Find("div").Each(func(i int, s *goquery.Selection) {
|
||||
html, _ := s.Html()
|
||||
if !divToPElementsRegexp.MatchString(html) {
|
||||
node := s.Get(0)
|
||||
// htmlutil version: same transformation on a parsed *html.Node tree.
func transformMisusedDivsIntoParagraphs(root *html.Node) {
|
||||
for _, node := range htmlutil.Query(root, "div") {
|
||||
if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) {
|
||||
// Only the tag name string is rewritten; no other field of the node is
// touched in the visible code.
node.Data = "p"
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user