mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
rewriting readability
This commit is contained in:
parent
e5920259b6
commit
c958ee9116
@ -7,7 +7,7 @@ import (
|
|||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
// nodeNameRegex recognises the selector parts that NewMatcher treats as an
// element name: a run of word characters, or the "*" wildcard.
//
// NOTE(review): the pattern is unanchored, so MatchString succeeds for any
// part that merely contains a word character or "*" — confirm that this
// permissiveness is intended before relying on the "unsupported selector" path.
var nodeNameRegex = regexp.MustCompile(`\w+|\*`)
|
||||||
|
|
||||||
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
|
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
|
||||||
nodes := make([]*html.Node, 0)
|
nodes := make([]*html.Node, 0)
|
||||||
@ -40,7 +40,7 @@ func NewMatcher(sel string) Matcher {
|
|||||||
if nodeNameRegex.MatchString(part) {
|
if nodeNameRegex.MatchString(part) {
|
||||||
multi.Add(ElementMatch{Name: part})
|
multi.Add(ElementMatch{Name: part})
|
||||||
} else {
|
} else {
|
||||||
panic("unsupported selector")
|
panic("unsupported selector: " + part)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return multi
|
return multi
|
||||||
@ -55,7 +55,7 @@ type ElementMatch struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m ElementMatch) Match(n *html.Node) bool {
|
func (m ElementMatch) Match(n *html.Node) bool {
|
||||||
return n.Type == html.ElementNode && n.Data == m.Name
|
return n.Type == html.ElementNode && (n.Data == m.Name || m.Name == "*")
|
||||||
}
|
}
|
||||||
|
|
||||||
type MultiMatch struct {
|
type MultiMatch struct {
|
||||||
|
@ -12,6 +12,14 @@ func HTML(node *html.Node) string {
|
|||||||
return writer.String()
|
return writer.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func InnerHTML(node *html.Node) string {
|
||||||
|
writer := strings.Builder{}
|
||||||
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
html.Render(&writer, c)
|
||||||
|
}
|
||||||
|
return writer.String()
|
||||||
|
}
|
||||||
|
|
||||||
func Attr(node *html.Node, key string) string {
|
func Attr(node *html.Node, key string) string {
|
||||||
for _, a := range node.Attr {
|
for _, a := range node.Attr {
|
||||||
if a.Key == key {
|
if a.Key == key {
|
||||||
|
@ -83,8 +83,8 @@ func ExtractContent(page io.Reader) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
transformMisusedDivsIntoParagraphs(document)
|
transformMisusedDivsIntoParagraphs(root)
|
||||||
removeUnlikelyCandidates(document)
|
removeUnlikelyCandidates(root)
|
||||||
|
|
||||||
candidates := getCandidates(document)
|
candidates := getCandidates(document)
|
||||||
//log.Printf("[Readability] Candidates: %v", candidates)
|
//log.Printf("[Readability] Candidates: %v", candidates)
|
||||||
@ -139,19 +139,22 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|||||||
return output.String()
|
return output.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func removeUnlikelyCandidates(document *goquery.Document) {
|
func removeUnlikelyCandidates(root *html.Node) {
|
||||||
document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
|
body := htmlutil.Query(root, "body")
|
||||||
class, _ := s.Attr("class")
|
if len(body) == 0 {
|
||||||
id, _ := s.Attr("id")
|
return
|
||||||
str := class + id
|
}
|
||||||
|
for _, node := range htmlutil.Query(body[0], "*") {
|
||||||
|
str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id")
|
||||||
|
|
||||||
if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
|
blacklisted := (
|
||||||
node := s.Get(0)
|
blacklistCandidatesRegexp.MatchString(str) ||
|
||||||
if node.Parent != nil {
|
(unlikelyCandidatesRegexp.MatchString(str) &&
|
||||||
|
!okMaybeItsACandidateRegexp.MatchString(str)))
|
||||||
|
if blacklisted && node.Parent != nil {
|
||||||
node.Parent.RemoveChild(node)
|
node.Parent.RemoveChild(node)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
|
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
|
||||||
@ -292,12 +295,10 @@ func getClassWeight(s *goquery.Selection) float32 {
|
|||||||
return float32(weight)
|
return float32(weight)
|
||||||
}
|
}
|
||||||
|
|
||||||
func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
|
func transformMisusedDivsIntoParagraphs(root *html.Node) {
|
||||||
document.Find("div").Each(func(i int, s *goquery.Selection) {
|
for _, node := range htmlutil.Query(root, "div") {
|
||||||
html, _ := s.Html()
|
if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) {
|
||||||
if !divToPElementsRegexp.MatchString(html) {
|
|
||||||
node := s.Get(0)
|
|
||||||
node.Data = "p"
|
node.Data = "p"
|
||||||
}
|
}
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user